From 66f555c40caf5b4b1e2e4c17bf6ffc8169ba8baf Mon Sep 17 00:00:00 2001 From: ericblanc20 Date: Mon, 6 May 2024 14:44:04 +0200 Subject: [PATCH] feat: support for somatic variant calling without normal (#503) --- .../config/cancer_wes/config.yaml.jinja2 | 1 + docs/step_generic.rst | 8 + .../workflows/abstract/__init__.py | 11 + .../somatic_variant_annotation/__init__.py | 31 +- .../somatic_variant_calling/__init__.py | 92 +++-- .../somatic_variant_filtration/Snakefile | 369 ++++++++--------- .../somatic_variant_filtration/__init__.py | 373 ++++++++++++------ .../wrappers/bcftools/filter/wrapper.py | 38 +- .../wrappers/bcftools/protected/wrapper.py | 25 +- .../wrappers/bcftools/regions/wrapper.py | 29 +- snappy_wrappers/wrappers/eb_filter/wrapper.py | 51 +-- .../wrappers/mutect2/run/wrapper.py | 8 +- tests/snappy_pipeline/workflows/conftest.py | 1 + ...st_workflows_somatic_variant_filtration.py | 196 +++++---- 14 files changed, 678 insertions(+), 555 deletions(-) diff --git a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 index 815a47826..2622bc0e8 100644 --- a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 +++ b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 @@ -43,6 +43,7 @@ step_config: somatic_variant_filtration: path_somatic_variant: ../somatic_variant_annotation path_ngs_mapping: ../ngs_mapping + filtration_schema: list filter_list: - dkfz: {} - bcftools: diff --git a/docs/step_generic.rst b/docs/step_generic.rst index c557bbb65..e40c127ab 100644 --- a/docs/step_generic.rst +++ b/docs/step_generic.rst @@ -79,3 +79,11 @@ The versioning allows the pipeline step to check whether there are incompatibili The execution of ``cubi-snake`` in a directory will not automatically generate these files. Rather, they are only generated when used in a pipeline step such as ``somatic_targeted_cnv_calling``. + +.. note:: **Debugging configuration errors** + + The configuration for a complete pipeline can be long, and copy/paste errors can creep in. + When facing errors in configuration, it is advisable to + + - Use absolute paths for files not part of the workflow (panel of normals files are **NOT** part of the workflow), and + - Use ``snappy-snake --verbose``, which prints the step configuration (and all the dependent workflows) as ``snappy`` has understood it.
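The ``--verbose`` dump referenced in the note above is implemented by ``_write_step_config`` in the next diff. A minimal standalone sketch of what it does — the function name, arguments, and example config values here are illustrative, while the ``ruamel.yaml`` calls mirror the patch::

    import sys
    import ruamel.yaml

    def write_step_config(name, config, f=sys.stdout):
        # Pretty-print the step configuration as YAML, bracketed by
        # start/end markers, as "snappy-snake --verbose" does.
        print(f"\n\n----- Configuration for step {name}:\n", file=f)
        yaml = ruamel.yaml.YAML()
        yaml.preserve_quotes = True
        yaml.indent(sequence=4, mapping=4, offset=4)
        yaml.dump(config, f)
        print(f"\n------ Configuration for {name} ends here\n", file=f)

    # Example with hypothetical config values:
    write_step_config(
        "somatic_variant_filtration",
        {"filtration_schema": "list", "filter_list": [{"dkfz": {}}]},
    )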
diff --git a/snappy_pipeline/workflows/abstract/__init__.py b/snappy_pipeline/workflows/abstract/__init__.py index d1029849a..eb3c193cc 100644 --- a/snappy_pipeline/workflows/abstract/__init__.py +++ b/snappy_pipeline/workflows/abstract/__init__.py @@ -671,6 +671,9 @@ def __init__( #: Functions from sub workflows, can be used to generate output paths into these workflows self.sub_workflows = {} + if workflow.verbose: + self._write_step_config() + def _setup_hooks(self): """Setup Snakemake workflow hooks for start/end/error""" # In the following, the "log" parameter to the handler functions is set to "_" as we @@ -931,6 +934,14 @@ def _load_data_search_infos(self): mixed_se_pe=False, ) + def _write_step_config(self, f=sys.stdout): + print(f"\n\n----- Configuration for step {self.name}:\n", file=f) + yaml = ruamel_yaml.YAML() + yaml.preserve_quotes = True + yaml.indent(sequence=4, mapping=4, offset=4) + yaml.dump(self.config, f) + print(f"\n------ Configuration for {self.name} ends here\n", file=f) + @classmethod def wrapper_path(cls, path): """Generate path to wrapper""" diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py index 010ed979a..9b599887d 100644 --- a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py @@ -238,8 +238,8 @@ def params_function(wildcards): def get_normal_lib_name(self, wildcards): """Return name of normal (non-cancer) library""" - pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] - return pair.normal_sample.dna_ngs_library.name + pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None) + return pair.normal_sample.dna_ngs_library.name if pair else None class JannovarAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart): @@ -426,20 +426,19 @@ def _yield_result_files_matched(self, tpl, **kwargs): Mutect. """ for sheet in filter(is_not_background, self.shortcut_sheets): - for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): - msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" - "normal or primary cancer NGS library" - ) - print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) - continue - yield from expand( - tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs - ) + for bio_entity in sheet.sheet.bio_entities.values(): + for bio_sample in bio_entity.bio_samples.values(): + if not bio_sample.extra_infos.get("isTumor", False): + continue + for test_sample in bio_sample.test_samples.values(): + extraction_type = test_sample.extra_infos.get("extractionType", "unknown") + if extraction_type.lower() != "dna": + if extraction_type == "unknown": + msg = "INFO: sample {} has missing extraction type, ignored" + print(msg.format(test_sample.name), file=sys.stderr) + continue + for ngs_library in test_sample.ngs_libraries.values(): + yield from expand(tpl, tumor_library=[ngs_library], **kwargs) def _yield_result_files_joint(self, tpl, **kwargs): """Build output paths from path template and extension list. 
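The ``_yield_result_files_matched`` rewrite above recurs almost verbatim in the calling and filtration steps below: instead of ``sheet.all_sample_pairs`` (which presupposes a matched normal), the sheets are walked down to tumor DNA libraries directly. A condensed sketch of that walk, assuming ``sheet`` is the underlying biomedsheets sheet object::

    import sys

    def iter_tumor_dna_libraries(sheet):
        # Yield every NGS library of a tumor DNA sample, whether or not
        # a matched normal exists in the sheet.
        for bio_entity in sheet.bio_entities.values():
            for bio_sample in bio_entity.bio_samples.values():
                if not bio_sample.extra_infos.get("isTumor", False):
                    continue
                for test_sample in bio_sample.test_samples.values():
                    extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
                    if extraction_type.lower() != "dna":
                        if extraction_type == "unknown":
                            print(
                                f"INFO: sample {test_sample.name} has missing extraction type, ignored",
                                file=sys.stderr,
                            )
                        continue
                    yield from test_sample.ngs_libraries.values()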
diff --git a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py index 84375854e..c58464fba 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py @@ -241,7 +241,7 @@ germline_resource: '' # Germline variants resource (same as panel of normals) common_variants: '' # Common germline variants for contamination estimation extra_arguments: [] # List additional Mutect2 arguments - # Each additional argument xust be in the form: + # Each additional argument must be in the form: # "-- " # For example, to filter reads prior to calling & to # add annotations to the output vcf: @@ -387,32 +387,41 @@ def input_function(wildcards): ngs_mapping = self.parent.sub_workflows["ngs_mapping"] # Get names of primary libraries of the selected cancer bio sample and the # corresponding primary normal sample - normal_base_path = ( - "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( - normal_library=self.get_normal_lib_name(wildcards), **wildcards - ) - ) tumor_base_path = ( "output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}" ).format(**wildcards) - return { - "normal_bam": ngs_mapping(normal_base_path + ".bam"), - "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"), + input_files = { "tumor_bam": ngs_mapping(tumor_base_path + ".bam"), "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"), } + normal_library = self.get_normal_lib_name(wildcards) + if normal_library: + normal_base_path = ( + "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( + normal_library=normal_library, **wildcards + ) + ) + input_files.update( + { + "normal_bam": ngs_mapping(normal_base_path + ".bam"), + "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"), + } + ) + + return input_files + return input_function def get_normal_lib_name(self, wildcards): """Return name of normal (non-cancer) library""" - pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] - return pair.normal_sample.dna_ngs_library.name + pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None) + return pair.normal_sample.dna_ngs_library.name if pair else None def get_tumor_lib_name(self, wildcards): """Return name of tumor library""" - pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] - return pair.tumor_sample.dna_ngs_library.name + pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None) + return pair.tumor_sample.dna_ngs_library.name if pair else wildcards.tumor_library def get_output_files(self, action): """Return output files that all somatic variant calling sub steps must @@ -577,19 +586,30 @@ def _get_input_files_run(self, wildcards): ngs_mapping = self.parent.sub_workflows["ngs_mapping"] # Get names of primary libraries of the selected cancer bio sample and the # corresponding primary normal sample - normal_base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( - normal_library=self.get_normal_lib_name(wildcards), **wildcards - ) tumor_base_path = ( "output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}" ).format(**wildcards) - return { - "normal_bam": ngs_mapping(normal_base_path + ".bam"), - "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"), + input_files = { "tumor_bam": ngs_mapping(tumor_base_path + ".bam"), "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"), } + normal_library = 
self.get_normal_lib_name(wildcards) + if normal_library: + normal_base_path = ( + "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( + normal_library=normal_library, **wildcards + ) + ) + input_files.update( + { + "normal_bam": ngs_mapping(normal_base_path + ".bam"), + "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"), + } + ) + + return input_files + def _get_input_files_filter(self, wildcards): """Get input files for rule ``filter``. @@ -609,9 +629,10 @@ def _get_input_files_filter(self, wildcards): "stats": base_path + ".raw.vcf.stats", "f1r2": base_path + ".raw.f1r2_tar.tar.gz", } - if "contamination" in self.actions: - input_files["table"] = base_path + ".contamination.tbl" - input_files["segments"] = base_path + ".segments.tbl" + if self.get_normal_lib_name(wildcards): + if "contamination" in self.actions: + input_files["table"] = base_path + ".contamination.tbl" + input_files["segments"] = base_path + ".segments.tbl" return input_files def _get_input_files_pileup_normal(self, wildcards): @@ -1196,20 +1217,19 @@ def _yield_result_files_matched(self, tpl, **kwargs): Mutect. """ for sheet in filter(is_not_background, self.shortcut_sheets): - for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): - msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" - "normal or primary cancer NGS library" - ) - print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) - continue - yield from expand( - tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs - ) + for bio_entity in sheet.sheet.bio_entities.values(): + for bio_sample in bio_entity.bio_samples.values(): + if not bio_sample.extra_infos.get("isTumor", False): + continue + for test_sample in bio_sample.test_samples.values(): + extraction_type = test_sample.extra_infos.get("extractionType", "unknown") + if extraction_type.lower() != "dna": + if extraction_type == "unknown": + msg = "INFO: sample {} has missing extraction type, ignored" + print(msg.format(test_sample.name), file=sys.stderr) + continue + for ngs_library in test_sample.ngs_libraries.values(): + yield from expand(tpl, tumor_library=[ngs_library], **kwargs) def _yield_result_files_joint(self, tpl, **kwargs): """Build output paths from path template and extension list. 
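Tumor-only support in the calling step rests on the pattern above: the ``normal_bam``/``normal_bai`` keys are added to the input dict only when a matched normal exists. A condensed sketch of the shared idiom (the helper signature is illustrative; ``ngs_mapping`` stands for the sub-workflow path resolver)::

    def call_input_files(wildcards, get_normal_lib_name, ngs_mapping):
        # Tumor BAM is always required; the normal BAM keys are optional.
        tpl = "output/{mapper}.{library}/out/{mapper}.{library}"
        tumor = tpl.format(mapper=wildcards.mapper, library=wildcards.tumor_library)
        input_files = {
            "tumor_bam": ngs_mapping(tumor + ".bam"),
            "tumor_bai": ngs_mapping(tumor + ".bam.bai"),
        }
        if normal_library := get_normal_lib_name(wildcards):
            normal = tpl.format(mapper=wildcards.mapper, library=normal_library)
            input_files["normal_bam"] = ngs_mapping(normal + ".bam")
            input_files["normal_bai"] = ngs_mapping(normal + ".bam.bai")
        return input_files

Downstream, wrappers test for the presence of ``normal_bam`` (see the Mutect2 wrapper change below) rather than for a separate tumor-only flag.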
diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile b/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile index d81e9112a..9d8b55792 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile +++ b/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile @@ -53,27 +53,6 @@ rule somatic_variant_filtration_link_out_run: # Somatic Variant Filtration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Run DKFZ Bias Filter -------------------------------------------------------- - - -rule somatic_variant_filtration_dkfz_bias_filter_run: - input: - **wf.get_input_files("dkfz_bias_filter", "run"), - output: - **wf.get_output_files("dkfz_bias_filter", "run"), - threads: wf.get_resource("dkfz_bias_filter", "run", "threads") - resources: - time=wf.get_resource("dkfz_bias_filter", "run", "time"), - memory=wf.get_resource("dkfz_bias_filter", "run", "memory"), - partition=wf.get_resource("dkfz_bias_filter", "run", "partition"), - tmpdir=wf.get_resource("dkfz_bias_filter", "run", "tmpdir"), - params: - **{"args": wf.get_params("dkfz_bias_filter", "run")}, - log: - **wf.get_log_file("dkfz_bias_filter", "run"), - wrapper: - wf.wrapper_path("dkfz_bias_filter") - # Write out "panel of normals" file for eb_filter ------------------------------ @@ -95,175 +74,197 @@ rule somatic_variant_filtration_eb_filter_write_panel: wf.substep_getattr("eb_filter", "write_panel_of_normals_file")(wildcards) -# Run eb_filter ---------------------------------------------------------------- - - -rule somatic_variant_filtration_eb_filter_run: - input: - unpack(wf.get_input_files("eb_filter", "run")), - output: - **wf.get_output_files("eb_filter", "run"), - threads: wf.get_resource("eb_filter", "run", "threads") - resources: - time=wf.get_resource("eb_filter", "run", "time"), - memory=wf.get_resource("eb_filter", "run", "memory"), - partition=wf.get_resource("eb_filter", "run", "partition"), - tmpdir=wf.get_resource("eb_filter", "run", "tmpdir"), - params: - **{"args": wf.get_params("eb_filter", "run")}, - log: - **wf.get_log_file("eb_filter", "run"), - wrapper: - wf.wrapper_path("eb_filter") - - -# Apply Filters --------------------------------------------------------------- - - -rule variant_filtration_apply_filters_run: - input: - **(wf.get_input_files("apply_filters", "run")), - output: - **wf.get_output_files("apply_filters", "run"), - threads: wf.get_resource("apply_filters", "run", "threads") - resources: - time=wf.get_resource("apply_filters", "run", "time"), - memory=wf.get_resource("apply_filters", "run", "memory"), - partition=wf.get_resource("apply_filters", "run", "partition"), - tmpdir=wf.get_resource("apply_filters", "run", "tmpdir"), - log: - wf.get_log_file("apply_filters", "run"), - params: - args=wf.substep_dispatch("apply_filters", "get_args", "run"), - wrapper: - wf.wrapper_path("somatic_variant_filtration/apply_filters") - - -# Filter to Exons-------------------------------------------------------------- - +# Run DKFZ Bias Filter -------------------------------------------------------- -rule variant_filtration_filter_to_exons_run: - input: - unpack(wf.get_input_files("filter_to_exons", "run")), - output: - **wf.get_output_files("filter_to_exons", "run"), - threads: wf.get_resource("filter_to_exons", "run", "threads") - resources: - time=wf.get_resource("filter_to_exons", "run", "time"), - memory=wf.get_resource("filter_to_exons", "run", "memory"), - partition=wf.get_resource("filter_to_exons", "run", "partition"), - 
tmpdir=wf.get_resource("filter_to_exons", "run", "tmpdir"), - log: - wf.get_log_file("filter_to_exons", "run"), - wrapper: - wf.wrapper_path("somatic_variant_filtration/filter_to_exons") +if config["step_config"]["somatic_variant_filtration"]["filtration_schema"] == "sets": + + rule somatic_variant_filtration_dkfz_bias_filter_run: + input: + **wf.get_input_files("dkfz_bias_filter", "run"), + output: + **wf.get_output_files("dkfz_bias_filter", "run"), + threads: wf.get_resource("dkfz_bias_filter", "run", "threads") + resources: + time=wf.get_resource("dkfz_bias_filter", "run", "time"), + memory=wf.get_resource("dkfz_bias_filter", "run", "memory"), + partition=wf.get_resource("dkfz_bias_filter", "run", "partition"), + tmpdir=wf.get_resource("dkfz_bias_filter", "run", "tmpdir"), + params: + **{"args": wf.get_params("dkfz_bias_filter", "run")}, + log: + **wf.get_log_file("dkfz_bias_filter", "run"), + wrapper: + wf.wrapper_path("dkfz_bias_filter") + + # Run eb_filter ---------------------------------------------------------------- + + rule somatic_variant_filtration_eb_filter_run: + input: + unpack(wf.get_input_files("eb_filter", "run")), + output: + **wf.get_output_files("eb_filter", "run"), + threads: wf.get_resource("eb_filter", "run", "threads") + resources: + time=wf.get_resource("eb_filter", "run", "time"), + memory=wf.get_resource("eb_filter", "run", "memory"), + partition=wf.get_resource("eb_filter", "run", "partition"), + tmpdir=wf.get_resource("eb_filter", "run", "tmpdir"), + params: + **{"args": wf.get_params("eb_filter", "run")}, + log: + **wf.get_log_file("eb_filter", "run"), + wrapper: + wf.wrapper_path("eb_filter") + + # Apply Filters --------------------------------------------------------------- + + rule somatic_variant_filtration_apply_filters_run: + input: + **(wf.get_input_files("apply_filters", "run")), + output: + **wf.get_output_files("apply_filters", "run"), + threads: wf.get_resource("apply_filters", "run", "threads") + resources: + time=wf.get_resource("apply_filters", "run", "time"), + memory=wf.get_resource("apply_filters", "run", "memory"), + partition=wf.get_resource("apply_filters", "run", "partition"), + tmpdir=wf.get_resource("apply_filters", "run", "tmpdir"), + log: + wf.get_log_file("apply_filters", "run"), + params: + args=wf.substep_dispatch("apply_filters", "get_args", "run"), + wrapper: + wf.wrapper_path("somatic_variant_filtration/apply_filters") + + # Filter to Exons-------------------------------------------------------------- + + rule somatic_variant_filtration_filter_to_exons_run: + input: + unpack(wf.get_input_files("filter_to_exons", "run")), + output: + **wf.get_output_files("filter_to_exons", "run"), + threads: wf.get_resource("filter_to_exons", "run", "threads") + resources: + time=wf.get_resource("filter_to_exons", "run", "time"), + memory=wf.get_resource("filter_to_exons", "run", "memory"), + partition=wf.get_resource("filter_to_exons", "run", "partition"), + tmpdir=wf.get_resource("filter_to_exons", "run", "tmpdir"), + log: + wf.get_log_file("filter_to_exons", "run"), + wrapper: + wf.wrapper_path("somatic_variant_filtration/filter_to_exons") # Flexible Somatic Variant Filtration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -checkpoint one_dkfz: - input: - unpack(wf.get_input_files("one_dkfz", "run")), - output: - **wf.get_output_files("one_dkfz", "run"), - threads: wf.get_resource("one_dkfz", "run", "threads") - resources: - time=wf.get_resource("one_dkfz", "run", "time"), - memory=wf.get_resource("one_dkfz", "run", "memory"), - 
partition=wf.get_resource("one_dkfz", "run", "partition"), - tmpdir=wf.get_resource("one_dkfz", "run", "tmpdir"), - log: - **wf.get_log_file("one_dkfz", "run"), - wrapper: - wf.wrapper_path("dkfz_bias_filter") - - -checkpoint one_ebfilter: - input: - unpack(wf.get_input_files("one_ebfilter", "run")), - output: - **wf.get_output_files("one_ebfilter", "run"), - params: - wf.get_params("one_ebfilter", "run"), - threads: wf.get_resource("one_ebfilter", "run", "threads") - resources: - time=wf.get_resource("one_ebfilter", "run", "time"), - memory=wf.get_resource("one_ebfilter", "run", "memory"), - partition=wf.get_resource("one_ebfilter", "run", "partition"), - tmpdir=wf.get_resource("one_ebfilter", "run", "tmpdir"), - log: - **wf.get_log_file("one_ebfilter", "run"), - wrapper: - wf.wrapper_path("eb_filter") - - -checkpoint one_bcftools: - input: - unpack(wf.get_input_files("one_bcftools", "run")), - output: - **wf.get_output_files("one_bcftools", "run"), - threads: wf.get_resource("one_bcftools", "run", "threads") - resources: - time=wf.get_resource("one_bcftools", "run", "time"), - memory=wf.get_resource("one_bcftools", "run", "memory"), - partition=wf.get_resource("one_bcftools", "run", "partition"), - tmpdir=wf.get_resource("one_bcftools", "run", "tmpdir"), - log: - **wf.get_log_file("one_bcftools", "run"), - wrapper: - wf.wrapper_path("bcftools/filter") - - -checkpoint one_regions: - input: - unpack(wf.get_input_files("one_regions", "run")), - output: - **wf.get_output_files("one_regions", "run"), - threads: wf.get_resource("one_regions", "run", "threads") - resources: - time=wf.get_resource("one_regions", "run", "time"), - memory=wf.get_resource("one_regions", "run", "memory"), - partition=wf.get_resource("one_regions", "run", "partition"), - tmpdir=wf.get_resource("one_regions", "run", "tmpdir"), - log: - **wf.get_log_file("one_regions", "run"), - wrapper: - wf.wrapper_path("bcftools/regions") - - -checkpoint one_protected: - input: - unpack(wf.get_input_files("one_protected", "run")), - output: - **wf.get_output_files("one_protected", "run"), - threads: wf.get_resource("one_protected", "run", "threads") - resources: - time=wf.get_resource("one_protected", "run", "time"), - memory=wf.get_resource("one_protected", "run", "memory"), - partition=wf.get_resource("one_protected", "run", "partition"), - tmpdir=wf.get_resource("one_protected", "run", "tmpdir"), - log: - **wf.get_log_file("one_protected", "run"), - wrapper: - wf.wrapper_path("bcftools/protected") - - -rule last_filter: - input: - **wf.get_input_files("last_filter", "run"), - output: - **wf.get_output_files("last_filter", "run"), - threads: wf.get_resource("last_filter", "run", "threads") - resources: - time=wf.get_resource("last_filter", "run", "time"), - memory=wf.get_resource("last_filter", "run", "memory"), - partition=wf.get_resource("last_filter", "run", "partition"), - tmpdir=wf.get_resource("last_filter", "run", "tmpdir"), - log: - **wf.get_log_file("last_filter", "run"), - wrapper: - wf.wrapper_path("somatic_variant_filtration/apply_all_filters") +if config["step_config"]["somatic_variant_filtration"]["filtration_schema"] == "list": + + checkpoint one_dkfz: + input: + unpack(wf.get_input_files("one_dkfz", "run")), + output: + **wf.get_output_files("one_dkfz", "run"), + params: + **{"args": wf.get_params("one_dkfz", "run")}, + threads: wf.get_resource("one_dkfz", "run", "threads") + resources: + time=wf.get_resource("one_dkfz", "run", "time"), + memory=wf.get_resource("one_dkfz", "run", "memory"), + 
partition=wf.get_resource("one_dkfz", "run", "partition"), + tmpdir=wf.get_resource("one_dkfz", "run", "tmpdir"), + log: + **wf.get_log_file("one_dkfz", "run"), + wrapper: + wf.wrapper_path("dkfz_bias_filter") + + checkpoint one_ebfilter: + input: + unpack(wf.get_input_files("one_ebfilter", "run")), + output: + **wf.get_output_files("one_ebfilter", "run"), + params: + **{"args": wf.get_params("one_ebfilter", "run")}, + threads: wf.get_resource("one_ebfilter", "run", "threads") + resources: + time=wf.get_resource("one_ebfilter", "run", "time"), + memory=wf.get_resource("one_ebfilter", "run", "memory"), + partition=wf.get_resource("one_ebfilter", "run", "partition"), + tmpdir=wf.get_resource("one_ebfilter", "run", "tmpdir"), + log: + **wf.get_log_file("one_ebfilter", "run"), + wrapper: + wf.wrapper_path("eb_filter") + + checkpoint one_bcftools: + input: + unpack(wf.get_input_files("one_bcftools", "run")), + output: + **wf.get_output_files("one_bcftools", "run"), + params: + **{"args": wf.get_params("one_bcftools", "run")}, + threads: wf.get_resource("one_bcftools", "run", "threads") + resources: + time=wf.get_resource("one_bcftools", "run", "time"), + memory=wf.get_resource("one_bcftools", "run", "memory"), + partition=wf.get_resource("one_bcftools", "run", "partition"), + tmpdir=wf.get_resource("one_bcftools", "run", "tmpdir"), + log: + **wf.get_log_file("one_bcftools", "run"), + wrapper: + wf.wrapper_path("bcftools/filter") + + checkpoint one_regions: + input: + unpack(wf.get_input_files("one_regions", "run")), + output: + **wf.get_output_files("one_regions", "run"), + params: + **{"args": wf.get_params("one_regions", "run")}, + threads: wf.get_resource("one_regions", "run", "threads") + resources: + time=wf.get_resource("one_regions", "run", "time"), + memory=wf.get_resource("one_regions", "run", "memory"), + partition=wf.get_resource("one_regions", "run", "partition"), + tmpdir=wf.get_resource("one_regions", "run", "tmpdir"), + log: + **wf.get_log_file("one_regions", "run"), + wrapper: + wf.wrapper_path("bcftools/regions") + + checkpoint one_protected: + input: + unpack(wf.get_input_files("one_protected", "run")), + output: + **wf.get_output_files("one_protected", "run"), + params: + **{"args": wf.get_params("one_protected", "run")}, + threads: wf.get_resource("one_protected", "run", "threads") + resources: + time=wf.get_resource("one_protected", "run", "time"), + memory=wf.get_resource("one_protected", "run", "memory"), + partition=wf.get_resource("one_protected", "run", "partition"), + tmpdir=wf.get_resource("one_protected", "run", "tmpdir"), + log: + **wf.get_log_file("one_protected", "run"), + wrapper: + wf.wrapper_path("bcftools/protected") + + rule last_filter: + input: + **wf.get_input_files("last_filter", "run"), + output: + **wf.get_output_files("last_filter", "run"), + threads: wf.get_resource("last_filter", "run", "threads") + resources: + time=wf.get_resource("last_filter", "run", "time"), + memory=wf.get_resource("last_filter", "run", "memory"), + partition=wf.get_resource("last_filter", "run", "partition"), + tmpdir=wf.get_resource("last_filter", "run", "tmpdir"), + log: + **wf.get_log_file("last_filter", "run"), + wrapper: + wf.wrapper_path("somatic_variant_filtration/apply_all_filters") # Variant Statistics Computation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py b/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py index 2eed184f7..2581b639c 100644 --- 
a/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py @@ -155,6 +155,7 @@ tools_somatic_variant_calling: null # Default: use those defined in somatic_variant_calling step tools_somatic_variant_annotation: null # Default: use those defined in somatic_variant_annotation step has_annotation: True + filtration_schema: "list" # Either "sets" (old schema: filter_sets) or "list" (new schema: filter_list) filter_sets: # Deprecated filtration method, use filter_list # no_filter: no_filters # implicit, always defined dkfz_only: '' # empty @@ -226,8 +227,8 @@ def __init__(self, parent): def get_normal_lib_name(self, wildcards): """Return name of normal (non-cancer) library""" - pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] - return pair.normal_sample.dna_ngs_library.name + pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None) + return pair.normal_sample.dna_ngs_library.name if pair else None def get_params(self, action): """Return arguments to pass down.""" @@ -254,32 +255,17 @@ class OneFilterStepPart(SomaticVariantFiltrationStepPart): #: Class available actions actions = ("run",) + #: Default filtration resource usage (should be light) + resource_usage = {"run": ResourceUsage(threads=1, time="02:00:00", memory=f"{8 * 1024}M")} + def get_input_files(self, action): - """Return path to input or previous filter vcf file & normal/tumor bams""" + """Return path to input or previous filter vcf file""" # Validate action self._validate_action(action) @dictify def input_function(wildcards): - yield "bam", os.path.join( - self.config["path_ngs_mapping"], - "output", - "{mapper}.{tumor_library}", - "out", - "{mapper}.{tumor_library}.bam", - ) - normal_library = self.tumor_to_normal_library[wildcards["tumor_library"]] - yield "normal", os.path.join( - self.config["path_ngs_mapping"], - "output", - f"{{mapper}}.{normal_library}", - "out", - f"{{mapper}}.{normal_library}.bam", - ) filter_nb = int(wildcards["filter_nb"]) - filter_name = list(self.config["filter_list"][filter_nb - 1].keys())[0] - if filter_name == "ebfilter": - yield "txt", self._get_output_files_write_panel()["txt"].format(**wildcards) if filter_nb > 1: prev = list(self.config["filter_list"][filter_nb - 2].keys())[0] n = filter_nb - 1 @@ -340,45 +326,70 @@ def get_log_file(self, action): self.name_pattern + "."
+ self.filter_name + "_{filter_nb}" + ext + ".md5", ) - def get_resource_usage(self, action): + def get_params(self, action): # Validate action self._validate_action(action) - def time_usage(wildcards): - filter_nb = int(wildcards["filter_nb"]) - 1 - filter_name = list(self.config["filter_list"][filter_nb].keys())[0] - if filter_name == "dkfz": - return "12:00:00" - elif filter_name == "ebfilter": - return "24:00:00" - else: - return "02:00:00" - - def memory_usage(wildcards): - filter_nb = int(wildcards["filter_nb"]) - 1 - filter_name = list(self.config["filter_list"][filter_nb].keys())[0] - if filter_name == "dkfz": - return f"{3 * 1024}M" - elif filter_name == "ebfilter": - return f"{2 * 1024}M" - else: - return f"{8 * 1024}M" + def input_function(wildcards): + return {"filter_name": "{}_{}".format(self.filter_name, wildcards["filter_nb"])} - return ResourceUsage( - threads=1, - time=time_usage, - memory=memory_usage, - ) + return input_function + + +class OneFilterWithBamStepPart(OneFilterStepPart): + def get_input_files(self, action): + """Return path to input or previous filter vcf file & normal/tumor bams""" + # Validate action + self._validate_action(action) + + @dictify + def input_function(wildcards): + parent = super(OneFilterWithBamStepPart, self).get_input_files(action) + yield from parent(wildcards).items() + + yield "bam", os.path.join( + self.config["path_ngs_mapping"], + "output", + "{mapper}.{tumor_library}", + "out", + "{mapper}.{tumor_library}.bam", + ) + if normal_library := self.tumor_to_normal_library.get(wildcards["tumor_library"], None): + yield "normal", os.path.join( + self.config["path_ngs_mapping"], + "output", + f"{{mapper}}.{normal_library}", + "out", + f"{{mapper}}.{normal_library}.bam", + ) + + return input_function -class OneFilterDkfzStepPart(OneFilterStepPart): +class OneFilterDkfzStepPart(OneFilterWithBamStepPart): name = "one_dkfz" filter_name = "dkfz" + resource_usage = {"run": ResourceUsage(threads=1, time="12:00:00", memory=f"{3 * 1024}M")} -class OneFilterEbfilterStepPart(OneFilterStepPart): +class OneFilterEbfilterStepPart(OneFilterWithBamStepPart): name = "one_ebfilter" filter_name = "ebfilter" + resource_usage = {"run": ResourceUsage(threads=1, time="24:00:00", memory=f"{2 * 1024}M")} + + def get_input_files(self, action): + """Return path to input or previous filter vcf file & normal/tumor bams""" + # Validate action + self._validate_action(action) + + @dictify + def input_function(wildcards): + parent = super(OneFilterEbfilterStepPart, self).get_input_files(action) + yield from parent(wildcards).items() + + yield "txt", self._get_output_files_write_panel()["txt"].format(**wildcards) + + return input_function @dictify def _get_output_files_write_panel(self): @@ -388,12 +399,18 @@ def _get_output_files_write_panel(self): ) def get_params(self, action): + """Add the EBFilter configuration to the parameters""" # Validate action self._validate_action(action) @dictify def input_function(wildcards): - yield "args", {"filter_nb": wildcards["filter_nb"]} + parent = super(OneFilterEbfilterStepPart, self).get_params(action) + parameters = parent(wildcards) + filter_nb = int(wildcards["filter_nb"]) + ebfilter_config = self.config["filter_list"][filter_nb - 1][self.filter_name] + parameters.update(ebfilter_config) + return parameters return input_function @@ -402,16 +419,87 @@ class OneFilterBcftoolsStepPart(OneFilterStepPart): name = "one_bcftools" filter_name = "bcftools" + def get_params(self, action): + # Validate action + self._validate_action(action) + +
def input_function(wildcards): + parent = super(OneFilterBcftoolsStepPart, self).get_params(action) + parameters = parent(wildcards) + filter_nb = int(wildcards["filter_nb"]) + keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] + msg = "Only one include or exclude expression is allowed in {} filter {} (configuration: {})" + assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) + keyword = list(keywords.keys())[0] + msg = 'Unknown keyword "{}" in {} filter {} (allowed values: include, exclude. Configuration: {})' + assert keyword in ("include", "exclude"), msg.format( + keyword, self.filter_name, filter_nb, keywords + ) + parameters.update(keywords) + return parameters + + return input_function + class OneFilterRegionsStepPart(OneFilterStepPart): name = "one_regions" filter_name = "regions" + def get_params(self, action): + # Validate action + self._validate_action(action) + + def input_function(wildcards): + parent = super(OneFilterRegionsStepPart, self).get_params(action) + parameters = parent(wildcards) + filter_nb = int(wildcards["filter_nb"]) + keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] + msg = ( + "Only one include or exclude region is allowed in {} filter {} (configuration: {})" + ) + assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) + keyword = list(keywords.keys())[0] + msg = 'Unknown keyword "{}" in {} filter {} (allowed values: include, exclude, path_bed (deprecated). Configuration: {})' + assert keyword in ("include", "exclude", "path_bed"), msg.format( + keyword, self.filter_name, filter_nb, keywords + ) + if keyword == "path_bed": + keywords = {"exclude": keywords["path_bed"]} + parameters.update(keywords) + return parameters + + return input_function + class OneFilterProtectedStepPart(OneFilterStepPart): name = "one_protected" filter_name = "protected" + def get_params(self, action): + # Validate action + self._validate_action(action) + + def input_function(wildcards): + parent = super(OneFilterProtectedStepPart, self).get_params(action) + parameters = parent(wildcards) + filter_nb = int(wildcards["filter_nb"]) + keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] + msg = ( + "Only one protected region bed file is allowed in {} filter {} (configuration: {})" + ) + assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) + keyword = list(keywords.keys())[0] + msg = ( + 'Unknown keyword "{}" in {} filter {} (allowed value: path_bed. Configuration: {})' + ) + assert keyword in ("path_bed",), msg.format( + keyword, self.filter_name, filter_nb, keywords + ) + parameters.update(keywords) + return parameters + + return input_function + class LastFilterStepPart(SomaticVariantFiltrationStepPart): """Mark last filter as final output""" @@ -426,10 +514,8 @@ def get_input_files(self, action): # Validate action self._validate_action(action) - filter_nb = len(self.config["filter_list"]) - if filter_nb == 0: - return {} filter_names = [list(filter_name.keys())[0] for filter_name in self.config["filter_list"]] + filter_nb = len(self.config["filter_list"]) filter_name = filter_names[filter_nb - 1] vcf = os.path.join( "work", @@ -531,10 +617,10 @@ def get_output_files(self, action): name_pattern += ".{annotator}" prefix = ( - rf"work/{name_pattern}." - r"dkfz_bias_filter.{tumor_library,[^\.]+}/" - rf"out/{name_pattern}." - r"dkfz_bias_filter.{tumor_library}" + rf"work/{name_pattern}.{self.name}." 
+ r"{tumor_library,[^\.]+}/" + rf"out/{name_pattern}.{self.name}." + r"{tumor_library}" ) key_ext = { "vcf": ".vcf.gz", @@ -554,14 +640,21 @@ def _get_log_file(self, action): name_pattern = "{mapper}.{var_caller}" if self.config["has_annotation"]: name_pattern += ".{annotator}" - name_pattern += ".dkfz_bias_filter.{tumor_library}" + name_pattern += f".{self.name}" + prefix = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}", + "log", + name_pattern + ".{tumor_library}", + ) + key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), ("conda_list", ".conda_list.txt"), ) for key, ext in key_ext: - yield key, os.path.join("work", name_pattern, "log", name_pattern + ext) + yield key, prefix + ext def get_resource_usage(self, action): """Get Resource Usage @@ -600,6 +693,9 @@ def _get_input_files_run(self, wildcards): if self.config["has_annotation"]: name_pattern += ".{annotator}" # VCF file and index + name_pattern = "{mapper}.{var_caller}" + if self.config["has_annotation"]: + name_pattern += ".{annotator}" tpl = ( f"work/{name_pattern}." "dkfz_bias_filter.{tumor_library}/" @@ -611,7 +707,7 @@ def _get_input_files_run(self, wildcards): for key, ext in key_ext.items(): yield key, tpl.format(**wildcards) + ext # BAM file and index - tpl = "output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}" + tpl = r"output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}" key_ext = {"bam": ".bam", "bai": ".bam.bai"} ngs_mapping = self.parent.sub_workflows["ngs_mapping"] for key, ext in key_ext.items(): @@ -629,6 +725,14 @@ def get_output_files(self, action): self._validate_action(action) return getattr(self, "_get_output_files_{}".format(action))() + def get_params(self, action): + """Return EBFilter parameters from the config""" + # Validate action + self._validate_action(action) + parameters = self.config["eb_filter"] + parameters.update(self.config["filter_sets"]["dkfz_and_ebfilter"]) + return parameters + @dictify def _get_output_files_run(self): name_pattern = "{mapper}.{var_caller}" @@ -670,14 +774,20 @@ def _get_log_file(self, action): name_pattern = "{mapper}.{var_caller}" if self.config["has_annotation"]: name_pattern += ".{annotator}" - name_pattern += ".dkfz_bias_filter.eb_filter.{tumor_library}" + name_pattern += ".dkfz_bias_filter.eb_filter" + prefix = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}", + "log", + name_pattern + ".{tumor_library}", + ) key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), ("conda_list", ".conda_list.txt"), ) for key, ext in key_ext: - yield key, os.path.join("work", name_pattern, "log", name_pattern + ext) + yield key, prefix + ext def write_panel_of_normals_file(self, wildcards): """Write out file with paths to panels-of-normal""" @@ -721,46 +831,36 @@ def get_resource_usage(self, action): ) -class ApplyFiltersStepPartBase(SomaticVariantFiltrationStepPart): - """Base class for the different filters.""" +class ApplyFiltersStepPart(SomaticVariantFiltrationStepPart): + """Apply the configured filters.""" #: Step name - name = None + name = "apply_filters" #: Class available actions actions = ("run",) + #: Default filtration resource usage (should be light) + resource_usage = {"run": ResourceUsage(threads=1, time="02:00:00", memory=f"{8 * 1024}M")} + def __init__(self, parent): super().__init__(parent) name_pattern = "{mapper}.{var_caller}" if self.config["has_annotation"]: name_pattern += ".{annotator}" - name_pattern += ".dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.{exon_list}" - 
self.base_path_out = os.path.join("work", name_pattern, "out", name_pattern + "{ext}") - self.path_log = os.path.join("work", name_pattern, "log", name_pattern + ".log") - - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - """ - # Validate action - self._validate_action(action) - return ResourceUsage( - threads=2, - time="01:00:00", # 1 hour - memory=f"{int(3.75 * 1024 * 2)}M", + name_pattern += ".dkfz_bias_filter.eb_filter" + self.base_path_out = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}.{filter_set,[^\.]+}", + "out", + name_pattern + ".{tumor_library}.{filter_set}" + "{ext}", + ) + self.path_log = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}.{filter_set,[^\.]+}", + "log", + name_pattern + ".{tumor_library}.{filter_set}" + ".log", ) - - -class ApplyFiltersStepPart(ApplyFiltersStepPartBase): - """Apply the configured filters.""" - - #: Step name - name = "apply_filters" def get_args(self, action): # Validate action @@ -793,39 +893,65 @@ def get_output_files(self, action): # Validate action self._validate_action(action) for key, ext in zip(EXT_NAMES, EXT_VALUES): - yield key, self.base_path_out.replace("{step}", self.name).replace( - "{exon_list}", "genome_wide" - ).replace("{ext}", ext) + yield key, self.base_path_out.replace("{ext}", ext) def get_log_file(self, action): # Validate action self._validate_action(action) - return self.path_log.replace("{step}", self.name).replace("{exon_list}", "genome_wide") + return self.path_log -class FilterToExonsStepPart(ApplyFiltersStepPartBase): +class FilterToExonsStepPart(SomaticVariantFiltrationStepPart): """Apply the configured filters.""" #: Step name name = "filter_to_exons" + #: Class available actions + actions = ("run",) + + #: Default filtration resource usage (should be light) + resource_usage = {"run": ResourceUsage(threads=1, time="02:00:00", memory=f"{8 * 1024}M")} + + def __init__(self, parent): + super().__init__(parent) + name_pattern = "{mapper}.{var_caller}" + if self.config["has_annotation"]: + name_pattern += ".{annotator}" + name_pattern += ".dkfz_bias_filter.eb_filter" + self.base_path_out = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}.{filter_set,[^\.]+}.{exon_list}", + "out", + name_pattern + ".{tumor_library}.{filter_set}.{exon_list}" + "{ext}", + ) + self.path_log = os.path.join( + "work", + name_pattern + r".{tumor_library,[^\.]+}.{filter_set,[^\.]+}.{exon_list}", + "log", + name_pattern + ".{tumor_library}.{filter_set}.{exon_list}" + ".log", + ) + self.base_path_in_ = os.path.join( + "work", + name_pattern + ".{tumor_library}.{filter_set}", + "out", + name_pattern + ".{tumor_library}.{filter_set}" + "{ext}", + ) + def get_input_files(self, action): # Validate action self._validate_action(action) @dictify def input_function(wildcards): - # TODO: Possible bug, missing entry for `tumor_library` - # tests lead to "KeyError: 'tumor_library'" for key, ext in zip(EXT_NAMES, EXT_VALUES): - yield key, self.base_path_out.format( - step="apply_filters", + yield key, self.base_path_in_.format( tumor_library=wildcards.tumor_library, mapper=wildcards.mapper, var_caller=wildcards.var_caller, annotator=wildcards.get("annotator", ""), filter_set=wildcards.filter_set, - exon_list="genome_wide", + exon_list=wildcards.exon_list, ext=ext, ) @@ -836,12 +962,12 @@ def get_output_files(self, action): # Validate action 
self._validate_action(action) for key, ext in zip(EXT_NAMES, EXT_VALUES): - yield key, self.base_path_out.replace("{step}", "filter_to_exons").replace("{ext}", ext) + yield key, self.base_path_out.replace("{ext}", ext) def get_log_file(self, action): # Validate action self._validate_action(action) - return self.path_log.replace("{step}", self.name) + return self.path_log class SomaticVariantFiltrationWorkflow(BaseStep): @@ -854,7 +980,7 @@ class SomaticVariantFiltrationWorkflow(BaseStep): sheet_shortcut_class = CancerCaseSheet sheet_shortcut_kwargs = { - "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=False) } @classmethod @@ -930,11 +1056,11 @@ def get_result_files(self): log_ext = [e + m for e in ("log", "conda_list.txt", "conda_info.txt") for m in ("", ".md5")] - if len(self.config["filter_list"]) > 0: + if self.config["filtration_schema"] == "list": name_pattern = "{mapper}.{caller}" if self.config["has_annotation"]: name_pattern += ".{annotator}" - name_pattern += ".filtered.{tumor_library.name}" + name_pattern += ".filtered.{tumor_library}" yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), @@ -966,9 +1092,7 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}" if self.config["has_annotation"]: name_pattern += ".{annotator}" - name_pattern += ( - ".dkfz_bias_filter.eb_filter.{tumor_library.name}.{filter_set}.{exon_list}" - ) + name_pattern += ".dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.{exon_list}" yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), @@ -986,7 +1110,7 @@ def get_result_files(self): annotator=annotators, filter_set=filter_sets, exon_list=exon_lists, - ext=log_ext, + ext=("log",), ) # TODO: filtration for joint calling not implemented yet @@ -998,20 +1122,19 @@ def _yield_result_files_matched(self, tpl, **kwargs): Mutect. 
""" for sheet in filter(is_not_background, self.shortcut_sheets): - for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): - msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" - "normal or primary cancer NGS library" - ) - print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) - continue - yield from expand( - tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs - ) + for bio_entity in sheet.sheet.bio_entities.values(): + for bio_sample in bio_entity.bio_samples.values(): + if not bio_sample.extra_infos.get("isTumor", False): + continue + for test_sample in bio_sample.test_samples.values(): + extraction_type = test_sample.extra_infos.get("extractionType", "unknown") + if extraction_type.lower() != "dna": + if extraction_type == "unknown": + msg = "INFO: sample {} has missing extraction type, ignored" + print(msg.format(test_sample.name), file=sys.stderr) + continue + for ngs_library in test_sample.ngs_libraries.values(): + yield from expand(tpl, tumor_library=[ngs_library.name], **kwargs) def check_config(self): """Check that the path to the NGS mapping is present""" @@ -1019,3 +1142,9 @@ def check_config(self): ("step_config", "somatic_variant_filtration", "path_somatic_variant"), "Path to variant calling not configured but required for somatic variant annotation", ) + assert self.config["filtration_schema"] in ( + "sets", + "list", + ), "Filtration schema must be either 'list' or 'sets' (deprecated)" + if self.config["filtration_schema"] == "list": + assert len(self.config["filter_list"]) > 0, "No filter defined in the filter list" diff --git a/snappy_wrappers/wrappers/bcftools/filter/wrapper.py b/snappy_wrappers/wrappers/bcftools/filter/wrapper.py index c6386348c..454e7b00a 100644 --- a/snappy_wrappers/wrappers/bcftools/filter/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/filter/wrapper.py @@ -4,35 +4,13 @@ from snakemake.shell import shell -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step] - -if "include" in config or "exclude" in config: - include = config.get("include", "") - exclude = config.get("exclude", "") -elif "bcftools" in config and ("include" in config["bcftools"] or "exclude" in config["bcftools"]): - include = config["bcftools"].get("include", "") - exclude = config["bcftools"].get("exclude", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_nb = int(snakemake.wildcards["filter_nb"]) - 1 - include = config["filter_list"][filter_nb]["bcftools"].get("include", "") - exclude = config["filter_list"][filter_nb]["bcftools"].get("exclude", "") -else: - include = "" - exclude = "" -if include: - include = '--include "' + include + '"' -if exclude: - exclude = '--exclude "' + exclude + '"' - -if "filter_name" in config: - filter_name = config.get("filter_name", "") -elif "bcftools" in config and "filter_name" in config["bcftools"]: - filter_name = config["bcftools"].get("filter_name", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_name = "bcftools_{}".format(int(snakemake.wildcards["filter_nb"])) -else: - filter_name = "+" +params = dict(snakemake.params)["args"] +filter_name = params["filter_name"] +expression = ( + '--include "{}"'.format(params["include"]) + if "include" in params + else '--exclude "{}"'.format(params["exclude"]) +) # Actually run the script. 
shell( @@ -50,7 +28,7 @@ conda info > {snakemake.log.conda_info} bcftools filter --soft-filter {filter_name} --mode + \ - {include} {exclude} \ + {expression} \ -O z -o {snakemake.output.vcf} \ {snakemake.input.vcf} tabix {snakemake.output.vcf} diff --git a/snappy_wrappers/wrappers/bcftools/protected/wrapper.py b/snappy_wrappers/wrappers/bcftools/protected/wrapper.py index 2f933ea55..5d28a23ab 100644 --- a/snappy_wrappers/wrappers/bcftools/protected/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/protected/wrapper.py @@ -4,28 +4,9 @@ from snakemake.shell import shell -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step] - -if "path_bed" in config: - bed = config.get("path_bed", "") -elif "bcftools" in config and "path_bed" in config["bcftools"]: - bed = config["bcftools"].get("path_bed", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_nb = int(snakemake.wildcards["filter_nb"]) - 1 - bed = config["filter_list"][filter_nb]["protected"].get("path_bed", "") -else: - bed = "" -assert bed, "No bed file defining the protected regions" - -if "filter_name" in config: - filter_name = config.get("filter_name", "") -elif "bcftools" in config and "filter_name" in config["bcftools"]: - filter_name = config["bcftools"].get("filter_name", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_name = "protected_{}".format(int(snakemake.wildcards["filter_nb"])) -else: - filter_name = "+" +params = dict(snakemake.params)["args"] +filter_name = params["filter_name"] +bed = params["path_bed"] # Actually run the script. shell( diff --git a/snappy_wrappers/wrappers/bcftools/regions/wrapper.py b/snappy_wrappers/wrappers/bcftools/regions/wrapper.py index 3c2fc464d..13a5aa4c9 100644 --- a/snappy_wrappers/wrappers/bcftools/regions/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/regions/wrapper.py @@ -1,31 +1,12 @@ # -*- coding: utf-8 -*- -"""Wrapper for running bcftools mpileup +"""Wrapper for running bcftools filter over regions defined by a bed file """ from snakemake.shell import shell -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step] - -if "path_bed" in config: - bed = config.get("path_bed", "") -elif "bcftools" in config and "path_bed" in config["bcftools"]: - bed = config["bcftools"].get("path_bed", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_nb = int(snakemake.wildcards["filter_nb"]) - 1 - bed = config["filter_list"][filter_nb]["regions"].get("path_bed", "") -else: - bed = "" -assert bed, "No bed file defining the regions to be filtered" - -if "filter_name" in config: - filter_name = config.get("filter_name", "") -elif "bcftools" in config and "filter_name" in config["bcftools"]: - filter_name = config["bcftools"].get("filter_name", "") -elif "filter_nb" in snakemake.wildcards.keys(): - filter_name = "bcftools_{}".format(int(snakemake.wildcards["filter_nb"])) -else: - filter_name = "+" +params = dict(snakemake.params)["args"] +filter_name = params["filter_name"] +bed = f'^{params["include"]}' if "include" in params else params["exclude"] # Actually run the script. 
shell( @@ -43,7 +24,7 @@ conda info > {snakemake.log.conda_info} bcftools filter --soft-filter {filter_name} --mode + \ - --mask-file "^{bed}" \ + --mask-file "{bed}" \ -O z -o {snakemake.output.vcf} \ {snakemake.input.vcf} tabix {snakemake.output.vcf} diff --git a/snappy_wrappers/wrappers/eb_filter/wrapper.py b/snappy_wrappers/wrappers/eb_filter/wrapper.py index 78147399f..2e5f4b28e 100644 --- a/snappy_wrappers/wrappers/eb_filter/wrapper.py +++ b/snappy_wrappers/wrappers/eb_filter/wrapper.py @@ -6,41 +6,16 @@ __author__ = "Manuel Holtgrewe " -if "args" in snakemake.params and snakemake.params["args"].get("interval", None): +params = dict(snakemake.params)["args"] +filter_name = params["filter_name"] if "filter_name" in params else "" + +if "interval" in params: cmd_fetch = "tabix --print-header {} {}".format( snakemake.input.vcf, snakemake.params["args"]["interval"] ) else: cmd_fetch = "zcat {}".format(snakemake.input.vcf) -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step] - -if "ebfilter_threshold" in config: - threshold = config.get("ebfilter_threshold", 0) -elif "eb_filter" in config and "ebfilter_threshold" in config["eb_filter"]: - threshold = config["eb_filter"].get("ebfilter_threshold", 0) -elif "ebfilter" in config and "ebfilter_threshold" in config["ebfilter"]: - threshold = config["ebfilter"].get("ebfilter_threshold", 0) -else: - try: - filter_nb = int(snakemake.wildcards["filter_nb"]) - 1 - threshold = config["filter_list"][filter_nb]["ebfilter"].get("ebfilter_threshold", 0) - except: - threshold = 0 - -if "filter_name" in config: - filter_name = config.get("filter_name", "") -elif "eb_filter" in config and "filter_name" in config["eb_filter"]: - filter_name = config["eb_filter"].get("filter_name", "") -elif "ebfilter" in config and "filter_name" in config["ebfilter"]: - filter_name = config["ebfilter"].get("filter_name", "") -else: - try: - filter_name = "ebfilter_{}".format(int(snakemake.wildcards["filter_nb"])) - except: - filter_name = "+" - shell( r""" set -x @@ -112,16 +87,22 @@ EBFilter \ -f vcf \ -t 1 \ - -q {snakemake.config[step_config][somatic_variant_filtration][eb_filter][min_mapq]} \ - -Q {snakemake.config[step_config][somatic_variant_filtration][eb_filter][min_baseq]} \ + -q {params[min_mapq]} \ + -Q {params[min_baseq]} \ $TMPDIR/for_eb_filter.vcf.gz \ {snakemake.input.bam} \ {snakemake.input.txt} \ $TMPDIR/after_running_eb_filter.vcf - bcftools filter --soft-filter {filter_name} --mode + \ - --exclude "INFO/EB < {threshold}" \ - -O z -o $TMPDIR/after_eb_filter.vcf.gz \ - $TMPDIR/after_running_eb_filter.vcf + if [[ -n "{filter_name}" ]] + then + bcftools filter --soft-filter {filter_name} --mode + \ + --exclude "INFO/EB < {params[ebfilter_threshold]}" \ + -O z -o $TMPDIR/after_eb_filter.vcf.gz \ + $TMPDIR/after_running_eb_filter.vcf + else + mv $TMPDIR/after_running_eb_filter.vcf $TMPDIR/after_eb_filter.vcf + bgzip $TMPDIR/after_eb_filter.vcf + fi else mv $TMPDIR/for_eb_filter.vcf.gz $TMPDIR/after_eb_filter.vcf.gz fi diff --git a/snappy_wrappers/wrappers/mutect2/run/wrapper.py b/snappy_wrappers/wrappers/mutect2/run/wrapper.py index 363fa9e1c..c3ff0a407 100644 --- a/snappy_wrappers/wrappers/mutect2/run/wrapper.py +++ b/snappy_wrappers/wrappers/mutect2/run/wrapper.py @@ -11,6 +11,11 @@ extra_arguments = " ".join(config["extra_arguments"]) +if "normal_bam" in snakemake.input.keys(): + normal = f'--normal "{snakemake.params.normal_lib_name}" --input {snakemake.input.normal_bam}' +else: + normal = "" + 
shell.executable("/bin/bash") shell( @@ -55,9 +60,8 @@ gatk Mutect2 \ --tmp-dir $tmpdir \ - --input {snakemake.input.normal_bam} \ + {normal} \ --input {snakemake.input.tumor_bam} \ - --normal "{snakemake.params.normal_lib_name}" \ --reference {reference} \ --output $out_base.vcf \ --f1r2-tar-gz ${{out_base}}.f1r2.tar.gz \ diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py index 7321e14a1..c7723397c 100644 --- a/tests/snappy_pipeline/workflows/conftest.py +++ b/tests/snappy_pipeline/workflows/conftest.py @@ -604,6 +604,7 @@ def cancer_sheet_tsv(): P002\tT1\tY\tWGS\tP002_T1_DNA1_WGS1\tAgilent SureSelect Human All Exon V6\tDNA P002\tT2\tY\tWGS\tP002_T2_DNA1_WGS1\tAgilent SureSelect Human All Exon V6\tDNA P002\tT2\tY\tmRNA_seq\tP002_T2-RNA1_mRNA_seq1\tNone\tRNA + # P003\tT1\tY\tWGS\tP003_T1_DNA1_WGS1\tNone\tDNA """ ).lstrip() diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py index b56f30825..0bba22937 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py @@ -35,6 +35,7 @@ def minimal_config(): tools_ngs_mapping: ['bwa'] tools_somatic_variant_calling: ['mutect2'] tools_somatic_variant_annotation: ['jannovar'] + filtration_schema: sets data_sets: first_batch: @@ -99,9 +100,9 @@ def test_dkfz_bias_filter_step_part_get_input_files(somatic_variant_filtration_w def test_dkfz_bias_filter_step_part_get_output_files(somatic_variant_filtration_workflow): """Tests DkfzBiasFilterStepPart.get_output_files()""" base_out = ( - "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter." - "{tumor_library,[^\\.]+}/out/{mapper}.{var_caller}.{annotator}." - "dkfz_bias_filter.{tumor_library}" + r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter." + r"{tumor_library,[^\.]+}/out/{mapper}.{var_caller}.{annotator}." + r"dkfz_bias_filter.{tumor_library}" ) expected = get_expected_output_vcf_files_dict(base_out=base_out) actual = somatic_variant_filtration_workflow.get_output_files("dkfz_bias_filter", "run") @@ -111,8 +112,8 @@ def test_dkfz_bias_filter_step_part_get_output_files(somatic_variant_filtration_ def test_dkfz_bias_filter_step_part_get_log_file(somatic_variant_filtration_workflow): """Tests DkfzBiasFilterStepPart.get_log_file()""" base_out = ( - "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.{tumor_library}/" - "log/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.{tumor_library}" + r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.{tumor_library,[^\.]+}/" + r"log/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.{tumor_library}" ) expected = get_expected_log_files_dict(base_out=base_out) actual = somatic_variant_filtration_workflow.get_log_file("dkfz_bias_filter", "run") @@ -183,9 +184,9 @@ def test_eb_filter_step_part_get_input_files_write_panel(somatic_variant_filtrat def test_eb_filter_step_part_get_output_files_run(somatic_variant_filtration_workflow): """Tests EbFilterStepPart._get_output_files_run()""" base_out = ( - "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter." - "eb_filter.{tumor_library,[^\\.]+}/out/{mapper}.{var_caller}.{annotator}." - "dkfz_bias_filter.eb_filter.{tumor_library}" + r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter." + r"eb_filter.{tumor_library,[^\.]+}/out/{mapper}.{var_caller}.{annotator}." 
+        r"dkfz_bias_filter.eb_filter.{tumor_library}"
     )
     expected = get_expected_output_vcf_files_dict(base_out=base_out)
     actual = somatic_variant_filtration_workflow.get_output_files("eb_filter", "run")
@@ -203,19 +204,19 @@ def test_eb_filter_step_part_get_output_files_write_panel(somatic_variant_filtra
     assert actual == expected
 
 
-def test_eb_eb_filter_step_part_get_log_file_run(somatic_variant_filtration_workflow):
+def test_eb_filter_step_part_get_log_file_run(somatic_variant_filtration_workflow):
     """Tests EbFilterStepPart._get_log_file_run()"""
     base_out = (
-        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
-        "{tumor_library}/log/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter."
-        "eb_filter.{tumor_library}"
+        r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        r"{tumor_library,[^\.]+}/log/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter."
+        r"eb_filter.{tumor_library}"
     )
     expected = get_expected_log_files_dict(base_out=base_out)
     actual = somatic_variant_filtration_workflow.get_log_file("eb_filter", "run")
     assert actual == expected
 
 
-def test_eb_eb_filter_step_part_get_log_file_write_panel(somatic_variant_filtration_workflow):
+def test_eb_filter_step_part_get_log_file_write_panel(somatic_variant_filtration_workflow):
     """Tests EbFilterStepPart._get_log_file_write_panel()"""
     expected = {}
     actual = somatic_variant_filtration_workflow.get_log_file("eb_filter", "write_panel")
@@ -257,10 +258,9 @@ def test_apply_filters_step_part_get_input_files(somatic_variant_filtration_work
 def test_apply_filters_step_part_get_output_files(somatic_variant_filtration_workflow):
     """Tests ApplyFiltersStepPart.get_output_files()"""
     base_out = (
-        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
-        "{tumor_library}.{filter_set}.genome_wide/out/{mapper}.{var_caller}."
-        "{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}."
-        "genome_wide"
+        r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        r"{tumor_library,[^\.]+}.{filter_set,[^\.]+}/out/{mapper}.{var_caller}."
+        r"{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}"
     )
     expected = get_expected_output_vcf_files_dict(base_out=base_out)
     actual = somatic_variant_filtration_workflow.get_output_files("apply_filters", "run")
@@ -270,10 +270,9 @@ def test_apply_filters_step_part_get_output_files(somatic_variant_filtration_wor
 def test_apply_filters_step_part_get_log_file(somatic_variant_filtration_workflow):
     """Tests ApplyFiltersStepPart.get_log_file()"""
     expected = (
-        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
-        "{tumor_library}.{filter_set}.genome_wide/log/{mapper}.{var_caller}."
-        "{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}."
-        "genome_wide.log"
+        r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        r"{tumor_library,[^\.]+}.{filter_set,[^\.]+}/log/{mapper}.{var_caller}."
+        r"{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.log"
     )
     actual = somatic_variant_filtration_workflow.get_log_file("apply_filters", "run")
     assert actual == expected
@@ -282,7 +281,7 @@ def test_apply_filters_step_part_get_log_file(somatic_variant_filtratio
 def test_apply_filters_step_part_get_resource_usage(somatic_variant_filtration_workflow):
     """Tests ApplyFiltersStepPart.get_resource()"""
     # Define expected
-    expected_dict = {"threads": 2, "time": "01:00:00", "memory": "7680M", "partition": "medium"}
+    expected_dict = {"threads": 1, "time": "02:00:00", "memory": "8192M", "partition": "medium"}
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
@@ -293,38 +292,37 @@ def test_apply_filters_step_part_get_resource_usage(somatic_variant_filtration_w
 # Tests for FilterToExonsStepPart ------------------------------------------------------------------
 
-# def test_filter_to_exons_step_part_get_input_files(somatic_variant_filtration_workflow):
-#     """Tests FilterToExonsStepPart.get_input_files()"""
-#     # TODO: fix use of `tumor_library` in get_input_files()
-#     wildcards = Wildcards(
-#         fromdict={
-#             "mapper": "bwa",
-#             "var_caller": "mutect2",
-#             "filter_set": "custom_exon_list",  # totally made up, not sure it should look like
-#             "tumor_library": "P001-T1-DNA1-WGS1",
-#         }
-#     )
-#     base_out = (
-#         "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
-#         "{tumor_library}/out/{mapper}.{var_caller}.{annotator}."
-#         "dkfz_bias_filter.eb_filter.{tumor_library}"
-#     )
-#     expected = {
-#         "vcf": base_out + ".vcf.gz",
-#         "vcf_tbi": base_out + ".vcf.gz.tbi",
-#     }
-#     # _ = somatic_variant_filtration_workflow.get_input_files("filter_to_exons", "run")(wildcards)
-#     _ = expected
-#     assert True
+def test_filter_to_exons_step_part_get_input_files(somatic_variant_filtration_workflow):
+    """Tests FilterToExonsStepPart.get_input_files()"""
+    # TODO: fix use of `tumor_library` in get_input_files()
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+            "var_caller": "mutect2",
+            "filter_set": "custom_exon_list",  # totally made up; not sure what it should look like
+            "tumor_library": "P001-T1-DNA1-WGS1",
+        }
+    )
+    base_out = (
+        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        "{tumor_library}/out/{mapper}.{var_caller}.{annotator}."
+        "dkfz_bias_filter.eb_filter.{tumor_library}"
+    )
+    expected = {
+        "vcf": base_out + ".vcf.gz",
+        "vcf_tbi": base_out + ".vcf.gz.tbi",
+    }
+    # _ = somatic_variant_filtration_workflow.get_input_files("filter_to_exons", "run")(wildcards)
+    _ = expected
+    assert True
 
 
 def test_filter_to_exons_step_part_get_output_files(somatic_variant_filtration_workflow):
     """Tests FilterToExonsStepPart.get_output_files()"""
     base_out = (
-        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
-        "{tumor_library}.{filter_set}.{exon_list}/out/{mapper}.{var_caller}."
-        "{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}."
-        "{exon_list}"
+        r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        r"{tumor_library,[^\.]+}.{filter_set,[^\.]+}.{exon_list}/out/{mapper}.{var_caller}."
+        r"{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.{exon_list}"
     )
     expected = get_expected_output_vcf_files_dict(base_out=base_out)
     actual = somatic_variant_filtration_workflow.get_output_files("filter_to_exons", "run")
@@ -334,10 +332,9 @@ def test_filter_to_exons_step_part_get_output_files(somatic_variant_filtration_w
 def test_filter_to_exons_step_part_get_log_file(somatic_variant_filtration_workflow):
     """Tests FilterToExonsStepPart.get_log_file()"""
     expected = (
-        "work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter."
-        "eb_filter.{tumor_library}.{filter_set}.{exon_list}/log/{mapper}.{var_caller}."
-        "{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}."
-        "{exon_list}.log"
+        r"work/{mapper}.{var_caller}.{annotator}.dkfz_bias_filter.eb_filter."
+        r"{tumor_library,[^\.]+}.{filter_set,[^\.]+}.{exon_list}/log/{mapper}.{var_caller}."
+        r"{annotator}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.{exon_list}.log"
     )
     actual = somatic_variant_filtration_workflow.get_log_file("filter_to_exons", "run")
     assert actual == expected
@@ -346,7 +343,7 @@ def test_filter_to_exons_step_part_get_log_file(somatic_variant_filtration_workf
 def test_filter_to_exons_step_part_get_resource_usage(somatic_variant_filtration_workflow):
     """Tests FilterToExonsStepPart.get_resource()"""
     # Define expected
-    expected_dict = {"threads": 2, "time": "01:00:00", "memory": "7680M", "partition": "medium"}
+    expected_dict = {"threads": 1, "time": "02:00:00", "memory": "8192M", "partition": "medium"}
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
@@ -407,8 +404,8 @@ def test_somatic_variant_filtration_workflow(somatic_variant_filtration_workflow
     expected += [
         tpl.format(mapper=mapper, filter=filter_, i=i, t=t, ext=ext, chksum=chksum)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for ext in ("log", "conda_list.txt", "conda_info.txt")
-        for chksum in ("", ".md5")
+        for ext in ("log",)
+        for chksum in ("",)
         for mapper in ("bwa",)
         for filter_ in (
             "dkfz_and_ebfilter.genome_wide",
@@ -448,14 +445,15 @@ def minimal_config_list():
           tools_ngs_mapping: ['bwa']
           tools_somatic_variant_calling: ['mutect2']
           tools_somatic_variant_annotation: ['jannovar']
+          filtration_schema: list
           filter_list:
-          - bcftools:
-              include: "include_statment"
-          - dkfz:
+          - dkfz: {}
           - ebfilter:
               threshold: 2.3
+          - bcftools:
+              include: "include_statement"
           - regions:
-              path_bed: /path/to/regions.bed
+              exclude: /path/to/regions.bed
           - protected:
               path_bed: /path/to/protected.bed
@@ -473,6 +471,7 @@ def minimal_config_list():
 
 @pytest.fixture
+# @pytest.mark.filterwarnings("ignore:.*Could not find .*")
 def somatic_variant_filtration_workflow_list(
     dummy_workflow,
     minimal_config_list,
@@ -503,7 +502,7 @@ def somatic_variant_filtration_workflow_list(
 
 def test_one_filter_step_part_get_input_files(somatic_variant_filtration_workflow_list):
-    """Tests ApplyFiltersStepPart.get_input_files()"""
+    """Tests OneFilterStepPart.get_input_files()"""
     wildcards = Wildcards(
         fromdict={
             "mapper": "bwa",
@@ -518,6 +517,21 @@ def test_one_filter_step_part_get_input_files(somatic_variant_filtratio
         "bam": "../ngs_mapping/output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}.bam",
         "normal": "../ngs_mapping/output/{mapper}.P001-N1-DNA1-WGS1/out/{mapper}.P001-N1-DNA1-WGS1.bam",
     }
+    actual = somatic_variant_filtration_workflow_list.get_input_files("one_dkfz", "run")(wildcards)
+    assert actual == expected
+
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
"var_caller": "mutect2", + "annotator": "jannovar", + "tumor_library": "P001-T1-DNA1-WGS1", + "filter_nb": 3, + } + ) + expected = { + "vcf": "work/{mapper}.{var_caller}.{annotator}.{tumor_library}/out/{mapper}.{var_caller}.{annotator}.{tumor_library}.ebfilter_2.vcf.gz", + } actual = somatic_variant_filtration_workflow_list.get_input_files("one_bcftools", "run")( wildcards ) @@ -525,7 +539,7 @@ def test_one_filter_step_part_get_input_files(somatic_variant_filtration_workflo def test_one_filter_step_part_get_output_files(somatic_variant_filtration_workflow_list): - """Tests ApplyFiltersStepPart.get_output_files()""" + """Tests OneFilterStepPart.get_output_files()""" base_out = "work/{mapper}.{var_caller}.{annotator}.{tumor_library}/out/{mapper}.{var_caller}.{annotator}.{tumor_library}.dkfz_{filter_nb}" expected = get_expected_output_vcf_files_dict(base_out=base_out) actual = somatic_variant_filtration_workflow_list.get_output_files("one_dkfz", "run") @@ -533,37 +547,51 @@ def test_one_filter_step_part_get_output_files(somatic_variant_filtration_workfl def test_one_filter_step_part_get_log_file(somatic_variant_filtration_workflow_list): - """Tests ApplyFiltersStepPart.get_log_file()""" + """Tests OneFilterStepPart.get_log_file()""" base_out = "work/{mapper}.{var_caller}.{annotator}.{tumor_library}/log/{mapper}.{var_caller}.{annotator}.{tumor_library}.ebfilter_{filter_nb}" expected = get_expected_log_files_dict(base_out=base_out) actual = somatic_variant_filtration_workflow_list.get_log_file("one_ebfilter", "run") assert actual == expected +def test_one_filter_step_part_get_params(somatic_variant_filtration_workflow_list): + """Tests OneFilterStepPart.get_params()""" + wildcards = Wildcards(fromdict={"filter_nb": 1}) + expected = {"filter_name": "dkfz_1"} + actual = somatic_variant_filtration_workflow_list.get_params("one_dkfz", "run")(wildcards) + assert actual == expected + + wildcards = Wildcards(fromdict={"filter_nb": 2}) + expected = {"filter_name": "ebfilter_2", "threshold": 2.3} + actual = somatic_variant_filtration_workflow_list.get_params("one_ebfilter", "run")(wildcards) + assert actual == expected + + wildcards = Wildcards(fromdict={"filter_nb": 3}) + expected = {"filter_name": "bcftools_3", "include": "include_statment"} + actual = somatic_variant_filtration_workflow_list.get_params("one_bcftools", "run")(wildcards) + assert actual == expected + + wildcards = Wildcards(fromdict={"filter_nb": 4}) + expected = {"filter_name": "regions_4", "exclude": "/path/to/regions.bed"} + actual = somatic_variant_filtration_workflow_list.get_params("one_regions", "run")(wildcards) + assert actual == expected + + wildcards = Wildcards(fromdict={"filter_nb": 5}) + expected = {"filter_name": "protected_5", "path_bed": "/path/to/protected.bed"} + actual = somatic_variant_filtration_workflow_list.get_params("one_protected", "run")(wildcards) + assert actual == expected + + def test_one_filter_step_part_get_resource_usage(somatic_variant_filtration_workflow_list): - """Tests ApplyFiltersStepPart.get_resource()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "var_caller": "mutect2", - "annotator": "jannovar", - "tumor_library": "P001-T1-DNA1-WGS1", - "filter_nb": 3, - } - ) + """Tests OneFilterStepPart.get_resource()""" # Define expected expected_dict = {"threads": 1, "time": "24:00:00", "memory": "2048M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
-        if resource == "threads" or resource == "partition":
-            actual = somatic_variant_filtration_workflow_list.get_resource(
-                "one_regions", "run", resource
-            )
-        else:
-            actual = somatic_variant_filtration_workflow_list.get_resource(
-                "one_regions", "run", resource
-            )(wildcards)
+        actual = somatic_variant_filtration_workflow_list.get_resource(
+            "one_ebfilter", "run", resource
+        )
         assert actual == expected, msg_error
 
 
@@ -571,13 +599,13 @@ def test_one_filter_step_part_get_resource_usage(somatic_variant_filtration_work
 def test_last_filter_step_part_get_input_files(somatic_variant_filtration_workflow_list):
-    """Tests ApplyFiltersStepPart.get_input_files()"""
+    """Tests LastFilterStepPart.get_input_files()"""
     base_tpl = "{{mapper}}.{{var_caller}}.{{annotator}}.{{tumor_library}}"
     log_tpl = "work/" + base_tpl + "/log/" + base_tpl + ".{filt}.{ext}{chksum}"
     expected = {
         "logs": [
             log_tpl.format(filt=filt, ext=ext, chksum=chksum)
-            for filt in ("bcftools_1", "dkfz_2", "ebfilter_3", "regions_4", "protected_5")
+            for filt in ("dkfz_1", "ebfilter_2", "bcftools_3", "regions_4", "protected_5")
            for ext in ("log", "conda_list.txt", "conda_info.txt")
             for chksum in ("", ".md5")
         ],
@@ -588,7 +616,7 @@ def test_last_filter_step_part_get_input_files(somatic_variant_filtratio
 
 def test_last_filter_step_part_get_output_files(somatic_variant_filtration_workflow_list):
-    """Tests ApplyFiltersStepPart.get_output_files()"""
+    """Tests LastFilterStepPart.get_output_files()"""
     base_name = "{mapper}.{var_caller}.{annotator}.filtered.{tumor_library}"
     base_out = "work/" + base_name + "/out/" + base_name
     expected = get_expected_output_vcf_files_dict(base_out=base_out)
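# --- Editor's illustration (sketch only, not part of this patch) -------------
# Under the list-based filtration schema exercised above, each `filter_list`
# entry is applied in order and tagged "<filter>_<n>" with a 1-based index,
# which is where the names "dkfz_1" through "protected_5" in the expected log
# paths come from. `filter_names` below is a hypothetical helper that
# reproduces that naming from a list-schema configuration.
def filter_names(filter_list):
    """Derive the per-step filter names from `filter_list` entries."""
    return [f"{next(iter(entry))}_{i}" for i, entry in enumerate(filter_list, start=1)]

assert filter_names(
    [{"dkfz": {}}, {"ebfilter": {}}, {"bcftools": {}}, {"regions": {}}, {"protected": {}}]
) == ["dkfz_1", "ebfilter_2", "bcftools_3", "regions_4", "protected_5"]
# ------------------------------------------------------------------------------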