diff --git a/snappy_pipeline/workflows/somatic_msi_calling/Snakefile b/snappy_pipeline/workflows/somatic_msi_calling/Snakefile index fc5c07f36..94eda4c85 100644 --- a/snappy_pipeline/workflows/somatic_msi_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_msi_calling/Snakefile @@ -51,21 +51,21 @@ rule somatic_msi_calling_link_out_run: # Somatic Microsatellite Instability analysis~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Run MANTIS ------------------------------------------------------------------ +# Run MANTIS_msi2 ------------------------------------------------------------------ -rule somatic_msi_calling_mantis: +rule somatic_msi_calling_mantis_msi2: input: - unpack(wf.get_input_files("mantis", "run")), + unpack(wf.get_input_files("mantis_msi2", "run")), output: - **wf.get_output_files("mantis", "run"), - threads: wf.get_resource("mantis", "run", "threads") + **wf.get_output_files("mantis_msi2", "run"), + threads: wf.get_resource("mantis_msi2", "run", "threads") resources: - time=wf.get_resource("mantis", "run", "time"), - memory=wf.get_resource("mantis", "run", "memory"), - partition=wf.get_resource("mantis", "run", "partition"), - tmpdir=wf.get_resource("mantis", "run", "tmpdir"), + time=wf.get_resource("mantis_msi2", "run", "time"), + memory=wf.get_resource("mantis_msi2", "run", "memory"), + partition=wf.get_resource("mantis_msi2", "run", "partition"), + tmpdir=wf.get_resource("mantis_msi2", "run", "tmpdir"), log: - wf.get_log_file("mantis", "run"), + **wf.get_log_file("mantis_msi2", "run"), wrapper: - wf.wrapper_path("mantis/run") + wf.wrapper_path("mantis/mantis_msi2/run") diff --git a/snappy_pipeline/workflows/somatic_msi_calling/__init__.py b/snappy_pipeline/workflows/somatic_msi_calling/__init__.py index e3734ab55..a2831bc31 100644 --- a/snappy_pipeline/workflows/somatic_msi_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_msi_calling/__init__.py @@ -5,6 +5,11 @@ whole genomes, exomes or large panels). MANTIS starts from the aligned reads (thus off ``ngs_mapping``) and generates a result file per tumor/normal pair. +As MANTIS is not maintained anymore, the pipeline now supports only +`MANTIS2 `_. +The new version appears to be very silimar to the old one, both in terms of input & output files, +and in terms of requirements. + ========== Step Input ========== @@ -19,13 +24,13 @@ .. note:: Tool-Specific Output - As the only integrated tool is MANTIS at the moment, the output is very tailored to the result + As the only integrated tool is MANTIS2 at the moment, the output is very tailored to the result of this tool. In the future, this section might contain "common" output and tool-specific output sub sections. -- ``mantis.{mapper}.{lib_name}-{lib_pk}/out/`` - - ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt`` - - ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt.status`` +- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}/out/`` + - ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt`` + - ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt.status`` ===================== Default Configuration @@ -39,14 +44,15 @@ Available Somatic Targeted CNV Caller ===================================== -- ``mantis`` +- ``mantis_msi2`` """ from collections import OrderedDict +import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, is_not_background +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand from snappy_pipeline.utils import dictify, listify @@ -60,30 +66,50 @@ __author__ = "Clemens Messerschmidt " +#: Extensions of files to create as main payload +EXT_VALUES = (".results.txt", ".results.txt.status", ".results.txt.md5", ".results.txt.status.md5") + +#: Names of the files to create for the extension +EXT_NAMES = ("result", "status", "result_md5", "status_md5") + +EXT_MATCHED = { + "mantis_msi2": { + "result": ".results.txt", + "status": ".results.txt.status", + "result_md5": ".results.txt.md5", + "status_md5": ".results.txt.status.md5", + }, +} + +#: Available somatic variant callers assuming matched samples. +MSI_CALLERS_MATCHED = ("mantis_msi2",) + + #: Default configuration for the somatic_msi_calling step DEFAULT_CONFIG = r""" # Default configuration somatic_msi_calling step_config: somatic_msi_calling: - tools: ['mantis'] # REQUIRED - available: 'mantis' + tools: ['mantis_msi2'] # REQUIRED - available: 'mantis_msi2' path_ngs_mapping: ../ngs_mapping # REQUIRED - loci_bed: /fast/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed # REQUIRED + loci_bed: "" # REQUIRED + # hg19: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed + # hg38: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg38/GRCh38.d1.vd1.all_loci.bed """ -class MantisStepPart(BaseStepPart): - """Perform somatic microsatellite instability with MANTIS""" +class Mantis2StepPart(BaseStepPart): + """Perform somatic microsatellite instability with MANTIS_msi2""" - #: Step name - name = "mantis" + name = "mantis_msi2" - #: Class available actions actions = ("run",) def __init__(self, parent): super().__init__(parent) self.base_path_out = ( - "work/mantis.{{mapper}}.{{library_name}}/out/" "mantis.{{mapper}}.{{library_name}}{ext}" + "work/{{mapper}}.{msi_caller}.{{tumor_library}}/out/" + "{{mapper}}.{msi_caller}.{{tumor_library}}{ext}" ) # Build shortcut from cancer bio sample name to matched cancer sample self.tumor_ngs_library_to_sample_pair = OrderedDict() @@ -92,12 +118,10 @@ def __init__(self, parent): sheet.all_sample_pairs_by_tumor_dna_ngs_library ) - def get_normal_lib_name(self, wildcards): - """Return name of normal (non-cancer) library""" - pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name] - return pair.normal_sample.dna_ngs_library.name - def get_input_files(self, action): + # Validate action + self._validate_action(action) + def input_function(wildcards): """Helper wrapper function""" # Get shorcut to Snakemake sub workflow @@ -110,7 +134,7 @@ def input_function(wildcards): ) ) tumor_base_path = ( - "output/{mapper}.{library_name}/out/" "{mapper}.{library_name}" + "output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}" ).format(**wildcards) return { "normal_bam": ngs_mapping(normal_base_path + ".bam"), @@ -119,24 +143,38 @@ def input_function(wildcards): "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"), } - assert action == "run", "Unsupported actions" return input_function - @dictify + def get_normal_lib_name(self, wildcards): + """Return name of normal (non-cancer) library""" + pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] + return pair.normal_sample.dna_ngs_library.name + def get_output_files(self, action): - assert action == "run", "Unsupported actions" - exts = {"result": "results.txt", "status": "results.txt.status"} - for key, ext in exts.items(): - yield key, ( - "work/mantis.{{mapper}}.{{library_name}}/out/" - "mantis.{{mapper}}.{{library_name}}_{sfx}" - ).format(sfx=ext) - - @staticmethod - def get_log_file(action): - """Return path to log file for the given action""" - _ = action - return "work/mantis.{mapper}.{library_name}/log/snakemake.mantis_run.log" + # Validate action + self._validate_action(action) + return dict( + zip(EXT_NAMES, expand(self.base_path_out, msi_caller=[self.name], ext=EXT_VALUES)) + ) + + @dictify + def _get_log_file(self, action): + """Return dict of log files.""" + # Validate action + self._validate_action(action) + + prefix = ( + "work/{{mapper}}.{msi_caller}.{{tumor_library}}/log/" + "{{mapper}}.{msi_caller}.{{tumor_library}}" + ).format(msi_caller=self.__class__.name) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" def get_resource_usage(self, action): """Get Resource Usage @@ -150,7 +188,7 @@ def get_resource_usage(self, action): self._validate_action(action) return ResourceUsage( threads=3, - time="08:00:00", # 2 hours + time="08:00:00", # 8 hours memory=f"{30 * 1024 * 3}M", ) @@ -164,9 +202,13 @@ class SomaticMsiCallingWorkflow(BaseStep): #: Default biomed sheet class sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + @classmethod def default_config_yaml(cls): - """Return default config YAML, to be overwritten by project-specific one""" + """Return default config YAML, to be overwritten by project-specific one.""" return DEFAULT_CONFIG def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): @@ -179,14 +221,41 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) (NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available - self.register_sub_step_classes((MantisStepPart, LinkOutStepPart)) + self.register_sub_step_classes((Mantis2StepPart, LinkOutStepPart)) # Initialize sub-workflows self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) @listify def get_result_files(self): - """Return list of result files for the somatic targeted sequencing CNV calling step""" - tool_actions = {"mantis": ("run",)} + """Return list of result files for the MSI calling workflow""" + name_pattern = "{mapper}.{msi_caller}.{tumor_library.name}" + for msi_caller in set(self.config["tools"]) & set(MSI_CALLERS_MATCHED): + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + msi_caller=msi_caller, + ext=EXT_MATCHED[msi_caller].values() if msi_caller in EXT_MATCHED else EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + msi_caller=msi_caller, + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched msi callers such as + mantis. + """ for sheet in filter(is_not_background, self.shortcut_sheets): for sample_pair in sheet.all_sample_pairs: if ( @@ -199,21 +268,9 @@ def get_result_files(self): ) print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) continue - for tool in self.config["tools"]: - for action in tool_actions[tool]: - try: - tpls = self.sub_steps[tool].get_output_files(action).values() - except AttributeError: - tpls = self.sub_steps[tool].get_output_files(action) - for tpl in tpls: - filenames = expand( - tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - library_name=[sample_pair.tumor_sample.dna_ngs_library.name], - ) - for f in filenames: - if ".tmp." not in f: - yield f.replace("work/", "output/") + yield from expand( + tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs + ) def check_config(self): """Check that the necessary globalc onfiguration is present""" diff --git a/snappy_wrappers/wrappers/mantis/run/environment.yaml b/snappy_wrappers/wrappers/mantis/mantis_msi2/run/environment.yaml similarity index 53% rename from snappy_wrappers/wrappers/mantis/run/environment.yaml rename to snappy_wrappers/wrappers/mantis/mantis_msi2/run/environment.yaml index 8426cdf70..ee30de636 100644 --- a/snappy_wrappers/wrappers/mantis/run/environment.yaml +++ b/snappy_wrappers/wrappers/mantis/mantis_msi2/run/environment.yaml @@ -1,7 +1,5 @@ channels: - conda-forge -- anaconda - bioconda dependencies: -- numpy ==1.6.2 -- pysam ==0.8.3 +- mantis-msi2 diff --git a/snappy_wrappers/wrappers/mantis/run/wrapper.py b/snappy_wrappers/wrappers/mantis/mantis_msi2/run/wrapper.py similarity index 63% rename from snappy_wrappers/wrappers/mantis/run/wrapper.py rename to snappy_wrappers/wrappers/mantis/mantis_msi2/run/wrapper.py index 82fa7b8b4..e424261d0 100644 --- a/snappy_wrappers/wrappers/mantis/run/wrapper.py +++ b/snappy_wrappers/wrappers/mantis/mantis_msi2/run/wrapper.py @@ -14,16 +14,21 @@ set -x # Also pipe everything to log file -if [[ -n "{snakemake.log}" ]]; then +if [[ -n "{snakemake.log.log}" ]]; then if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log}) - exec &> >(tee -a "{snakemake.log}" >&2) + rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) else - rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log}) - echo "No tty, logging disabled" >"{snakemake.log}" + rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" fi fi +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_list}.md5 +md5sum {snakemake.log.conda_info} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_info}.md5 + # Setup auto-cleaned TMPDIR export TMPDIR=$(mktemp -d) trap "rm -rf $TMPDIR" EXIT @@ -32,8 +37,10 @@ # The following config is recommended by the authors for WES, # but should also work for reasonable deep WGS according to them. # https://github.com/OSU-SRLab/MANTIS/issues/25 +# +# Only one thread is used, see https://github.com/OSU-SRLab/MANTIS/issues/57 -python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \ +mantis-msi2 \ -t {snakemake.input.tumor_bam} \ -n {snakemake.input.normal_bam} \ --genome {snakemake.config[static_data_config][reference][path]} \ @@ -43,7 +50,7 @@ --min-locus-quality 25.0 \ --min-locus-coverage 20 \ --min-repeat-reads 1 \ - --threads 3 \ + --threads 1 \ -o $TMPDIR/out/$(basename {snakemake.output.result}) pushd $TMPDIR/out @@ -55,3 +62,11 @@ mv $TMPDIR/out/* $(dirname {snakemake.output.result}) """ ) + +# Compute MD5 sums of logs. +shell( + r""" +sleep 1s # try to wait for log file flush +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py index 6d52a1d84..c3c486107 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py @@ -9,6 +9,7 @@ from snappy_pipeline.workflows.somatic_msi_calling import SomaticMsiCallingWorkflow +from .common import get_expected_log_files_dict from .conftest import patch_module_fs __author__ = "Clemens Messerschmidt" @@ -32,7 +33,7 @@ def minimal_config(): bwa: path_index: /path/to/bwa/index.fasta somatic_msi_calling: - tools: ['mantis'] + tools: ["mantis_msi2"] path_ngs_mapping: ../ngs_mapping # REQUIRED loci_bed: /path/to/hg19/loci.bed # REQUIRED @@ -73,70 +74,99 @@ def somatic_msi_calling_workflow( ) -# Tests for FeatureCountsStepPart ------------------------------------------------------------------ +# Tests for Mantis_msi2 ------------------------------------------------------------------ -def test_mantis_step_part_get_input_files(somatic_msi_calling_workflow): - """Tests FeatureCountsStepPart.get_input_files()""" - wildcards = Wildcards(fromdict={"library_name": "P001-T1-DNA1-WGS1", "mapper": "bwa"}) +def test_mantis_msi2_step_part_get_input_files(somatic_msi_calling_workflow): + """Tests mantis_msi2.get_input_files()""" + wildcards = Wildcards(fromdict={"tumor_library": "P001-T1-DNA1-WGS1", "mapper": "bwa"}) expected = { "normal_bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", "normal_bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "tumor_bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", "tumor_bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", } - actual = somatic_msi_calling_workflow.get_input_files("mantis", "run")(wildcards) + actual = somatic_msi_calling_workflow.get_input_files("mantis_msi2", "run")(wildcards) assert actual == expected -def test_mantis_step_part_get_output_files(somatic_msi_calling_workflow): - """Tests FeatureCountsStepPart.get_output_files()""" - # Define expected - base_name_out = "work/mantis.{mapper}.{library_name}/out/mantis.{mapper}.{library_name}_results" +def test_mantis_msi2_step_part_get_output_files(somatic_msi_calling_workflow): + """Test mantis_msi2.get_output_files""" + base_out = "work/{mapper}.mantis_msi2.{tumor_library}/out/{mapper}.mantis_msi2.{tumor_library}" expected = { - "result": base_name_out + ".txt", - "status": base_name_out + ".txt.status", + "result": base_out + ".results.txt", + "status": base_out + ".results.txt.status", + "result_md5": base_out + ".results.txt.md5", + "status_md5": base_out + ".results.txt.status.md5", } - # Get actual - actual = somatic_msi_calling_workflow.get_output_files("mantis", "run") + actual = somatic_msi_calling_workflow.get_output_files("mantis_msi2", "run") assert actual == expected -def test_mantis_step_part_get_log_file(somatic_msi_calling_workflow): - """Tests FeatureCountsStepPart.get_log_file()""" - expected = "work/mantis.{mapper}.{library_name}/log/snakemake.mantis_run.log" - actual = somatic_msi_calling_workflow.get_log_file("mantis", "run") +def test_mantis_msi2_step_part_get_log_files(somatic_msi_calling_workflow): + """Tests mantis_msi2.get_log_files()""" + base_out = ( + "work/{mapper}.mantis_msi2.{tumor_library}/log/" "{mapper}.mantis_msi2.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_msi_calling_workflow.get_log_file("mantis_msi2", "run") assert actual == expected -def test_mantis_step_part_get_resource_usage(somatic_msi_calling_workflow): - """Tests FeatureCountsStepPart.get_resource()""" +def test_mantis_msi2_step_part_get_resource_usage( + somatic_msi_calling_workflow, +): + """Tests mantis_msi2.get_resource_usage()""" # Define expected - expected_dict = {"threads": 3, "time": "08:00:00", "memory": "92160M", "partition": "medium"} + expected_dict = {"threads": 3, "time": "08:00:00", "memory": f"{30 * 1024 * 3}M"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_msi_calling_workflow.get_resource("mantis", "run", resource) + actual = somatic_msi_calling_workflow.get_resource("mantis_msi2", "run", resource) assert actual == expected, msg_error -# Tests for SomaticMsiCallingWorkflow -------------------------------------------------------------- +# Tests for somatic msi calling workflow------------------------------------------------------------------ def test_somatic_msi_calling_workflow(somatic_msi_calling_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "mantis"] - assert list(sorted(somatic_msi_calling_workflow.sub_steps.keys())) == expected + expected = ["link_out", "mantis_msi2"] + assert set(expected).issubset(list(sorted(somatic_msi_calling_workflow.sub_steps.keys()))) + # Check result file construction + tpl = ( + "output/{mapper}.{msi_caller}.P00{i}-T{t}-DNA1-WGS1/{dir_}/" + "{mapper}.{msi_caller}.P00{i}-T{t}-DNA1-WGS1{ext}" + ) expected = [ - "output/mantis.bwa.P001-T1-DNA1-WGS1/out/mantis.bwa.P001-T1-DNA1-WGS1_results.txt", - "output/mantis.bwa.P001-T1-DNA1-WGS1/out/mantis.bwa.P001-T1-DNA1-WGS1_results.txt.status", - "output/mantis.bwa.P002-T1-DNA1-WGS1/out/mantis.bwa.P002-T1-DNA1-WGS1_results.txt", - "output/mantis.bwa.P002-T1-DNA1-WGS1/out/mantis.bwa.P002-T1-DNA1-WGS1_results.txt.status", - "output/mantis.bwa.P002-T2-DNA1-WGS1/out/mantis.bwa.P002-T2-DNA1-WGS1_results.txt", - "output/mantis.bwa.P002-T2-DNA1-WGS1/out/mantis.bwa.P002-T2-DNA1-WGS1_results.txt.status", + tpl.format(mapper=mapper, msi_caller=msi_caller, i=i, t=t, ext=ext, dir_="out") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + ".results.txt", + ".results.txt.status", + ".results.txt.md5", + ".results.txt.status.md5", + ) + for mapper in ("bwa",) + for msi_caller in ("mantis_msi2",) ] - actual = set(somatic_msi_calling_workflow.get_result_files()) - expected = set(expected) - assert actual == expected + expected += [ + tpl.format(mapper=mapper, msi_caller=msi_caller, i=i, t=t, ext=ext, dir_="log") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + ".conda_info.txt", + ".conda_list.txt", + ".log", + ".conda_info.txt.md5", + ".conda_list.txt.md5", + ".log.md5", + ) + for mapper in ("bwa",) + for msi_caller in ("mantis_msi2",) + ] + expected = list(sorted(expected)) + + actual = list(sorted(somatic_msi_calling_workflow.get_result_files())) + assert expected == actual