Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Fixing Mantis #427

Merged
merged 7 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions snappy_pipeline/workflows/somatic_msi_calling/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,21 @@ rule somatic_msi_calling_link_out_run:

# Somatic Microsatellite Instability analysis~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Run MANTIS ------------------------------------------------------------------
# Run MANTIS_msi2 ------------------------------------------------------------------


rule somatic_msi_calling_mantis:
rule somatic_msi_calling_mantis_msi2:
input:
unpack(wf.get_input_files("mantis", "run")),
unpack(wf.get_input_files("mantis_msi2", "run")),
output:
**wf.get_output_files("mantis", "run"),
threads: wf.get_resource("mantis", "run", "threads")
**wf.get_output_files("mantis_msi2", "run"),
threads: wf.get_resource("mantis_msi2", "run", "threads")
resources:
time=wf.get_resource("mantis", "run", "time"),
memory=wf.get_resource("mantis", "run", "memory"),
partition=wf.get_resource("mantis", "run", "partition"),
tmpdir=wf.get_resource("mantis", "run", "tmpdir"),
time=wf.get_resource("mantis_msi2", "run", "time"),
memory=wf.get_resource("mantis_msi2", "run", "memory"),
partition=wf.get_resource("mantis_msi2", "run", "partition"),
tmpdir=wf.get_resource("mantis_msi2", "run", "tmpdir"),
log:
wf.get_log_file("mantis", "run"),
**wf.get_log_file("mantis_msi2", "run"),
wrapper:
wf.wrapper_path("mantis/run")
wf.wrapper_path("mantis/mantis_msi2/run")
167 changes: 112 additions & 55 deletions snappy_pipeline/workflows/somatic_msi_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
whole genomes, exomes or large panels). MANTIS starts from the aligned reads
(thus off ``ngs_mapping``) and generates a result file per tumor/normal pair.

As MANTIS is not maintained anymore, the pipeline now supports only
`MANTIS2 <https://github.com/nh13/MANTIS2>`_.
The new version appears to be very silimar to the old one, both in terms of input & output files,
and in terms of requirements.

==========
Step Input
==========
Expand All @@ -19,13 +24,13 @@

.. note:: Tool-Specific Output

As the only integrated tool is MANTIS at the moment, the output is very tailored to the result
As the only integrated tool is MANTIS2 at the moment, the output is very tailored to the result
of this tool. In the future, this section might contain "common" output and tool-specific
output sub sections.

- ``mantis.{mapper}.{lib_name}-{lib_pk}/out/``
- ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt``
- ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt.status``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}/out/``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt.status``

=====================
Default Configuration
Expand All @@ -39,14 +44,15 @@
Available Somatic Targeted CNV Caller
=====================================

- ``mantis``
- ``mantis_msi2``

"""

from collections import OrderedDict
import os
import sys

from biomedsheets.shortcuts import CancerCaseSheet, is_not_background
from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand

from snappy_pipeline.utils import dictify, listify
Expand All @@ -60,30 +66,50 @@

__author__ = "Clemens Messerschmidt <[email protected]>"

#: Extensions of files to create as main payload
EXT_VALUES = (".results.txt", ".results.txt.status", ".results.txt.md5", ".results.txt.status.md5")

#: Names of the files to create for the extension
EXT_NAMES = ("result", "status", "result_md5", "status_md5")

EXT_MATCHED = {
"mantis_msi2": {
ericblanc20 marked this conversation as resolved.
Show resolved Hide resolved
"result": ".results.txt",
"status": ".results.txt.status",
"result_md5": ".results.txt.md5",
"status_md5": ".results.txt.status.md5",
},
}

#: Available somatic variant callers assuming matched samples.
MSI_CALLERS_MATCHED = ("mantis_msi2",)


#: Default configuration for the somatic_msi_calling step
DEFAULT_CONFIG = r"""
# Default configuration somatic_msi_calling
step_config:
somatic_msi_calling:
tools: ['mantis'] # REQUIRED - available: 'mantis'
tools: ['mantis_msi2'] # REQUIRED - available: 'mantis_msi2'
path_ngs_mapping: ../ngs_mapping # REQUIRED
loci_bed: /fast/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed # REQUIRED
loci_bed: "" # REQUIRED
# hg19: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed
# hg38: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg38/GRCh38.d1.vd1.all_loci.bed
"""


class MantisStepPart(BaseStepPart):
"""Perform somatic microsatellite instability with MANTIS"""
class Mantis2StepPart(BaseStepPart):
"""Perform somatic microsatellite instability with MANTIS_msi2"""

#: Step name
name = "mantis"
name = "mantis_msi2"

#: Class available actions
actions = ("run",)

def __init__(self, parent):
super().__init__(parent)
self.base_path_out = (
"work/mantis.{{mapper}}.{{library_name}}/out/" "mantis.{{mapper}}.{{library_name}}{ext}"
"work/{{mapper}}.{msi_caller}.{{tumor_library}}/out/"
"{{mapper}}.{msi_caller}.{{tumor_library}}{ext}"
)
# Build shortcut from cancer bio sample name to matched cancer sample
self.tumor_ngs_library_to_sample_pair = OrderedDict()
Expand All @@ -92,12 +118,10 @@ def __init__(self, parent):
sheet.all_sample_pairs_by_tumor_dna_ngs_library
)

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name]
return pair.normal_sample.dna_ngs_library.name

def get_input_files(self, action):
# Validate action
self._validate_action(action)

def input_function(wildcards):
"""Helper wrapper function"""
# Get shorcut to Snakemake sub workflow
Expand All @@ -110,7 +134,7 @@ def input_function(wildcards):
)
)
tumor_base_path = (
"output/{mapper}.{library_name}/out/" "{mapper}.{library_name}"
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
Expand All @@ -119,24 +143,38 @@ def input_function(wildcards):
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}

assert action == "run", "Unsupported actions"
return input_function

@dictify
def get_normal_lib_name(self, wildcards):
ericblanc20 marked this conversation as resolved.
Show resolved Hide resolved
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name

def get_output_files(self, action):
assert action == "run", "Unsupported actions"
exts = {"result": "results.txt", "status": "results.txt.status"}
for key, ext in exts.items():
yield key, (
"work/mantis.{{mapper}}.{{library_name}}/out/"
"mantis.{{mapper}}.{{library_name}}_{sfx}"
).format(sfx=ext)

@staticmethod
def get_log_file(action):
"""Return path to log file for the given action"""
_ = action
return "work/mantis.{mapper}.{library_name}/log/snakemake.mantis_run.log"
# Validate action
self._validate_action(action)
return dict(
zip(EXT_NAMES, expand(self.base_path_out, msi_caller=[self.name], ext=EXT_VALUES))
)

@dictify
def _get_log_file(self, action):
"""Return dict of log files."""
# Validate action
self._validate_action(action)

prefix = (
"work/{{mapper}}.{msi_caller}.{{tumor_library}}/log/"
"{{mapper}}.{msi_caller}.{{tumor_library}}"
).format(msi_caller=self.__class__.name)
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"

def get_resource_usage(self, action):
"""Get Resource Usage
Expand All @@ -150,7 +188,7 @@ def get_resource_usage(self, action):
self._validate_action(action)
return ResourceUsage(
threads=3,
time="08:00:00", # 2 hours
time="08:00:00", # 8 hours
memory=f"{30 * 1024 * 3}M",
)

Expand All @@ -164,9 +202,13 @@ class SomaticMsiCallingWorkflow(BaseStep):
#: Default biomed sheet class
sheet_shortcut_class = CancerCaseSheet

sheet_shortcut_kwargs = {
"options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
}

@classmethod
def default_config_yaml(cls):
"""Return default config YAML, to be overwritten by project-specific one"""
"""Return default config YAML, to be overwritten by project-specific one."""
return DEFAULT_CONFIG

def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
Expand All @@ -179,14 +221,41 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
(NgsMappingWorkflow,),
)
# Register sub step classes so the sub steps are available
self.register_sub_step_classes((MantisStepPart, LinkOutStepPart))
self.register_sub_step_classes((Mantis2StepPart, LinkOutStepPart))
# Initialize sub-workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])

@listify
def get_result_files(self):
"""Return list of result files for the somatic targeted sequencing CNV calling step"""
tool_actions = {"mantis": ("run",)}
"""Return list of result files for the MSI calling workflow"""
name_pattern = "{mapper}.{msi_caller}.{tumor_library.name}"
for msi_caller in set(self.config["tools"]) & set(MSI_CALLERS_MATCHED):
yield from self._yield_result_files_matched(
ericblanc20 marked this conversation as resolved.
Show resolved Hide resolved
os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
msi_caller=msi_caller,
ext=EXT_MATCHED[msi_caller].values() if msi_caller in EXT_MATCHED else EXT_VALUES,
)
yield from self._yield_result_files_matched(
os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
msi_caller=msi_caller,
ext=(
".log",
".log.md5",
".conda_info.txt",
".conda_info.txt.md5",
".conda_list.txt",
".conda_list.txt.md5",
),
)

def _yield_result_files_matched(self, tpl, **kwargs):
"""Build output paths from path template and extension list.

This function returns the results from the matched msi callers such as
mantis.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
Expand All @@ -199,21 +268,9 @@ def get_result_files(self):
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
for tool in self.config["tools"]:
for action in tool_actions[tool]:
try:
tpls = self.sub_steps[tool].get_output_files(action).values()
except AttributeError:
tpls = self.sub_steps[tool].get_output_files(action)
for tpl in tpls:
filenames = expand(
tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[sample_pair.tumor_sample.dna_ngs_library.name],
)
for f in filenames:
if ".tmp." not in f:
yield f.replace("work/", "output/")
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)

def check_config(self):
"""Check that the necessary globalc onfiguration is present"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
channels:
- conda-forge
- anaconda
- bioconda
dependencies:
- numpy ==1.6.2
- pysam ==0.8.3
- mantis-msi2
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@
set -x

# Also pipe everything to log file
if [[ -n "{snakemake.log}" ]]; then
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
exec &> >(tee -a "{snakemake.log}" >&2)
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log})
exec &> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
echo "No tty, logging disabled" >"{snakemake.log}"
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi

conda list >{snakemake.log.conda_list}
conda info >{snakemake.log.conda_info}
md5sum {snakemake.log.conda_list} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_list}.md5
md5sum {snakemake.log.conda_info} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_info}.md5

# Setup auto-cleaned TMPDIR
export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
Expand All @@ -32,8 +37,10 @@
# The following config is recommended by the authors for WES,
# but should also work for reasonable deep WGS according to them.
# https://github.com/OSU-SRLab/MANTIS/issues/25
#
# Only one thread is used, see https://github.com/OSU-SRLab/MANTIS/issues/57

python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \
mantis-msi2 \
-t {snakemake.input.tumor_bam} \
-n {snakemake.input.normal_bam} \
--genome {snakemake.config[static_data_config][reference][path]} \
Expand All @@ -43,7 +50,7 @@
--min-locus-quality 25.0 \
--min-locus-coverage 20 \
--min-repeat-reads 1 \
--threads 3 \
--threads 1 \
-o $TMPDIR/out/$(basename {snakemake.output.result})

pushd $TMPDIR/out
Expand All @@ -55,3 +62,11 @@
mv $TMPDIR/out/* $(dirname {snakemake.output.result})
"""
)

# Compute MD5 sums of logs.
shell(
r"""
sleep 1s # try to wait for log file flush
md5sum {snakemake.log.log} >{snakemake.log.log_md5}
"""
)
Loading
Loading