Skip to content

Commit

Permalink
feat: clean-up manta. TODO: enable exome & rna
Browse files Browse the repository at this point in the history
  • Loading branch information
ericblanc20 committed Dec 11, 2024
1 parent 5d11f31 commit ef6c75a
Show file tree
Hide file tree
Showing 8 changed files with 300 additions and 55 deletions.
23 changes: 22 additions & 1 deletion snappy_pipeline/workflows/somatic_wgs_sv_calling/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,39 @@ rule somatic_wgs_sv_calling_link_out_run:
# Run Manta -------------------------------------------------------------------


rule somatic_wgs_sv_calling_manta_ignore_chroms:
input:
unpack(wf.get_input_files("manta", "ignore_chroms")),
output:
**wf.get_output_files("manta", "ignore_chroms"),
params:
**{"args": wf.get_args("manta", "ignore_chroms")},
threads: wf.get_resource("manta", "ignore_chroms", "threads")
resources:
time=wf.get_resource("manta", "ignore_chroms", "time"),
memory=wf.get_resource("manta", "ignore_chroms", "memory"),
partition=wf.get_resource("manta", "ignore_chroms", "partition"),
tmpdir=wf.get_resource("manta", "ignore_chroms", "tmpdir"),
log:
**wf.get_log_file("manta", "ignore_chroms"),
wrapper:
wf.wrapper_path("manta/ignore_chroms")


rule somatic_wgs_sv_calling_manta_run:
input:
unpack(wf.get_input_files("manta", "run")),
output:
**wf.get_output_files("manta", "run"),
params:
**{"args": wf.get_args("manta", "run")},
threads: wf.get_resource("manta", "run", "threads")
resources:
time=wf.get_resource("manta", "run", "time"),
memory=wf.get_resource("manta", "run", "memory"),
partition=wf.get_resource("manta", "run", "partition"),
tmpdir=wf.get_resource("manta", "run", "tmpdir"),
log:
wf.get_log_file("manta", "run"),
**wf.get_log_file("manta", "run"),
wrapper:
wf.wrapper_path("manta/somatic_wgs")
96 changes: 84 additions & 12 deletions snappy_pipeline/workflows/somatic_wgs_sv_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
from collections import OrderedDict

from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand
from snakemake.io import Wildcards, expand

from snappy_pipeline.utils import dictify, listify
from snappy_pipeline.workflows.abstract import (
Expand Down Expand Up @@ -118,6 +118,7 @@ class SomaticWgsSvCallingStepPart(BaseStepPart):

def __init__(self, parent):
super().__init__(parent)
self.cfg = self.config[self.name]
self.base_path_out = (
"work/{{mapper}}.{var_caller}.{{cancer_library}}/out/"
"{{mapper}}.{var_caller}.{{cancer_library}}{ext}"
Expand Down Expand Up @@ -187,23 +188,94 @@ class MantaStepPart(SomaticWgsSvCallingStepPart):
name = "manta"

#: Class available actions
actions = ("run",)
actions = ("ignore_chroms", "run")

def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage:
"""Get Resource Usage
resource_usage = {
"ignore_chroms": ResourceUsage(threads=1, time="1:00:00", memory="4096M"),
"run": ResourceUsage(threads=16, time="1-16:00:00", memory="64000M"),
}

:param action: Action (i.e., step) in the workflow, example: 'run'.
:type action: str
def get_input_files(self, action):
# Validate action
self._validate_action(action)
match action:
case "ignore_chroms":
return self._get_input_files_ignore_chroms
case "run":
return self._get_input_files_run

:return: Returns ResourceUsage for step.
"""
def get_args(self, action: str) -> dict[str, str]:
# Validate action
self._validate_action(action)
return ResourceUsage(
threads=16,
time="1-16:00:00", # 1 day and 16 hours
memory=f"{int(3.75 * 1024 * 16)}M",

match action:
case "run":
args = {"extra_config": dict(self.cfg.extra_config)}
args["extra_config"]["reference"] = self.w_config.static_data_config.reference.path
args["extended_options"] = dict(self.cfg.extended_options)
return args
case "ignore_chroms":
return {"ignore_chroms": self.cfg.ignore_chroms}

def get_output_files(self, action: str) -> dict[str, str]:
# Validate action
self._validate_action(action)

match action:
case "run":
tpl = "work/{mapper}.manta.{cancer_library}/out/{mapper}.manta.{cancer_library}."
return {
"vcf": tpl + "vcf.gz",
"vcf_tbi": tpl + "vcf.gz.tbi",
"candidate": tpl + "candidate.vcf.gz",
"candidate_tbi": tpl + "candidate.vcf.gz.tbi",
"vcf_md5": tpl + "vcf.gz.md5",
"vcf_tbi_md5": tpl + "vcf.gz.tbi.md5",
"candidate_md5": tpl + "candidate.vcf.gz.md5",
"candidate_tbi_md5": tpl + "candidate.vcf.gz.tbi.md5",
}
case "ignore_chroms":
return {"callRegions": "work/{mapper}.manta/out/callRegions.bed.gz"}

@dictify
def get_log_file(self, action):
# Validate action
self._validate_action(action)
match action:
case "ignore_chroms":
tpl = "work/{mapper}.manta/log/callRegions."
case "run":
tpl = "work/{mapper}.manta.{cancer_library}/log/{mapper}.manta.{cancer_library}."
for ext in ("conda_info.txt", "conda_list.txt", "log", "sh"):
yield ext.replace(".txt", ""), tpl + ext
yield ext.replace(".txt", "") + "_md5", tpl + ext + ".md5"

def _get_input_files_ignore_chroms(self, wildcards: Wildcards) -> dict[str, str]:
return {"reference": self.w_config.static_data_config.reference.path}

def _get_input_files_run(self, wildcards: Wildcards) -> dict[str, str]:
input_files = {"reference": self.w_config.static_data_config.reference.path}
if len(self.cfg.ignore_chroms) > 0:
input_files["callRegions"] = "work/{mapper}.manta/out/callRegions.bed.gz".format(
**wildcards
)

ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
cancer_library = wildcards["cancer_library"]
tpl = "output/{mapper}.{library}/out/{mapper}.{library}"
input_files["tumor_bam"] = ngs_mapping(
tpl.format(mapper=wildcards["mapper"], library=cancer_library) + ".bam"
)
input_files["tumor_bai"] = input_files["tumor_bam"] + ".bai"

if self.cancer_ngs_library_to_sample_pair.get(cancer_library, None) is not None:
normal_library = self.get_normal_lib_name(wildcards)
input_files["normal_bam"] = ngs_mapping(
tpl.format(mapper=wildcards["mapper"], library=normal_library) + ".bam"
)
input_files["normal_bai"] = input_files["normal_bam"] + ".bai"

return input_files


class Delly2StepPart(BaseStepPart):
Expand Down
108 changes: 107 additions & 1 deletion snappy_pipeline/workflows/somatic_wgs_sv_calling/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,114 @@ class Tool(enum.StrEnum):
delly2 = "delly2"


class MantaExtraConfig(SnappyModel):
minCandidateVariantSize: int = 8
rnaMinCandidateVariantSize: int = 1000
"""
Run discovery and candidate reporting for all SVs/indels at or above this size
Separate option (to provide different default) used for runs in RNA-mode
"""

minEdgeObservations: int = 3
"""
Remove all edges from the graph unless they're supported by this many 'observations'.
Note that one supporting read pair or split read usually equals one observation, but evidence is sometimes downweighted.
"""

graphNodeMaxEdgeCount: int = 10
"""If both nodes of an edge have an edge count higher than this, then skip evaluation of the edge. Set to 0 to turn this filtration off"""

minCandidateSpanningCount: int = 3
"""Run discovery and candidate reporting for all SVs/indels with at least this many spanning support observations"""

minScoredVariantSize: int = 50
"""After candidate identification, only score and report SVs/indels at or above this size"""

minDiploidVariantScore: int = 10
"""Minimum VCF "QUAL" score for a variant to be included in the diploid vcf"""

minPassDiploidVariantScore: int = 20
"""VCF "QUAL" score below which a variant is marked as filtered in the diploid vcf"""

minPassDiploidGTScore: int = 15
"""Minimum genotype quality score below which single samples are filtered for a variant in the diploid vcf"""

minSomaticScore: int = 10
"""Somatic quality scores below this level are not included in the somatic vcf"""

minPassSomaticScore: int = 30
"""Somatic quality scores below this level are filtered in the somatic vcf"""

enableRemoteReadRetrievalForInsertionsInGermlineCallingModes: int = 1
enableRemoteReadRetrievalForInsertionsInCancerCallingModes: int = 0
"""
Remote read retrieval is used ot improve the assembly of putative insertions by retrieving any mate reads in remote
locations with poor mapping quality, which pair to confidently mapping reads near the insertion locus. These reads
can help to fully assemble longer insertions, under certain circumstances this feature can add a very large runtime
burden. For instance, given the very high chimeric pair rates found in degraded FFPE samples, the runtime of the read
retrieval process can be unpredicable. For this reason the feature is disabled by default for somatic variant calling.
This feature can be enabled/disabled separately for germline and cancer calling below.
Here "CancerCallingModes" includes tumor-normal subtraction and tumor-only calling. "GermlineCallingModes" includes all other calling modes.
"""

useOverlapPairEvidence: int = 0
"""Set if an overlapping read pair will be considered as evidence. Set to 0 to skip overlapping read pairs"""

enableEvidenceSignalFilter: int = 1
"""Set the filter on candidates of insignificant evidence signal. This is forced to 0 for runs in RNA-mode"""


class MantaExtendedOptions(SnappyModel):
existingAlignStatsFile: str | None = None
"""Pre-calculated alignment statistics file. Skips alignment stats calculation."""

useExistingChromDepths: bool = False
"""Use pre-calculated chromosome depths."""

generateEvidenceBam: bool = False
"""Generate a bam of supporting reads for all SVs."""

outputContig: bool = False
"""Output assembled contig sequences in VCF file."""

region: list[str] = []
"""
Limit the analysis to a region of the genome for debugging purposes.
If this argument is provided multiple times all specified regions will be analyzed together.
All regions must be non-overlapping to get a meaningful result. Examples:
- '--region chr20' (whole chromosome),
- '--region chr2:100-2000 --region chr3:2500-3000' (two regions)
If this option is specified (one or more times) together with the --callRegions BED file,
then all region arguments will be intersected with the callRegions BED track.
"""

retainTempFiles: bool = False
"""Keep all temporary files (for workflow debugging)"""

scanSizeMb: int = 12
"""Maximum sequence region size (in megabases) scanned by each task during SV Locus graph generation."""

callMemMb: int | None = None
"""
Set default task memory requirement (in megabytes) for common tasks.
This may benefit an analysis of unusual depth, chimera rate, etc..
'Common' tasks refers to most compute intensive scatter-phase tasks of graph creation and candidate generation.
"""


class Manta(SnappyModel):
pass
ignore_chroms: list[str] = []

unstrandedRNA: bool = False
"""Set if RNA-Seq input is unstranded: Allows splice-junctions on either strand"""

extra_config: MantaExtraConfig = MantaExtraConfig()
extended_options: MantaExtendedOptions = MantaExtendedOptions()


class Delly2(SnappyModel):
Expand Down
6 changes: 6 additions & 0 deletions snappy_wrappers/wrappers/manta/ignore_chroms/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- htslib
22 changes: 22 additions & 0 deletions snappy_wrappers/wrappers/manta/ignore_chroms/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
"""Wrapper for running Manta in somatic variant calling mode on WGS data"""

import os

from snappy_wrappers.simple_wrapper import SimpleWrapper
from snappy_wrappers.tools.genome_windows import ignore_chroms

__author__ = "Eric Blanc"
__email__ = "[email protected]"

args = snakemake.params.get("args", {})

bed = str(snakemake.output.callRegions).replace(".gz", "")

with open(bed, "wt") as f:
for contig, length in ignore_chroms(str(snakemake.input.reference), set(args["ignore_chroms"]), return_ignored=False):
print(f"{contig}\t0\t{length}", file=f)

cmd=f"bgzip {bed} ; tabix {snakemake.output.callRegions}"

SimpleWrapper(snakemake, cmd).run_bash()
3 changes: 2 additions & 1 deletion snappy_wrappers/wrappers/manta/somatic_wgs/environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ channels:
- bioconda
- nodefaults
dependencies:
- manta==1.5.0
- manta==1.6.0
- boost-cpp==1.85
Loading

0 comments on commit ef6c75a

Please sign in to comment.