Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implementing PureCN segmentation & CNV calling #449

Merged
merged 23 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
64e2a89
feat: wrapper to install R packages from CRAN, Bioconductor, github, …
ericblanc20 Aug 30, 2023
a5c4aa2
Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…
ericblanc20 Aug 30, 2023
06397e2
style: make black happy
ericblanc20 Aug 30, 2023
cca9a3e
feat: Added sequenza CNV tool (to be used by scarHRD)
ericblanc20 Aug 30, 2023
595ce3d
style: make snakefmt happy
ericblanc20 Aug 30, 2023
c6ba0a5
Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…
ericblanc20 Aug 30, 2023
a6ffd19
refactor: scarHRD now takes advantage of sequenza in the somatic_targ…
ericblanc20 Aug 30, 2023
418db7c
Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…
ericblanc20 Aug 31, 2023
32fe867
docs: describe scanHRD & sequenza relation
ericblanc20 Sep 14, 2023
ea2b9a8
docs: make flake8 happy
ericblanc20 Sep 14, 2023
d168dcc
feat: adds PureCN panel of normals creation
ericblanc20 Sep 14, 2023
0990d48
Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…
ericblanc20 Sep 14, 2023
9b205ed
refact: interval files moved to output
ericblanc20 Sep 14, 2023
49b97d6
fix: re-add the somatic_cnv_checking step removed by error in commit …
ericblanc20 Sep 14, 2023
d6f470f
feat: wrapper to install singularity/apptainer images
ericblanc20 Oct 4, 2023
f6db2c0
feat: save the genomics DB for PureCN
ericblanc20 Oct 4, 2023
abd8fd7
refact: PureCN panel of normals with singularity container
ericblanc20 Oct 4, 2023
12ec666
fix: typo
ericblanc20 Oct 4, 2023
2a5fa9a
Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…
ericblanc20 Oct 4, 2023
c8f0a7c
refactor: Allow input from snakemake or from config
ericblanc20 Oct 5, 2023
0a7eeb7
feat: Add support for purecn (requires purecn pon)
ericblanc20 Oct 5, 2023
6ece8b3
fix: Update resource usage & enumerate purcecn outputs
ericblanc20 Oct 10, 2023
e60ffc5
refactor: removed requirements made obsolete by using containers
ericblanc20 Oct 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,40 @@ rule somatic_targeted_seq_cnv_calling_sequenza_report:
**wf.get_log_file("sequenza", "report"),
wrapper:
wf.wrapper_path("sequenza/report")


# Run PureCN ------------------------------------------------------


rule somatic_targeted_seq_cnv_calling_purecn_coverage:
input:
unpack(wf.get_input_files("purecn", "coverage")),
output:
**wf.get_output_files("purecn", "coverage"),
threads: wf.get_resource("purecn", "coverage", "threads")
resources:
time=wf.get_resource("purecn", "coverage", "time"),
memory=wf.get_resource("purecn", "coverage", "memory"),
partition=wf.get_resource("purecn", "coverage", "partition"),
tmpdir=wf.get_resource("purecn", "coverage", "tmpdir"),
log:
**wf.get_log_file("purecn", "coverage"),
wrapper:
wf.wrapper_path("purecn/coverage")


rule somatic_targeted_seq_cnv_calling_purecn_run:
input:
unpack(wf.get_input_files("purecn", "run")),
output:
**wf.get_output_files("purecn", "run"),
threads: wf.get_resource("purecn", "run", "threads")
resources:
time=wf.get_resource("purecn", "run", "time"),
memory=wf.get_resource("purecn", "run", "memory"),
partition=wf.get_resource("purecn", "run", "partition"),
tmpdir=wf.get_resource("purecn", "run", "tmpdir"),
log:
**wf.get_log_file("purecn", "run"),
wrapper:
wf.wrapper_path("purecn/run")
213 changes: 158 additions & 55 deletions snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,24 @@
features: EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75
prefix: ''
nThread: 8
purecn:
genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3
enrichment_kit_name: "unknown" # For filename only...
mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw
reptiming: "" # Nothing for GRCh38
seed: 1234567
extra_commands: # Recommended extra arguments for PureCN, extra_arguments: [] to clear them all
model: betabin
"fun-segmentation": PSCBS
"post-optimize": "" # post-optimize is a flag
# A PureCN panel of normals is required, with the container, the intervals & the PON rds file
path_container: REQUIRED # ../panel_of_normals/work/containers/out/purecn.simg
path_intervals: REQUIRED # ../panel_of_normals/output/purecn/out/<enrichement_kit_name>_<genome_name>.list
path_panel_of_normals: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.panel_of_normals.rds
path_mapping_bias: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.mapping_bias.rds
# IMPORTANT NOTE: Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true"
somatic_variant_caller: "mutect2"
path_somatic_variants: ../somatic_variant_calling_for_purecn
cnvetti_on_target:
path_target_regions: REQUIRED # REQUIRED
cnvetti_off_target:
Expand Down Expand Up @@ -237,6 +255,18 @@ def get_normal_lib_name(self, wildcards):
pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name]
return pair.normal_sample.dna_ngs_library.name

@staticmethod
@dictify
def _get_log_file_from_prefix(prefix):
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"


class CnvettiStepPartBase(SomaticTargetedSeqCnvCallingStepPart):
"""Perform somatic targeted CNV calling using CNVetti; shared code.
Expand Down Expand Up @@ -373,19 +403,13 @@ def check_config(self):
"Path to target regions is missing for {}".format(self.name),
)

@dictify
def _get_log_file(self, action):
"""Return path to log file for the given action"""
# Validate action
self._validate_action(action)
name_pattern = self.name_pattern.format(action=action)
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)
for key, ext in key_ext:
yield key, os.path.join("work", name_pattern, "log", name_pattern + ext)
prefix = os.path.join("work", name_pattern, "log", name_pattern)
return self._get_log_file_from_prefix(prefix)

def get_resource_usage(self, action):
"""Get Resource Usage
Expand Down Expand Up @@ -537,37 +561,128 @@ def get_params(self, action):
def _get_params_report(self, wildcards):
return wildcards["library_name"]

@dictify
def get_log_file(self, action):
"""Return dict of log files."""
# Validate action
self._validate_action(action)
if action == "install":
prefix = "work/R_packages/log/sequenza"
elif action == "gcreference":
prefix = "work/static_data/log/sequenza.{length}".format(
length=self.config["sequenza"]["length"],
)
elif action == "run":
prefix = (
"work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.run"
)
elif action == "report":
prefix = (
"work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.report"
)
else:
raise UnsupportedActionException(
"Action '{action}' is not supported. Valid options: {valid}".format(
action=action, valid=", ".join(self.actions)
name_pattern = "{mapper}.sequenza.{library_name}"
prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action)
return self._get_log_file_from_prefix(prefix)


class PureCNStepPart(SomaticTargetedSeqCnvCallingStepPart):
"""Perform somatic targeted CNV calling using PureCN"""

#: Step name
name = "purecn"

#: Class available actions
actions = ("coverage", "run")

resource_usage = {
"coverage": ResourceUsage(
threads=1,
time="04:00:00", # 4 hours
memory="24G",
),
"run": ResourceUsage(
threads=4,
time="24:00:00", # 4 hours
memory="96G",
),
}

def get_input_files(self, action):
"""Return input paths input function, dependent on rule"""
# Validate action
self._validate_action(action)
action_mapping = {
"coverage": self._get_input_files_coverage,
"run": self._get_input_files_run,
}
return action_mapping[action]

@dictify
def _get_input_files_run(self, wildcards):
name_pattern = "{mapper}.purecn.{library_name}".format(**wildcards)
yield "tumor", os.path.join(
"work",
name_pattern,
"out",
"{mapper}.{library_name}_coverage_loess.txt.gz".format(**wildcards),
)
name_pattern = "{mapper}.{caller}.{library_name}".format(
caller=self.config["purecn"]["somatic_variant_caller"],
**wildcards,
)
base_path = os.path.join("output", name_pattern, "out", name_pattern + ".full.vcf.gz")
variant_calling = self.parent.sub_workflows["somatic_variants"]
yield "vcf", variant_calling(base_path)

@dictify
def _get_input_files_coverage(self, wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
name_pattern = "{mapper}.{library_name}".format(**wildcards)
base_path = os.path.join("output", name_pattern, "out", name_pattern)
yield "bam", ngs_mapping(base_path + ".bam")
yield "bai", ngs_mapping(base_path + ".bam.bai")

def get_output_files(self, action):
"""Return output paths, dependent on rule"""
# Validate action
self._validate_action(action)
name_pattern = "{mapper}.purecn.{library_name}"
prefix = os.path.join("work", name_pattern, "out", "{library_name}")
action_mapping = {
"coverage": {
"coverage": os.path.join(
"work", name_pattern, "out", "{mapper}.{library_name}_coverage_loess.txt.gz"
)
)
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
},
"run": {
"segments": prefix + "_dnacopy.seg",
"ploidy": prefix + ".csv",
"pvalues": prefix + "_amplification_pvalues.csv",
"vcf": prefix + ".vcf.gz",
"vcf_tbi": prefix + ".vcf.gz.tbi",
"loh": prefix + "_loh.csv",
"segments_md5": prefix + "_dnacopy.seg.md5",
"ploidy_md5": prefix + ".csv.md5",
"pvalues_md5": prefix + "_amplification_pvalues.csv.md5",
"vcf_md5": prefix + ".vcf.gz.md5",
"vcf_tbi_md5": prefix + ".vcf.gz.tbi.md5",
"loh_md5": prefix + "_loh.csv.md5",
},
}
return action_mapping[action]

def get_log_file(self, action):
"""Return dict of log files."""
# Validate action
self._validate_action(action)

name_pattern = "{mapper}.purecn.{library_name}"
prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action)
return self._get_log_file_from_prefix(prefix)

def check_config(self):
if self.name not in self.config["tools"]:
return # skip check
self.parent.ensure_w_config(
("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_panel_of_normals"),
"Path to the PureCN panel of normal is missing",
)
self.parent.ensure_w_config(
("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_mapping_bias"),
"Path to the PureCN mapping bias file is missing (created in the panel_of_normals step)",
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"


class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart):
Expand Down Expand Up @@ -837,16 +952,7 @@ def get_log_file(self, action):
"work/{{mapper}}.cnvkit.{{library_name}}/log/"
"{{mapper}}.cnvkit.{action}.{{library_name}}"
).format(action=action)
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)
log_files = {}
for key, ext in key_ext:
log_files[key] = prefix + ext
log_files[key + "_md5"] = prefix + ext + ".md5"
return log_files
return self._get_log_file_from_prefix(prefix)


class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart):
Expand Down Expand Up @@ -975,29 +1081,18 @@ def check_config(self):
"Path to target regions is missing",
)

@dictify
def _get_log_file(self, action):
def get_log_file(self, action):
"""Return path to log file for the given action"""
# Validate action
self._validate_action(action)

key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)

if action == "prepare":
log_file = "work/copywriter.prepare/log/snakemake.log"
yield "log", log_file
yield "log_md5", log_file + ".md5"
elif action in ("call", "run"):
tpl = (
"work/{mapper}.copywriter.{library_name}/log/{mapper}.copywriter.{library_name}."
+ action
)
for key, ext in key_ext:
yield key, tpl + ext
return {"log": log_file, "log_md5": log_file + ".md5"}
else:
name_pattern = "{mapper}.copywriter.{library_name}"
prefix = os.path.join("work", name_pattern, "log", name_pattern + "." + action)
return self._get_log_file_from_prefix(prefix)


class SomaticTargetedSeqCnvCallingWorkflow(BaseStep):
Expand Down Expand Up @@ -1035,18 +1130,26 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
CnvKitStepPart,
CopywriterStepPart,
SequenzaStepPart,
PureCNStepPart,
LinkOutStepPart,
)
)
# Initialize sub-workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
if "purecn" in self.config["tools"]:
self.register_sub_workflow(
"somatic_variant_calling",
self.config["purecn"]["path_somatic_variants"],
"somatic_variants",
)

@listify
def get_result_files(self):
"""Return list of result files for the somatic targeted sequencing CNV calling step"""
tool_actions = {
"cnvkit": ["fix", "postprocess", "report", "export"],
"sequenza": ("run",),
"purecn": ("run",),
"copywriter": ("call",),
"cnvetti_on_target": ("coverage", "segment", "postprocess"),
"cnvetti_off_target": ("coverage", "segment", "postprocess"),
Expand Down
Loading
Loading