Skip to content

Commit

Permalink
refactor: scarHRD now takes advantage of sequenza in the somatic_targ…
Browse files Browse the repository at this point in the history
…eted_seq_cnv_calling step
  • Loading branch information
ericblanc20 committed Aug 30, 2023
1 parent c6ba0a5 commit a6ffd19
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 262 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ rule homologous_recombination_deficiency_link_out_run:
rule homologous_recombination_deficiency_scarHRD_install:
output:
**wf.get_output_files("scarHRD", "install"),
threads: wf.get_resource("scarHRD", "install", "threads")
params:
packages=[
{"name": "sztup/scarHRD", "repo": "github"},
{"name": "aroneklund/copynumber", "repo": "github"},
],
resources:
time=wf.get_resource("scarHRD", "install", "time"),
memory=wf.get_resource("scarHRD", "install", "memory"),
Expand All @@ -70,22 +74,7 @@ rule homologous_recombination_deficiency_scarHRD_install:
log:
**wf.get_log_file("scarHRD", "install"),
wrapper:
wf.wrapper_path("scarHRD/install")


rule homologous_recombination_deficiency_scarHRD_gcreference:
output:
**wf.get_output_files("scarHRD", "gcreference"),
threads: wf.get_resource("scarHRD", "gcreference", "threads")
resources:
time=wf.get_resource("scarHRD", "gcreference", "time"),
memory=wf.get_resource("scarHRD", "gcreference", "memory"),
partition=wf.get_resource("scarHRD", "gcreference", "partition"),
tmpdir=wf.get_resource("scarHRD", "gcreference", "tmpdir"),
log:
**wf.get_log_file("scarHRD", "gcreference"),
wrapper:
wf.wrapper_path("scarHRD/gcreference")
wf.wrapper_path("r")


rule homologous_recombination_deficiency_scarHRD_run:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
"""

from collections import OrderedDict
import sys

from biomedsheets.shortcuts import CancerCaseSheet, is_not_background
Expand All @@ -66,7 +65,9 @@
LinkOutStepPart,
ResourceUsage,
)
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
from snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import (
SomaticTargetedSeqCnvCallingWorkflow,
)

__author__ = "Eric Blanc <[email protected]>"

Expand All @@ -75,8 +76,8 @@
# Default configuration homologous_recombination_deficiency
step_config:
homologous_recombination_deficiency:
tools: ['scarHRD'] # REQUIRED - available: 'mantis'
path_ngs_mapping: ../ngs_mapping # REQUIRED
tools: ['scarHRD'] # REQUIRED - available: 'scarHRD'
path_cnv_calling: ../somatic_targeted_seq_cnv_calling # REQUIRED
scarHRD:
genome_name: "grch37" # Must be either "grch37", "grch38" or "mouse"
chr_prefix: False
Expand All @@ -93,81 +94,31 @@ class ScarHRDStepPart(BaseStepPart):
#: Class available actions
actions = (
"install",
"gcreference",
"run",
)

def __init__(self, parent):
super().__init__(parent)
self.base_path_out = (
"work/{{mapper}}.scarHRD.{{library_name}}/out/{{mapper}}.scarHRD.{{library_name}}{ext}"
)
# Build shortcut from cancer bio sample name to matched cancer sample
self.tumor_ngs_library_to_sample_pair = OrderedDict()
for sheet in self.parent.shortcut_sheets:
self.tumor_ngs_library_to_sample_pair.update(
sheet.all_sample_pairs_by_tumor_dna_ngs_library
)

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name]
return pair.normal_sample.dna_ngs_library.name

def get_input_files(self, action):
def input_function_run(wildcards):
"""Helper wrapper function"""
# Get shorcut to Snakemake sub workflow
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
# Get names of primary libraries of the selected cancer bio sample and the
# corresponding primary normal sample
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=self.get_normal_lib_name(wildcards), **wildcards
)
)
tumor_base_path = (
"output/{mapper}.{library_name}/out/" "{mapper}.{library_name}"
).format(**wildcards)
return {
"lib_path": "work/R_packages/out/.done",
"gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format(
genome_name=self.config["scarHRD"]["genome_name"],
length=self.config["scarHRD"]["length"],
),
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
self._validate_action(action)

if action == "install":
return None
elif action == "gcreference":
return None
elif action == "run":
return input_function_run
else:
raise UnsupportedActionException(
"Action '{action}' is not supported. Valid options: {valid}".format(
action=action, valid=", ".join(self.actions)
)
)
return self._get_input_files_run

@dictify
def _get_input_files_run(self, wildcards):
self.cnv_calling = self.parent.sub_workflows["cnv_calling"]
base_name = f"{wildcards.mapper}.{wildcards.caller}.{wildcards.library_name}"
yield "done", "work/R_packages/out/scarHRD.done"
yield "seqz", self.cnv_calling(f"output/{base_name}/out/{base_name}.seqz.gz")

def get_output_files(self, action):
if action == "install":
return {"lib_path": "work/R_packages/out/.done"}
elif action == "gcreference":
return {
"gc": "work/static_data/out/{genome_name}_{length}.wig.gz".format(
genome_name=self.config["scarHRD"]["genome_name"],
length=self.config["scarHRD"]["length"],
)
}
return {"done": "work/R_packages/out/scarHRD.done"}
elif action == "run":
return {
"sequenza": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.seqz.gz",
"scarHRD": "work/{mapper}.scarHRD.{library_name}/out/{mapper}.scarHRD.{library_name}.json",
"scarHRD": "work/{mapper}.{caller}.scarHRD.{library_name}/out/{mapper}.{caller}.scarHRD.{library_name}.json",
"scarHRD_md5": "work/{mapper}.{caller}.scarHRD.{library_name}/out/{mapper}.{caller}.scarHRD.{library_name}.json.md5",
}
else:
raise UnsupportedActionException(
Expand All @@ -180,14 +131,9 @@ def get_output_files(self, action):
def _get_log_file(self, action):
"""Return dict of log files."""
if action == "install":
prefix = "work/R_packages/log/R_packages"
elif action == "gcreference":
prefix = "work/static_data/log/{genome_name}_{length}".format(
genome_name=self.config["scarHRD"]["genome_name"],
length=self.config["scarHRD"]["length"],
)
prefix = "work/R_packages/log/scarHRD"
elif action == "run":
prefix = "work/{mapper}.scarHRD.{library_name}/log/{mapper}.scarHRD.{library_name}"
prefix = "work/{mapper}.{caller}.scarHRD.{library_name}/log/{mapper}.{caller}.scarHRD.{library_name}"
else:
raise UnsupportedActionException(
"Action '{action}' is not supported. Valid options: {valid}".format(
Expand All @@ -204,32 +150,15 @@ def _get_log_file(self, action):
yield key + "_md5", prefix + ext + ".md5"

def get_resource_usage(self, action):
"""Get Resource Usage
:param action: Action (i.e., step) in the workflow, example: 'run'.
:type action: str
:return: Returns ResourceUsage for step.
"""
if action == "install" or action == "gcreference":
self._validate_action(action)
if action == "run":
return ResourceUsage(
threads=1,
time="02:00:00", # 2 hours
memory="4096M",
partition="short",
)
elif action == "run":
return ResourceUsage(
threads=2,
time="48:00:00", # 2 hours
memory="32G",
time="24:00:00",
)
else:
raise UnsupportedActionException(
"Action '{action}' is not supported. Valid options: {valid}".format(
action=action, valid=", ".join(self.actions)
)
)
return super().get_resource_usage(action)


class HomologousRecombinationDeficiencyWorkflow(BaseStep):
Expand All @@ -253,16 +182,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
config_lookup_paths,
config_paths,
workdir,
(NgsMappingWorkflow,),
(SomaticTargetedSeqCnvCallingWorkflow,),
)
# Register sub step classes so the sub steps are available
self.register_sub_step_classes((ScarHRDStepPart, LinkOutStepPart))
# Initialize sub-workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
self.register_sub_workflow(
"somatic_targeted_seq_cnv_calling", self.config["path_cnv_calling"], "cnv_calling"
)

@listify
def get_result_files(self):
"""Return list of result files for the somatic targeted sequencing CNV calling step"""
"""Return list of result files for the homologous recombination deficiency step"""
tool_actions = {"scarHRD": ("run",)}
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
Expand All @@ -282,14 +213,17 @@ def get_result_files(self):
tpls = self.sub_steps[tool].get_output_files(action).values()
except AttributeError:
tpls = self.sub_steps[tool].get_output_files(action)
tpls = list(tpls)
tpls += list(self.sub_steps[tool].get_log_file(action).values())
for tpl in tpls:
filenames = expand(
tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
caller=["sequenza"],
library_name=[sample_pair.tumor_sample.dna_ngs_library.name],
)
for f in filenames:
if ".tmp." not in f:
if ".tmp." not in f and not f.endswith("/.done"):
yield f.replace("work/", "output/")

def check_config(self):
Expand All @@ -298,3 +232,6 @@ def check_config(self):
("static_data_config", "reference", "path"),
"Path to reference FASTA file not configured but required",
)
assert (
"sequenza" in self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"]
)
2 changes: 0 additions & 2 deletions snappy_wrappers/wrappers/scarHRD/environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ channels:
- bioconda
dependencies:
- python =3.9
- sequenza-utils
- r-sequenza
- r-devtools
- r-data.table
- samtools

This file was deleted.

49 changes: 0 additions & 49 deletions snappy_wrappers/wrappers/scarHRD/gcreference/wrapper.py

This file was deleted.

1 change: 0 additions & 1 deletion snappy_wrappers/wrappers/scarHRD/install/environment.yaml

This file was deleted.

48 changes: 0 additions & 48 deletions snappy_wrappers/wrappers/scarHRD/install/wrapper.py

This file was deleted.

5 changes: 3 additions & 2 deletions snappy_wrappers/wrappers/scarHRD/run/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@
| sequenza-utils seqz_binning -w {length} -s - \
| gzip > {snakemake.output.sequenza}
export R_LIBS_PATH="{lib_path}"
export VROOM_CONNECTION_SIZE=2000000000
cat << __EOF | R --vanilla --slave
.libPaths(c("{lib_path}", .libPaths()))
Sys.setenv(VROOM_CONNECTION_SIZE=2000000000)
library("scarHRD")
tbl <- scar_score("{snakemake.output.sequenza}", reference="{genome_name}", seqz=TRUE, chr.in.name={chr_in_name})
Expand Down
Loading

0 comments on commit a6ffd19

Please sign in to comment.