Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Somatic Update #159

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions snappy_pipeline/workflows/ngs_mapping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1269,9 +1269,11 @@ def get_result_files(self):
)
)

target_coverage = False
for sheet in self.shortcut_sheets:
for ngs_library in sheet.all_ngs_libraries:
if ngs_library.name in self.ngs_library_to_kit:
target_coverage = True
extraction_type = ngs_library.test_sample.extra_infos["extractionType"]
suffix = (
"_long"
Expand All @@ -1287,8 +1289,9 @@ def get_result_files(self):
ngs_library=[ngs_library],
ext=["txt", "txt.md5"],
)
yield "output/target_cov_report/out/target_cov_report.txt"
yield "output/target_cov_report/out/target_cov_report.txt.md5"
if target_coverage:
yield "output/target_cov_report/out/target_cov_report.txt"
yield "output/target_cov_report/out/target_cov_report.txt.md5"
if (
self.config["picard_hs_metrics"]["path_targets_interval_list"]
and self.config["picard_hs_metrics"]["path_baits_interval_list"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ def get_output_files(self, action):
"gene_log2_txt": "gene_log2.txt",
"segments_txt": "segments.txt",
}
tpl = "work/{mapper}.copywriter.{library_name}/out/{mapper}.copywriter.{library_name}_"
tpl = "work/{mapper}.copywriter.{library_name}/out/{mapper}.copywriter.{library_name}."
output_files = {}
for k, v in exts.items():
output_files[k] = tpl + v
Expand Down Expand Up @@ -908,7 +908,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
def get_result_files(self):
"""Return list of result files for the somatic targeted sequencing CNV calling step"""
tool_actions = {
"cnvkit": ("call", "report", "export", "plot"), # ("report", "export", "plot"),
"cnvkit": ("call", "report", "export", "plot"),
"copywriter": ("call",),
"cnvetti_on_target": ("coverage", "segment", "postprocess"),
"cnvetti_off_target": ("coverage", "segment", "postprocess"),
Expand Down
15 changes: 12 additions & 3 deletions snappy_pipeline/workflows/somatic_variant_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,16 @@
__author__ = "Manuel Holtgrewe <[email protected]>"

#: Extensions of files to create as main payload
EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
EXT_VALUES = (
".vcf.gz",
".vcf.gz.tbi",
".vcf.gz.md5",
".vcf.gz.tbi.md5",
".full.vcf.gz",
".full.vcf.gz.tbi",
".full.vcf.gz.md5",
".full.vcf.gz.tbi.md5",
)

#: Names of the files to create for the extension
EXT_NAMES = ("vcf", "tbi", "vcf_md5", "tbi_md5")
Expand Down Expand Up @@ -234,9 +243,9 @@
- 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37
# Configuration for MuTect 2
mutect2:
panel_of_normals: '' # Set path to panel of normals vcf if required
panel_of_normals: '' # Set path to panel of normals vcf if required
germline_resource: REQUIRED # Germline variants resource (same as panel of normals)
common_variants: REQUIRED # Common germline variants for contamination estimation
common_biallelic: REQUIRED # Common biallelic germline variants for contamination estimation
# Parallelization configuration
num_cores: 2 # number of cores to use locally
window_length: 50000000 # split input into windows of this size, each triggers a job
Expand Down
64 changes: 34 additions & 30 deletions snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,19 @@
segmentation: HaarSeg
normalization: MedianGcBinned
control_freec:
path_chrlenfile: REQUIRED #REQUIRED
path_mappability: REQUIRED #REQUIRED
path_mappability_enabled: False
window_size: -1 #set to a value >=0 you want a specific fixed window size
path_mappability: REQUIRED # REQUIRED
breakPointThreshold: 0.8
coefficientOfVariation: 0.05
contamination: 0.4
minCNAlength: 1
minMappabilityPerWindow: 0.85
minExpectedGC: 0.35
maxExpectedGC: 0.55
minimalSubclonePresence: 0.2
readCountThreshold: 10
telocentromeric: 50000
window: ~
ignore_chrom: []
convert:
org_obj: org.Hs.eg.db::org.Hs.eg.db
tx_obj: TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
Expand Down Expand Up @@ -475,40 +484,29 @@ def get_output_files(self, action):
var_caller=self.name, ext=".ratio.txt.md5"
)
elif action == "transform":
transform_ext_names = ("log2", "call", "segments", "cns", "cnr")
transform_ext_values = (
"_gene_log2.txt",
"_gene_call.txt",
"_segments.txt",
".cns",
".cnr",
)
result = dict(
zip(
transform_ext_names,
expand(self.base_path_out, var_caller=[self.name], ext=transform_ext_values),
)
)
transform = {
"log2": ".gene_log2.txt",
"call": ".gene_call.txt",
"segments": ".segments.txt",
"cns": ".cns.txt",
"cnr": ".cnr.txt",
}
for (name, value) in transform.items():
result[name] = self.base_path_out.format(var_caller=self.name, ext=value)
result[name + "_md5"] = result[name] + ".md5"
elif action == "plot":
plot_ext_names = ("heatmap", "scatter", "diagram")
plot_ext_values = (".heatmap.png", ".scatter.png", ".diagram.pdf")
result = dict(
zip(
plot_ext_names,
expand(self.base_path_out, var_caller=[self.name], ext=plot_ext_values),
)
)
plot = {"heatmap": ".heatmap.png", "scatter": ".scatter.png", "diagram": ".diagram.pdf"}
for (name, value) in plot.items():
result[name] = self.base_path_out.format(var_caller=self.name, ext=value)
result[name + "_md5"] = result[name] + ".md5"

return result

def check_config(self):
"""Check configuration for ControlFreec Somatic WGS CNV calling"""
if "control_freec" not in (self.config["tools"] or []): # pylint: disable=C0325
return # ControlFreec not enabled, skip # pragma: no cover
self.parent.ensure_w_config(
("step_config", "somatic_wgs_cnv_calling", "control_freec", "path_chrlenfile"),
"Path to ControlFreec ChrLenFile not configured",
)

self.parent.ensure_w_config(
("step_config", "somatic_wgs_cnv_calling", "control_freec", "path_mappability"),
"Path to ControlFreec mappability file not configured",
Expand Down Expand Up @@ -612,11 +610,17 @@ def get_result_files(self):
".ratio.txt",
".ratio.txt.md5",
".gene_log2.txt",
".gene_log2.txt.md5",
".gene_call.txt",
".gene_call.txt.md5",
".segments.txt",
".segments.txt.md5",
".scatter.png",
".scatter.png.md5",
".heatmap.png",
".heatmap.png.md5",
".diagram.pdf",
".diagram.pdf.md5",
],
)
# Plots for cnvetti
Expand Down
5 changes: 4 additions & 1 deletion snappy_wrappers/wrappers/eb_filter/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,13 @@
fi
fi

# Hack: get back bin directory of base/root environment - find `snappy_vcf_sort`
export PATH=$PATH:$(dirname $(dirname $(which conda)))/bin

# Used to be:
# filter='FILTER == "germline_risk" || FILTER == "t_lod_fstar" || FILTER == "OffExome" || ANN ~ "stream_gene_variant"'
if [[ {snakemake.input.vcf} == *"mutect2"* ]]; then
filter='FILTER == "germline" || FILTER == "weak_evidence" || FILTER == "OffExome" || ANN ~ "stream_gene_variant"'
filter='FILTER ~ "germline" || FILTER ~ "weak_evidence" || FILTER ~ "OffExome" || ANN ~ "stream_gene_variant"'
{cmd_fetch} \
| bcftools view \
-e "$filter" \
Expand Down
10 changes: 5 additions & 5 deletions snappy_wrappers/wrappers/mutect2/contamination/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
export LD_LIBRARY_PATH=$(dirname $(which bgzip))/../lib

# Also pipe everything to log file
if [[ -n "{snakemake.log}" ]]; then
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
exec &> >(tee -a "{snakemake.log}" >&2)
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
exec &> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
echo "No tty, logging disabled" >"{snakemake.log}"
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi

Expand Down
10 changes: 5 additions & 5 deletions snappy_wrappers/wrappers/mutect2/filter/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
export LD_LIBRARY_PATH=$(dirname $(which bgzip))/../lib

# Also pipe everything to log file
if [[ -n "{snakemake.log}" ]]; then
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
exec &> >(tee -a "{snakemake.log}" >&2)
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
exec &> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
echo "No tty, logging disabled" >"{snakemake.log}"
rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi

Expand Down
9 changes: 5 additions & 4 deletions snappy_wrappers/wrappers/mutect2/pileup/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
__author__ = "Manuel Holtgrewe <[email protected]>"

reference = snakemake.config["static_data_config"]["reference"]["path"]
common_variants = snakemake.config["step_config"]["somatic_variant_calling"]["mutect2"][
"common_variants"
common_biallelic = snakemake.config["step_config"]["somatic_variant_calling"]["mutect2"][
"common_biallelic"
]

shell.executable("/bin/bash")
Expand Down Expand Up @@ -44,8 +44,8 @@
gatk --java-options '-Xms4000m -Xmx8000m' GetPileupSummaries \
--input {snakemake.input.bam} \
--reference {reference} \
--variant {common_variants} \
--intervals {common_variants} \
--variant {common_biallelic} \
--intervals {common_biallelic} \
--output $out_base.pileup

pushd $TMPDIR && \
Expand All @@ -54,6 +54,7 @@
done && \
popd

mkdir -p $(dirname {snakemake.output.pileup})
mv $out_base.* $(dirname {snakemake.output.pileup})
"""
)
13 changes: 6 additions & 7 deletions snappy_wrappers/wrappers/mutect_par/parallel_mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,13 @@
class ParallelMutectWrapper(ParallelSomaticVariantCallingBaseWrapper):
"""Parallel execution of MuTect"""

# TODO: probably, nobody looked at anything but the vcf/tbi files... get rid of them?
realpath_output_keys = (
"vcf",
"vcf_md5",
"tbi",
"tbi_md5",
"full_vcf",
"full_vcf_md5",
"full",
"full_md5",
"full_tbi",
"full_tbi",
"full_tbi_md5",
Expand All @@ -48,8 +47,8 @@ class ParallelMutectWrapper(ParallelSomaticVariantCallingBaseWrapper):
"vcf_md5": "vcf.gz.md5",
"tbi": "vcf.gz.tbi",
"tbi_md5": "vcf.gz.tbi.md5",
"full_vcf": "full.vcf.gz",
"full_vcf_md5": "full.vcf.gz.md5",
"full": "full.vcf.gz",
"full_md5": "full.vcf.gz.md5",
"full_tbi": "full.vcf.gz.tbi",
"full_tbi_md5": "full.vcf.gz.tbi.md5",
"wig": "full.wig.txt.gz",
Expand Down Expand Up @@ -135,8 +134,8 @@ def construct_merge_rule(self):
mkdir -p $(dirname {{output.txt}})
mv output/result.full.out.txt.gz {{output.txt}}
mv output/result.full.out.txt.gz.md5 {{output.txt_md5}}
mv output/result.full.vcf.gz {{output.full_vcf}}
mv output/result.full.vcf.gz.md5 {{output.full_vcf_md5}}
mv output/result.full.vcf.gz {{output.full}}
mv output/result.full.vcf.gz.md5 {{output.full_md5}}
mv output/result.full.vcf.gz.tbi {{output.full_tbi}}
mv output/result.full.vcf.gz.tbi.md5 {{output.full_tbi_md5}}
mv output/result.vcf.gz {{output.vcf}}
Expand Down
2 changes: 1 addition & 1 deletion snappy_wrappers/wrappers/scalpel/somatic/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@

# split out somatic variants
bcftools view \
--include 'INFO/INH=="no" & INFO/SOMATIC==1' \
--include 'FILTER=="PASS" & INFO/INH=="no" & INFO/SOMATIC==1' \
{snakemake.output.full_vcf} \
| bgzip -c \
> {snakemake.output.vcf}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -854,14 +854,14 @@ def test_copywriter_step_part_get_output_files_call(somatic_targeted_seq_cnv_cal
"""Tests CopywriterStepPart.get_output_files() - action 'call'"""
base_name = "work/{mapper}.copywriter.{library_name}/out/{mapper}.copywriter.{library_name}"
expected = {
"bins_txt": base_name + "_bins.txt",
"bins_txt_md5": base_name + "_bins.txt.md5",
"gene_call_txt": base_name + "_gene_call.txt",
"gene_call_txt_md5": base_name + "_gene_call.txt.md5",
"gene_log2_txt": base_name + "_gene_log2.txt",
"gene_log2_txt_md5": base_name + "_gene_log2.txt.md5",
"segments_txt": base_name + "_segments.txt",
"segments_txt_md5": base_name + "_segments.txt.md5",
"bins_txt": base_name + ".bins.txt",
"bins_txt_md5": base_name + ".bins.txt.md5",
"gene_call_txt": base_name + ".gene_call.txt",
"gene_call_txt_md5": base_name + ".gene_call.txt.md5",
"gene_log2_txt": base_name + ".gene_log2.txt",
"gene_log2_txt_md5": base_name + ".gene_log2.txt.md5",
"segments_txt": base_name + ".segments.txt",
"segments_txt_md5": base_name + ".segments.txt.md5",
}
actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("copywriter", "call")
assert actual == expected
Expand Down Expand Up @@ -1042,7 +1042,7 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call
# copywriter
tpl = (
"output/bwa.copywriter.P00{i}-T{t}-DNA1-WGS1/out/"
"bwa.copywriter.P00{i}-T{t}-DNA1-WGS1_{ext}"
"bwa.copywriter.P00{i}-T{t}-DNA1-WGS1.{ext}"
)
expected += [
tpl.format(i=i, t=t, ext=ext)
Expand All @@ -1060,6 +1060,5 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call
]
expected = list(sorted(expected))
actual = list(sorted(somatic_targeted_seq_cnv_calling_workflow.get_result_files()))
# HACK TODO
actual = [f for f in actual if "/log/" not in f]
assert expected == actual
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def minimal_config():
somatic_variant_calling:
tools:
- mutect
- mutect2
- scalpel
scalpel:
path_target_regions: /path/to/target/regions.bed
Expand Down Expand Up @@ -925,7 +926,7 @@ def test_somatic_variant_calling_workflow(somatic_variant_calling_workflow):
# Perform the tests
#
# Check created sub steps
expected = ["link_out", "mutect", "scalpel"]
expected = ["link_out", "mutect", "mutect2", "scalpel"]
assert set(expected).issubset(list(sorted(somatic_variant_calling_workflow.sub_steps.keys())))
# Check result file construction
tpl = (
Expand All @@ -935,9 +936,18 @@ def test_somatic_variant_calling_workflow(somatic_variant_calling_workflow):
expected = [
tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext)
for i, t in ((1, 1), (2, 1), (2, 2))
for ext in ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5")
for ext in (
"vcf.gz",
"vcf.gz.md5",
"vcf.gz.tbi",
"vcf.gz.tbi.md5",
"full.vcf.gz",
"full.vcf.gz.md5",
"full.vcf.gz.tbi",
"full.vcf.gz.tbi.md5",
)
for mapper in ("bwa",)
for var_caller in ("mutect", "scalpel")
for var_caller in ("mutect", "mutect2", "scalpel")
]
# add log files
tpl = (
Expand All @@ -956,7 +966,7 @@ def test_somatic_variant_calling_workflow(somatic_variant_calling_workflow):
"log.md5",
)
for mapper in ("bwa",)
for var_caller in ("mutect", "scalpel")
for var_caller in ("mutect", "mutect2", "scalpel")
]
expected = list(sorted(expected))
actual = list(sorted(somatic_variant_calling_workflow.get_result_files()))
Expand Down
Loading