From 35b24cd70e4352244b6d09d414c6fae761c7dece Mon Sep 17 00:00:00 2001
From: ericblanc20
Date: Thu, 24 Aug 2023 10:47:18 +0200
Subject: [PATCH] feat: (430) picard metrics for bam files (#431)

---
 .../workflows/ngs_data_qc/Snakefile           |  39 +++
 .../workflows/ngs_data_qc/__init__.py         | 231 +++++++++++++++++-
 .../wrappers/picard/environment.yaml          |   6 +
 .../wrappers/picard/metrics/environment.yaml  |   1 +
 .../wrappers/picard/metrics/wrapper.py        | 166 +++++++++++++
 .../wrappers/picard/prepare/environment.yaml  |   1 +
 .../wrappers/picard/prepare/wrapper.py        | 100 ++++++++
 .../workflows/test_workflows_ngs_data_qc.py   | 123 +++++++---
 ...t_workflows_ngs_data_qc_processed_fastq.py |   2 +-
 9 files changed, 624 insertions(+), 45 deletions(-)
 create mode 100644 snappy_wrappers/wrappers/picard/environment.yaml
 create mode 120000 snappy_wrappers/wrappers/picard/metrics/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/picard/metrics/wrapper.py
 create mode 120000 snappy_wrappers/wrappers/picard/prepare/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/picard/prepare/wrapper.py

diff --git a/snappy_pipeline/workflows/ngs_data_qc/Snakefile b/snappy_pipeline/workflows/ngs_data_qc/Snakefile
index 0d379e48c..53ebb9335 100644
--- a/snappy_pipeline/workflows/ngs_data_qc/Snakefile
+++ b/snappy_pipeline/workflows/ngs_data_qc/Snakefile
@@ -85,3 +85,42 @@ rule data_qc_fastqc_run:
         wf.get_log_file("fastqc", "run"),
     wrapper:
         wf.wrapper_path("fastqc")
+
+
+# Bam Data QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# Run picard ------------------------------------------------------------------
+
+
+rule data_qc_picard_prepare:
+    output:
+        **wf.get_output_files("picard", "prepare"),
+    threads: wf.get_resource("picard", "prepare", "threads")
+    resources:
+        time=wf.get_resource("picard", "prepare", "time"),
+        memory=wf.get_resource("picard", "prepare", "memory"),
+        partition=wf.get_resource("picard", "prepare", "partition"),
+        tmpdir=wf.get_resource("picard", "prepare", "tmpdir"),
+    log:
+        **wf.get_log_file("picard", "prepare"),
+    wrapper:
+        wf.wrapper_path("picard/prepare")
+
+
+rule data_qc_picard_metrics:
+    input:
+        unpack(wf.get_input_files("picard", "metrics")),
+    output:
+        **wf.get_output_files("picard", "metrics"),
+    params:
+        wf.get_params("picard", "metrics"),
+    threads: wf.get_resource("picard", "metrics", "threads")
+    resources:
+        time=wf.get_resource("picard", "metrics", "time"),
+        memory=wf.get_resource("picard", "metrics", "memory"),
+        partition=wf.get_resource("picard", "metrics", "partition"),
+        tmpdir=wf.get_resource("picard", "metrics", "tmpdir"),
+    log:
+        **wf.get_log_file("picard", "metrics"),
+    wrapper:
+        wf.wrapper_path("picard/metrics")
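As a quick illustration (not part of the patch): the two rules above only wire workflow-object calls into Snakemake. The sketch below shows what those calls resolve to, with paths copied from PicardStepPart and the tests further down; the mapper and library names are invented.

# Illustrative sketch: values copied from PicardStepPart and the tests below.
prepare_outputs = {
    "baits": "work/static_data/picard/out/baits.interval_list",
    "targets": "work/static_data/picard/out/targets.interval_list",
}

wildcards = {"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"}
infix = "{mapper}.{library_name}".format(**wildcards)
# The BAM is delivered by the ngs_mapping sub-workflow registered below.
metrics_bam_input = f"output/{infix}/out/{infix}.bam"
assert metrics_bam_input == "output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"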
diff --git a/snappy_pipeline/workflows/ngs_data_qc/__init__.py b/snappy_pipeline/workflows/ngs_data_qc/__init__.py
index 3268ca84d..3f4d4af76 100644
--- a/snappy_pipeline/workflows/ngs_data_qc/__init__.py
+++ b/snappy_pipeline/workflows/ngs_data_qc/__init__.py
@@ -17,6 +17,7 @@ from biomedsheets.shortcuts import GenericSampleSheet
 from snakemake.io import Namedlist, expand, touch
 
+from snappy_pipeline.base import UnsupportedActionException
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import (
     BaseStep,
@@ -24,6 +25,7 @@
     LinkInPathGenerator,
     LinkInStep,
     LinkOutStepPart,
+    ResourceUsage,
     get_ngs_library_folder_name,
 )
 
@@ -33,9 +35,66 @@
 step_config:
   ngs_data_qc:
     path_link_in: ""  # OPTIONAL Override data set configuration search paths for FASTQ files
-    tools: [fastqc]  # REQUIRED - available: 'fastqc'
+    tools: [fastqc, picard]  # REQUIRED - available: 'fastqc' & 'picard' (for QC on bam files)
+    picard:
+      path_ngs_mapping: ../ngs_mapping  # REQUIRED
+      path_to_baits: ""   # Required when CollectHsMetrics is among the programs
+      path_to_targets: "" # When missing, same as baits
+      bait_name: ""       # Exon enrichment kit name (optional)
+      programs: []        # Available metrics:
+      # * Generic metrics                        [* grouped into CollectMultipleMetrics]
+      #   - CollectAlignmentSummaryMetrics       *
+      #   - CollectBaseDistributionByCycle       *
+      #   - CollectGcBiasMetrics                 *
+      #   - CollectInsertSizeMetrics             *
+      #   - CollectJumpingLibraryMetrics
+      #   - CollectOxoGMetrics
+      #   - CollectQualityYieldMetrics           *
+      #   - CollectSequencingArtifactMetrics     *
+      #   - EstimateLibraryComplexity
+      #   - MeanQualityByCycle                   *
+      #   - QualityScoreDistribution             *
+      # * WGS-specific metrics
+      #   - CollectRawWgsMetrics
+      #   - CollectWgsMetrics
+      #   - CollectWgsMetricsWithNonZeroCoverage
+      # * Other assay-specific metrics
+      #   - CollectHsMetrics                     Whole Exome Sequencing
+      #   - CollectTargetedPcrMetrics            Panel sequencing
+      #   - CollectRnaSeqMetrics                 mRNA sequencing, not implemented yet
+      #   - CollectRrbsMetrics                   bisulfite sequencing, not implemented yet
 """
 
+MULTIPLE_METRICS = {
+    "CollectAlignmentSummaryMetrics": ["alignment_summary_metrics"],
+    "CollectBaseDistributionByCycle": ["base_distribution_by_cycle_metrics"],
+    "CollectGcBiasMetrics": ["gc_bias.summary_metrics", "gc_bias.detail_metrics"],
+    "CollectInsertSizeMetrics": ["insert_size_metrics"],
+    "CollectQualityYieldMetrics": ["quality_yield_metrics"],
+    "CollectSequencingArtifactMetrics": [
+        "pre_adapter_detail_metrics",
+        "pre_adapter_summary_metrics",
+        "bait_bias_summary_metrics",
+        "bait_bias_detail_metrics",
+    ],
+    "MeanQualityByCycle": ["quality_by_cycle_metrics"],
+    "QualityScoreDistribution": ["quality_distribution_metrics"],
+}
+ADDITIONAL_METRICS = (
+    "CollectJumpingLibraryMetrics",
+    "CollectOxoGMetrics",
+    "EstimateLibraryComplexity",
+)
+WGS_METRICS = (
+    "CollectRawWgsMetrics",
+    "CollectWgsMetrics",
+    "CollectWgsMetricsWithNonZeroCoverage",
+)
+WES_METRICS = ("CollectHsMetrics",)
+PANEL_METRICS = ("CollectTargetedPcrMetrics",)
+RNA_METRICS = ("CollectRnaSeqMetrics",)
+BISULFITE_METRICS = ("CollectRrbsMetrics",)
+
 
 class FastQcReportStepPart(BaseStepPart):
     """(Raw) data QC using FastQC"""
@@ -109,6 +168,107 @@ def _collect_reads(self, wildcards, library_name, prefix):
             yield os.path.join(self.base_path_in, path_infix, filename).format(**wildcards)
 
 
+class PicardStepPart(BaseStepPart):
+    """Collect Picard metrics"""
+
+    name = "picard"
+    actions = ("prepare", "metrics")
+
+    def __init__(self, parent):
+        super().__init__(parent)
+
+    def get_input_files(self, action):
+        self._validate_action(action)
+        if action == "prepare":
+            raise UnsupportedActionException(
+                'Action "prepare" takes no input files: baits & targets are taken from the configuration'
+            )
+
+        return self._get_input_files_metrics
+
+    @dictify
+    def _get_input_files_metrics(self, wildcards):
+        if "CollectHsMetrics" in self.config["picard"]["programs"]:
+            yield "baits", "work/static_data/picard/out/baits.interval_list"
+            yield "targets", "work/static_data/picard/out/targets.interval_list"
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+        infix = f"{wildcards.mapper}.{wildcards.library_name}"
+        yield "bam", ngs_mapping(f"output/{infix}/out/{infix}.bam")
+
+    @dictify
+    def get_output_files(self, action):
+        if action == "prepare":
+            yield "baits", "work/static_data/picard/out/baits.interval_list"
+            yield "targets", "work/static_data/picard/out/targets.interval_list"
+        elif action == "metrics":
+            base_out = "work/{mapper}.{library_name}/report/picard/{mapper}.{library_name}."
+            for pgm in self.config["picard"]["programs"]:
+                if pgm in MULTIPLE_METRICS.keys():
+                    first = MULTIPLE_METRICS[pgm][0]
+                    yield pgm, base_out + f"CollectMultipleMetrics.{first}.txt"
+                    yield pgm + "_md5", base_out + f"CollectMultipleMetrics.{first}.txt.md5"
+                else:
+                    yield pgm, base_out + pgm + ".txt"
+                    yield pgm + "_md5", base_out + pgm + ".txt.md5"
+        else:
+            actions_str = ", ".join(self.actions)
+            raise UnsupportedActionException(
+                f"Action '{action}' is not supported. Valid options: {actions_str}"
+            )
+
+    @dictify
+    def get_log_file(self, action):
+        if action == "prepare":
+            prefix = "work/static_data/picard/log/prepare"
+        elif action == "metrics":
+            prefix = "work/{mapper}.{library_name}/log/picard/{mapper}.{library_name}"
+        else:
+            actions_str = ", ".join(self.actions)
+            raise UnsupportedActionException(
+                f"Action '{action}' is not supported. Valid options: {actions_str}"
+            )
+
+        key_ext = (
+            ("wrapper", ".wrapper.py"),
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+            ("env_yaml", ".environment.yaml"),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    def get_params(self, action):
+        self._validate_action(action)
+
+        return self._get_params
+
+    @dictify
+    def _get_params(self, wildcards):
+        return {"prefix": f"{wildcards.mapper}.{wildcards.library_name}"}
+
+    def get_resource_usage(self, action):
+        """Get Resource Usage
+
+        :param action: Action (i.e., step) in the workflow, example: 'run'.
+        :type action: str
+
+        :return: Returns ResourceUsage for step.
+
+        :raises UnsupportedActionException: if action not in class defined list of valid actions.
+        """
+        if action == "prepare":
+            return super().get_resource_usage(action)
+        elif action == "metrics":
+            return ResourceUsage(threads=1, time="24:00:00", memory="24G")
+        else:
+            actions_str = ", ".join(self.actions)
+            raise UnsupportedActionException(
+                f"Action '{action}' is not supported. Valid options: {actions_str}"
+            )
+
+
 class NgsDataQcWorkflow(BaseStep):
     """Perform NGS raw data QC"""
 
@@ -124,7 +284,11 @@ def default_config_yaml(cls):
 
     def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
         super().__init__(workflow, config, config_lookup_paths, config_paths, workdir)
-        self.register_sub_step_classes((LinkInStep, LinkOutStepPart, FastQcReportStepPart))
+        self.register_sub_step_classes(
+            (LinkInStep, LinkOutStepPart, FastQcReportStepPart, PicardStepPart)
+        )
+        if "picard" in self.config["tools"]:
+            self.register_sub_workflow("ngs_mapping", self.config["picard"]["path_ngs_mapping"])
 
     @listify
     def get_result_files(self):
@@ -133,17 +297,58 @@ def get_result_files(self):
 
         We will process all NGS libraries of all test samples in all sample sheets.
         """
-        from os.path import join
-
-        name_pattern = "{ngs_library.name}"
-        # TODO: actually link out report files
-        yield from self._yield_result_files(
-            join("output", name_pattern, "report", "fastqc", ".done")
-        )
-
-    def _yield_result_files(self, tpl, **kwargs):
+        if "fastqc" in self.config["tools"]:
+            yield from self._yield_result_files(
+                tpl="output/{ngs_library.name}/report/fastqc/.done",
+                allowed_extraction_types=(
+                    "DNA",
+                    "RNA",
+                ),
+            )
+        if "picard" in self.config["tools"]:
+            tpl = (
+                "output/{mapper}.{ngs_library.name}/report/picard/{mapper}.{ngs_library.name}.{ext}"
+            )
+            exts = []
+            for pgm in self.config["picard"]["programs"]:
+                if pgm in MULTIPLE_METRICS.keys():
+                    first = MULTIPLE_METRICS[pgm][0]
+                    exts.append(f"CollectMultipleMetrics.{first}.txt")
+                    exts.append(f"CollectMultipleMetrics.{first}.txt.md5")
+                else:
+                    exts.append(pgm + ".txt")
+                    exts.append(pgm + ".txt.md5")
+            yield from self._yield_result_files(
+                tpl=tpl,
+                allowed_extraction_types=("DNA",),
+                mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+                ext=exts,
+            )
+
+    def _yield_result_files(self, tpl, allowed_extraction_types, **kwargs):
         """Build output paths from path template and extension list"""
         for sheet in self.shortcut_sheets:
             for ngs_library in sheet.all_ngs_libraries:
-                # extraction_type = ngs_library.test_sample.extra_infos['extractionType']
-                yield from expand(tpl, ngs_library=[ngs_library], **kwargs)
+                extraction_type = ngs_library.test_sample.extra_infos["extractionType"]
+                if extraction_type in allowed_extraction_types:
+                    yield from expand(tpl, ngs_library=[ngs_library], **kwargs)
+
+    def check_config(self):
+        if "picard" in self.config["tools"]:
+            self.ensure_w_config(
+                ("step_config", "ngs_data_qc", "picard", "path_ngs_mapping"),
+                "Path to ngs_mapping not configured but required for picard",
+            )
+            programs = self.config["picard"]["programs"]
+            assert len(programs) > 0, "No selected programs for collecting metrics"
+            assert all(
+                pgm in MULTIPLE_METRICS.keys()
+                or pgm in ADDITIONAL_METRICS
+                or pgm in WES_METRICS
+                or pgm in WGS_METRICS
+                for pgm in programs
+            ), "Some requested metrics programs are not implemented"
+            if "CollectHsMetrics" in programs:
+                assert self.config["picard"][
+                    "path_to_baits"
+                ], "Path to baits must be specified when using CollectHsMetrics"
diff --git a/snappy_wrappers/wrappers/picard/environment.yaml b/snappy_wrappers/wrappers/picard/environment.yaml
new file mode 100644
index 000000000..758c031e7
--- /dev/null
+++ b/snappy_wrappers/wrappers/picard/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - picard
+  - bedtools
diff --git a/snappy_wrappers/wrappers/picard/metrics/environment.yaml b/snappy_wrappers/wrappers/picard/metrics/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/picard/metrics/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/picard/metrics/wrapper.py b/snappy_wrappers/wrappers/picard/metrics/wrapper.py
new file mode 100644
index 000000000..7156197be
--- /dev/null
+++ b/snappy_wrappers/wrappers/picard/metrics/wrapper.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""CUBI+Snakemake wrapper code for picard metrics collection: Snakemake wrapper.py
+"""
+
+from snakemake import shell
+
+__author__ = "Eric Blanc"
+
+reference = snakemake.config["static_data_config"]["reference"]["path"]
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["picard"]
+
+collect_multiple_metrics_programs = {
+    "CollectAlignmentSummaryMetrics",
+    "CollectBaseDistributionByCycle",
+    "CollectGcBiasMetrics",
+    "CollectInsertSizeMetrics",
+    "CollectQualityYieldMetrics",
+    "CollectSequencingArtifactMetrics",
+    "MeanQualityByCycle",
+    "QualityScoreDistribution",
+}
+collect_multiple_metrics = " ".join(
+    [
+        f"-PROGRAM {pgm}"
+        for pgm in collect_multiple_metrics_programs.intersection(set(config["programs"]))
+    ]
+)
+
+# TODO: understand why snakemake.params is a list...
+prefix = ""
+if "prefix" in snakemake.params[0].keys() and snakemake.params[0]["prefix"]:
+    prefix = snakemake.params[0]["prefix"] + "."
+
+name = "null"
+if "bait_name" in config.keys() and config["bait_name"]:
+    name = config["bait_name"]
+
+shell.executable("/bin/bash")
+
+shell(
+    r"""
+set -x
+
+d=$(ls $CONDA_PREFIX/share | grep picard)
+picard_jar="$CONDA_PREFIX/share/$d/picard.jar"
+if [[ ! -r $picard_jar ]]
+then
+    echo "Can't find picard jar"
+    exit -1
+fi
+
+# Also pipe everything to log file
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec &> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+
+# Write out information about conda installation.
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
+
+# Setup auto-cleaned tmpdir
+export tmpdir=$(mktemp -d)
+trap "rm -rf $tmpdir" EXIT
+
+d=$(dirname {snakemake.output[0]})
+
+if [[ -n "{collect_multiple_metrics}" ]]
+then
+    # Setting METRIC_ACCUMULATION_LEVEL causes issues for some programs
+    java -jar $picard_jar CollectMultipleMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectMultipleMetrics \
+        -R {reference} \
+        -FILE_EXTENSION .txt \
+        -PROGRAM null \
+        {collect_multiple_metrics}
+fi
+
+if [[ "{config[programs]}" == *"EstimateLibraryComplexity"* ]]
+then
+    java -jar $picard_jar EstimateLibraryComplexity \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}EstimateLibraryComplexity.txt
+fi
+
+if [[ "{config[programs]}" == *"CollectJumpingLibraryMetrics"* ]]
+then
+    java -jar $picard_jar CollectJumpingLibraryMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectJumpingLibraryMetrics.txt
+fi
+
+if [[ "{config[programs]}" == *"CollectOxoGMetrics"* ]]
+then
+    if [[ -r "{snakemake.config[static_data_config][dbsnp][path]}" ]]
+    then
+        dbsnp="-DB_SNP {snakemake.config[static_data_config][dbsnp][path]}"
+    else
+        dbsnp=""
+    fi
+    java -jar $picard_jar CollectOxoGMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectOxoGMetrics.txt \
+        -R {reference} \
+        $dbsnp
+fi
+
+if [[ "{config[programs]}" == *"CollectHsMetrics"* ]]
+then
+    java -jar $picard_jar CollectHsMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectHsMetrics.txt \
+        -R {reference} \
+        -BAIT_SET_NAME {name} \
+        -BAIT_INTERVALS {snakemake.input.baits} \
+        -TARGET_INTERVALS {snakemake.input.targets}
+fi
+
+if [[ "{config[programs]}" == *"CollectRawWgsMetrics"* ]]
+then
+    java -jar $picard_jar CollectRawWgsMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectRawWgsMetrics.txt \
+        -R {reference}
+fi
+
+# The program name is matched with its quotes from the list representation, so
+# that CollectWgsMetricsWithNonZeroCoverage alone doesn't also trigger this block.
+if [[ "{config[programs]}" == *"'CollectWgsMetrics'"* ]]
+then
+    java -jar $picard_jar CollectWgsMetrics \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectWgsMetrics.txt \
+        -R {reference}
+fi
+
+if [[ "{config[programs]}" == *"CollectWgsMetricsWithNonZeroCoverage"* ]]
+then
+    java -jar $picard_jar CollectWgsMetricsWithNonZeroCoverage \
+        -I {snakemake.input.bam} \
+        -O $d/{prefix}CollectWgsMetricsWithNonZeroCoverage.txt \
+        -CHART $d/{prefix}CollectWgsMetricsWithNonZeroCoverage.pdf \
+        -R {reference}
+fi
+
+pushd $d
+for f in $(ls *.txt) ; do
+    md5sum $f >$f.md5
+done
+popd
+"""
+)
+
+# Compute MD5 sums of logs.
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
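As a quick illustration (not part of the patch): the wrapper above folds every configured program that CollectMultipleMetrics supports into a single Picard invocation via repeated -PROGRAM flags. The configured list below is made up; sorted() is only there to make the demo output deterministic, since the original uses an unordered set intersection.

# Illustrative sketch: assembling the CollectMultipleMetrics sub-program flags.
collect_multiple_metrics_programs = {
    "CollectAlignmentSummaryMetrics",
    "CollectInsertSizeMetrics",
    "MeanQualityByCycle",
}
programs = ["CollectInsertSizeMetrics", "CollectWgsMetrics"]  # hypothetical config["programs"]
flags = " ".join(
    f"-PROGRAM {pgm}"
    for pgm in sorted(collect_multiple_metrics_programs.intersection(programs))
)
print(flags)  # "-PROGRAM CollectInsertSizeMetrics"; CollectWgsMetrics runs standalone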
diff --git a/snappy_wrappers/wrappers/picard/prepare/environment.yaml b/snappy_wrappers/wrappers/picard/prepare/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/picard/prepare/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/picard/prepare/wrapper.py b/snappy_wrappers/wrappers/picard/prepare/wrapper.py
new file mode 100644
index 000000000..b6f146410
--- /dev/null
+++ b/snappy_wrappers/wrappers/picard/prepare/wrapper.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""CUBI+Snakemake wrapper code for converting exome baits & targets from bed to interval lists
+"""
+
+import os
+import re
+
+from snakemake import shell
+
+__author__ = "Eric Blanc"
+
+reference = snakemake.config["static_data_config"]["reference"]["path"]
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["picard"]
+
+reference = re.sub(r"\.fa(sta)?(\.b?gz)?$", ".dict", reference)
+assert os.path.exists(reference), "Missing dict of reference fasta"
+
+baits = config["path_to_baits"]
+targets = config.get("path_to_targets", "")
+
+shell.executable("/bin/bash")
+
+shell(
+    r"""
+set -x
+
+d=$(ls $CONDA_PREFIX/share | grep picard)
+picard_jar="$CONDA_PREFIX/share/$d/picard.jar"
+if [[ ! -r $picard_jar ]]
+then
+    echo "Can't find picard jar"
+    exit -1
+fi
+
+# Also pipe everything to log file
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec &> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+
+# Write out information about conda installation.
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
+
+# Setup auto-cleaned tmpdir
+export tmpdir=$(mktemp -d)
+trap "rm -rf $tmpdir" EXIT
+
+# Can't pipe to BedToIntervalList (https://github.com/broadinstitute/picard/issues/1890)
+bed_to_interval_list() {{
+    fn=$1
+    f=$(basename $fn)
+    cut -f 1-3 $fn \
+        | bedtools sort -i - \
+        | bedtools merge -i - \
+        > $tmpdir/$f
+    java -jar $picard_jar BedToIntervalList \
+        -I $tmpdir/$f \
+        -O /dev/stdout \
+        -SD {reference}
+}}
+
+md5() {{
+    fn=$1
+    d=$(dirname $fn)
+    f=$(basename $fn)
+    pushd $d 1> /dev/null 2>&1
+    checksum=$(md5sum $f)
+    popd 1> /dev/null 2>&1
+    echo $checksum
+}}
+
+bed_to_interval_list {baits} > {snakemake.output.baits}
+md5 {snakemake.output.baits} > {snakemake.output.baits}.md5
+
+if [[ -n "{targets}" ]]
+then
+    bed_to_interval_list {targets} > {snakemake.output.targets}
+    md5 {snakemake.output.targets} > {snakemake.output.targets}.md5
+else
+    ln -rs {snakemake.output.baits} {snakemake.output.targets}
+    ln -rs {snakemake.output.baits}.md5 {snakemake.output.targets}.md5
+fi
"""
+)
+
+# Compute MD5 sums of logs.
+shell(
+    r"""
+md5sum {snakemake.log.log} >{snakemake.log.log_md5}
+"""
+)
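As a quick illustration (not part of the patch): the prepare wrapper above derives the Picard sequence-dictionary path from the reference FASTA with a single substitution. The example paths below are made up; the regex is the one used in the wrapper.

# Illustrative sketch: the sequence-dictionary path rewrite.
import re

def dict_path(reference: str) -> str:
    return re.sub(r"\.fa(sta)?(\.b?gz)?$", ".dict", reference)

assert dict_path("/path/to/ref.fa") == "/path/to/ref.dict"
assert dict_path("/path/to/ref.fasta") == "/path/to/ref.dict"
assert dict_path("/path/to/ref.fa.gz") == "/path/to/ref.dict"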
diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py
index 28a343e75..7056f602c 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py
@@ -9,6 +9,7 @@
 
 from snappy_pipeline.workflows.ngs_data_qc import NgsDataQcWorkflow
 
+from .common import get_expected_log_files_dict
 from .conftest import patch_module_fs
 
 
@@ -24,8 +25,20 @@ def minimal_config():
           path: /path/to/ref.fa
 
         step_config:
+          ngs_mapping:
+            tools:
+              dna: [bwa]
           ngs_data_qc:
-            tools: ['fastqc']
+            tools: ['picard']
+            picard:
+              path_ngs_mapping: /NGS_MAPPING
+              path_to_baits: /path/to/baits
+              path_to_targets: /path/to/targets
+              programs:
+              - CollectAlignmentSummaryMetrics
+              - CollectOxoGMetrics
+              - CollectHsMetrics
+              - CollectWgsMetrics
 
         data_sets:
           first_batch:
@@ -53,6 +66,7 @@ def ngs_data_qc(
     mocker,
 ):
     """Return NgsDataQcWorkflow object pre-configured with germline sheet"""
+    dummy_workflow.globals = {"ngs_mapping": lambda x: "/NGS_MAPPING/" + x}
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     # Patch out files for aligner indices
@@ -67,61 +81,98 @@ def ngs_data_qc(
     )
 
 
-# Tests for FastQcReportStepPart -------------------------------------------------------------------
+# Tests for PicardStepPart -------------------------------------------------------------------
 
 
-def test_fastqc_step_part_get_args(ngs_data_qc):
-    """Tests FastQcReportStepPart.get_args()"""
+def test_picard_step_part_get_output_files_prepare(ngs_data_qc):
+    """Tests PicardStepPart.get_output_files() - prepare"""
     # Define expected
-    wildcards = Wildcards(fromdict={"library_name": "P001-N1-DNA1-WGS1"})
     expected = {
-        "num_threads": 1,
-        "more_reads": [
-            "work/input_links/P001-N1-DNA1-WGS1/FCXXXXXX/L001/P001_R1.fastq.gz",
-            "work/input_links/P001-N1-DNA1-WGS1/FCXXXXXX/L001/P001_R2.fastq.gz",
-        ],
+        "baits": "work/static_data/picard/out/baits.interval_list",
+        "targets": "work/static_data/picard/out/targets.interval_list",
     }
-    # Get actual and assert
-    actual = ngs_data_qc.get_args("fastqc", "run")(wildcards)
+    # Get actual
+    actual = ngs_data_qc.get_output_files("picard", "prepare")
     assert actual == expected
 
 
-def test_fastqc_step_part_get_input_files(ngs_data_qc):
-    """Tests FastQcReportStepPart.get_input_files()"""
+def test_picard_step_part_get_log_file_prepare(ngs_data_qc):
+    """Tests PicardStepPart.get_log_file() - prepare"""
     # Define expected
-    wildcards = Wildcards(fromdict={"library_name": "P001-N1-DNA1-WGS1"})
-    expected = "work/input_links/P001-N1-DNA1-WGS1/.done"
+    expected = get_expected_log_files_dict(
+        base_out="work/static_data/picard/log/prepare",
+        extended=True,
+    )
+    # Get actual
+    actual = ngs_data_qc.get_log_file("picard", "prepare")
+    assert actual == expected
+
+
+def test_picard_step_part_get_input_files(ngs_data_qc):
+    """Tests PicardStepPart.get_input_files() - metrics"""
+    # Define expected
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"})
+    expected = {
+        "baits": "work/static_data/picard/out/baits.interval_list",
+        "targets": "work/static_data/picard/out/targets.interval_list",
+        "bam": "/NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
+    }
     # Get actual and assert
-    actual = ngs_data_qc.get_input_files("fastqc", "run")(wildcards)
+    actual = ngs_data_qc.get_input_files("picard", "metrics")(wildcards)
     assert actual == expected
 
 
-def test_fastqc_step_part_get_output_files(ngs_data_qc):
-    """Tests FastQcReportStepPart.get_output_files()"""
+def test_picard_step_part_get_output_files_metrics(ngs_data_qc):
+    """Tests PicardStepPart.get_output_files() - metrics"""
     # Define expected
-    expected = {"fastqc_done": "work/{library_name}/report/fastqc/.done"}
+    base_out = "work/{mapper}.{library_name}/report/picard/{mapper}.{library_name}."
+    expected = {
+        "CollectAlignmentSummaryMetrics": base_out
+        + "CollectMultipleMetrics.alignment_summary_metrics.txt",
+        "CollectOxoGMetrics": base_out + "CollectOxoGMetrics.txt",
+        "CollectHsMetrics": base_out + "CollectHsMetrics.txt",
+        "CollectWgsMetrics": base_out + "CollectWgsMetrics.txt",
+        "CollectAlignmentSummaryMetrics_md5": base_out
+        + "CollectMultipleMetrics.alignment_summary_metrics.txt.md5",
+        "CollectOxoGMetrics_md5": base_out + "CollectOxoGMetrics.txt.md5",
+        "CollectHsMetrics_md5": base_out + "CollectHsMetrics.txt.md5",
+        "CollectWgsMetrics_md5": base_out + "CollectWgsMetrics.txt.md5",
+    }
+    # Get actual
+    actual = ngs_data_qc.get_output_files("picard", "metrics")
+    assert actual == expected
+
+
+def test_picard_step_part_get_log_file_metrics(ngs_data_qc):
+    """Tests PicardStepPart.get_log_file() - metrics"""
+    # Define expected
+    expected = get_expected_log_files_dict(
+        base_out="work/{mapper}.{library_name}/log/picard/{mapper}.{library_name}",
+        extended=True,
+    )
     # Get actual
-    actual = ngs_data_qc.get_output_files("fastqc", "run")
+    actual = dict(ngs_data_qc.get_log_file("picard", "metrics"))
     assert actual == expected
 
 
-def test_fastqc_step_part_get_log_file(ngs_data_qc):
-    """Tests FastQcReportStepPart.get_log_file()"""
+def test_picard_step_part_get_params(ngs_data_qc):
+    """Tests PicardStepPart.get_params() - metrics"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"})
     # Define expected
-    expected = "work/{library_name}/log/snakemake.fastqc.log"
+    expected = {"prefix": "bwa.P001-N1-DNA1-WGS1"}
     # Get actual
-    actual = ngs_data_qc.get_log_file("fastqc", "run")
+    actual = ngs_data_qc.get_params("picard", "metrics")(wildcards)
     assert actual == expected
 
 
-def test_fastqc_step_part_get_resource_usage(ngs_data_qc):
-    """Tests FastQcReportStepPart.get_resource_usage()"""
-    # Define expected: default defined in workflow.abstract
-    expected_dict = {"threads": 1, "time": "01:00:00", "memory": "2G", "partition": "medium"}
+def test_picard_step_part_get_resource_usage(ngs_data_qc):
+    """Tests PicardStepPart.get_resource_usage() - metrics"""
+    # Define expected: values from the PicardStepPart override for "metrics"
+    expected_dict = {"threads": 1, "time": "24:00:00", "memory": "24G", "partition": "medium"}
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = ngs_data_qc.get_resource("fastqc", "run", resource)
+        actual = ngs_data_qc.get_resource("picard", "metrics", resource)
         assert actual == expected, msg_error
 
 
@@ -131,7 +182,7 @@ def test_fastqc_step_part_get_resource_usage(ngs_data_qc):
 
 # Tests for NgsDataQcWorkflow ---------------------------------------------------------------------
 
 
 def test_ngs_data_qc_workflow_steps(ngs_data_qc):
     """Tests simple functionality of the workflow: checks if sub steps are created."""
     # Check created sub steps
-    expected = ["fastqc", "link_in", "link_out"]
+    expected = ["fastqc", "link_in", "link_out", "picard"]
     actual = list(sorted(ngs_data_qc.sub_steps.keys()))
     assert actual == expected
 
@@ -140,6 +191,16 @@ def test_ngs_data_qc_workflow_files(ngs_data_qc):
     """Tests simple functionality of the workflow: checks if file structure is created
     according to the expected results from the tools."""
     # Check result file construction
-    expected = [f"output/P00{i}-N1-DNA1-WGS1/report/fastqc/.done" for i in range(1, 7)]
+    expected = [
+        f"output/bwa.P00{i}-N1-DNA1-WGS1/report/picard/bwa.P00{i}-N1-DNA1-WGS1.{metric}.txt{ext}"
+        for i in range(1, 7)
+        for metric in (
+            "CollectHsMetrics",
+            "CollectMultipleMetrics.alignment_summary_metrics",
+            "CollectOxoGMetrics",
+            "CollectWgsMetrics",
+        )
+        for ext in ("", ".md5")
+    ]
     actual = sorted(ngs_data_qc.get_result_files())
     assert actual == expected
diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py
index d3f797f4f..096d6b91e 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py
@@ -134,7 +134,7 @@ def test_fastqc_step_part_get_resource_usage(ngs_data_qc):
 def test_ngs_data_qc_workflow_steps(ngs_data_qc):
     """Tests simple functionality of the workflow: checks if sub steps are created."""
     # Check created sub steps
-    expected = ["fastqc", "link_in", "link_out"]
+    expected = ["fastqc", "link_in", "link_out", "picard"]
     actual = list(sorted(ngs_data_qc.sub_steps.keys()))
     assert actual == expected
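As a quick illustration (not part of the patch): the workflow test above pins down the complete result-file list for the test configuration (six WGS libraries, bwa, four picard programs). The sketch below recomputes that list the same way, so the expected file count is easy to verify.

# Illustrative sketch: expected result files for the test configuration.
metrics = (
    "CollectHsMetrics",
    "CollectMultipleMetrics.alignment_summary_metrics",
    "CollectOxoGMetrics",
    "CollectWgsMetrics",
)
expected = sorted(
    f"output/bwa.P00{i}-N1-DNA1-WGS1/report/picard/bwa.P00{i}-N1-DNA1-WGS1.{m}.txt{ext}"
    for i in range(1, 7)
    for m in metrics
    for ext in ("", ".md5")
)
print(len(expected))  # 48 = 6 libraries x 4 metrics x 2 (.txt, .txt.md5)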