From 8aaa171162433dd9f2f32fdd8266c1c2ba247005 Mon Sep 17 00:00:00 2001 From: Arielle R Munters <45485665+elleira@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:20:52 +0200 Subject: [PATCH] feat: add pindel artifact annotation and filter (#71) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: add pipeline version and restructure software versions * feat: hydra genetics 3.0.0 to include software versions into multiqc * refactor: remove missed comment on min_version() * style: make snakefmt happy * feat: add background tsv file from refereces module * feat: add background annotation to snv_indels vcf-file * test: add missing background in test config * chore: update cnv_sv module to include type in pindel vcf samplename * refactor: move all pindel processing rules into one snakefile * feat: create an artifact tsv-file in reference pipeline * fix: add samplename in dict, e.g. more than one del same pos * feat: add artifact annotation to pindel vcf * feat: add artifact filtering to pindel vcf * fix: broken svdb-merge-vcf when using priority flag * docs: slight rewording and formatting --------- Co-authored-by: Niklas Mähler <2573608+maehler@users.noreply.github.com> --- .tests/integration/config/config.yaml | 1 + config/config.yaml | 1 + config/config_soft_filter_pindel.yaml | 15 ++- config/output_files_references.yaml | 4 + docs/softwares.md | 101 ++++++++++++++++- workflow/Snakefile | 10 +- workflow/Snakefile_references.smk | 1 + workflow/rules/annotation_vep_pindel.smk | 37 ------- workflow/rules/common.smk | 24 +++- workflow/rules/common_references.smk | 2 + workflow/rules/fix_af_pindel.smk | 33 ------ workflow/rules/pindel_processing.smk | 102 +++++++++++++++++ workflow/rules/reference_rules.smk | 34 ++++++ workflow/rules/svdb.smk | 40 +++++++ workflow/schemas/config.schema.yaml | 103 +++++++++++++----- .../schemas/config_references.schema.yaml | 14 +++ workflow/schemas/resources.schema.yaml | 64 ++++++++++- workflow/schemas/rules.schema.yaml | 91 +++++++++++++++- .../scripts/create_artifact_file_pindel.py | 43 ++++++++ ...sh => pindel_processing_annotation_vep.sh} | 0 .../pindel_processing_artifact_annotation.py | 63 +++++++++++ ..._pindel.py => pindel_processing_fix_af.py} | 0 22 files changed, 661 insertions(+), 122 deletions(-) delete mode 100644 workflow/rules/annotation_vep_pindel.smk delete mode 100644 workflow/rules/fix_af_pindel.smk create mode 100644 workflow/rules/pindel_processing.smk create mode 100644 workflow/rules/reference_rules.smk create mode 100644 workflow/rules/svdb.smk create mode 100644 workflow/scripts/create_artifact_file_pindel.py rename workflow/scripts/{annotation_vep_pindel.sh => pindel_processing_annotation_vep.sh} (100%) create mode 100644 workflow/scripts/pindel_processing_artifact_annotation.py rename workflow/scripts/{fix_af_pindel.py => pindel_processing_fix_af.py} (100%) diff --git a/.tests/integration/config/config.yaml b/.tests/integration/config/config.yaml index 2dcbc6a..14c481d 100644 --- a/.tests/integration/config/config.yaml +++ b/.tests/integration/config/config.yaml @@ -7,6 +7,7 @@ reference: design_bed: "data/bed/design.bed" design_intervals_gatk_cnv: "data/bed/design.bed" artifacts: "reference/artifact_panel.tsv" + artifacts_pindel: "reference/artifact_panel.tsv" background: "reference/background_panel.tsv" bwa_mem: diff --git a/config/config.yaml b/config/config.yaml index fb0f77a..dd9ceb4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -16,6 +16,7 @@ reference: 
design_intervals: "/projects/wp2/nobackup/Twist_Myeloid/Bed_files/Twist_myeloid_v.1.1_padd6_201126.sorted.intervals" design_intervals_gatk_cnv: "" artifacts: "FILL_ME_IN.tsv" + artifacts_pindel: "FILL_ME_IN.tsv" background: "FILL_ME_IN.tsv" skip_chrs: - "chrM" diff --git a/config/config_soft_filter_pindel.yaml b/config/config_soft_filter_pindel.yaml index a0966bc..32f704c 100644 --- a/config/config_soft_filter_pindel.yaml +++ b/config/config_soft_filter_pindel.yaml @@ -1,4 +1,9 @@ filters: + vaf: + description: "Soft filter variants with low vaf (AF lower than 0.01)" + expression: "(INFO:AF:0 < 0.01)" + soft_filter_flag: "AF_0.01" + soft_filter: "True" depth: description: "Soft filter on depth lower than 200" expression: "FORMAT:DP < 200" @@ -9,11 +14,6 @@ filters: expression: "(FORMAT:AD:1 < 5)" soft_filter_flag: "AD_5" soft_filter: "True" - vaf: - description: "Soft filter variants with low vaf (AF lower than 0.01)" - expression: "(INFO:AF:0 < 0.01)" - soft_filter_flag: "AF_0.01" - soft_filter: "True" intron: description: "Soft filter intronic variants except if also splice, in cosmic, or in gata2 or tert genes" expression: "(exist[intron_variant, VEP:Consequence] and !exist[splice, VEP:Consequence] and VEP:SYMBOL != TERT and VEP:SYMBOL != GATA2 and !exist[COSV[0-9]+, VEP:Existing_variation])" @@ -24,3 +24,8 @@ filters: expression: "(VEP:MAX_AF > 0.02)" soft_filter_flag: "PopAF_0.02" soft_filter: "True" + artifacts: + description: "Soft filter position that occurs in 4 or more normal samples and AF is less than 5 sd from median af in normalpool" + expression: "(INFO:Artifact > 3 and INFO:ArtifactNrSD < 5)" + soft_filter_flag: "Artifact_gt_3" + soft_filter: "True" \ No newline at end of file diff --git a/config/output_files_references.yaml b/config/output_files_references.yaml index 8b3c064..d85e99c 100644 --- a/config/output_files_references.yaml +++ b/config/output_files_references.yaml @@ -32,6 +32,10 @@ files: input: references/create_artifact_file/artifact_panel.tsv output: artifact_panel.tsv + - name: artifacts pindel tsv-file + input: references/create_artifact_file_pindel/artifact_panel.tsv + output: artifact_panel_pindel.tsv + - name: background tsv-file input: "references/create_background_file/background_panel.tsv" output: background_panel.tsv diff --git a/docs/softwares.md b/docs/softwares.md index 916c66a..6145057 100644 --- a/docs/softwares.md +++ b/docs/softwares.md @@ -1,23 +1,112 @@ # Software used in Poppy +Rules specifically for Poppy listed here. -## [annotation_vep_pindel](https://www.ensembl.org/info/docs/tools/vep/index.html) -Since pindel is run on limited region it does not always produce results, if an empty vcf-file is used with VEP it will fail and the entire pipeline will stop, therefor a specific rule is needed to ensure there are variants in the pindel vcf before annotating the vcf. If no variants are found the empty vcf file is just copied to the output. +## pindel_processing.smk +[Pindel](http://gmt.genome.wustl.edu/packages/pindel/) creates an older type of VCF and therefore has to be processed slightly different than more modern VCFs. Here we add the AF and DP fields to the VCF INFO column, annotate the calls using [vep](https://www.ensembl.org/info/docs/tools/vep/index.html) and add artifact annotation based an on artifact panel created with the reference pipeline. 
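The AF/DP step is easiest to see in isolation. The renamed `workflow/scripts/pindel_processing_fix_af.py` is not shown in this diff, so the sketch below is only an illustration of the described step, under the assumption that the pindel VCF has a single sample whose `FORMAT/AD` holds the reference and variant read counts; the function name `add_af_dp_to_info` is made up for the example.

```python
# Illustrative sketch, not the script shipped in this PR: derive AF and DP from the
# single sample's FORMAT/AD and write them to INFO (assumes AF/DP are not already
# declared in the input header).
from pysam import VariantFile


def add_af_dp_to_info(in_vcf_filename, out_vcf_filename):
    in_vcf = VariantFile(in_vcf_filename)
    new_header = in_vcf.header
    new_header.info.add("AF", "A", "Float", "Allele frequency calculated from FORMAT/AD")
    new_header.info.add("DP", "1", "Integer", "Total read depth calculated from FORMAT/AD")
    out_vcf = VariantFile(out_vcf_filename, "w", header=new_header)
    for record in in_vcf:
        ad = record.samples[0]["AD"]  # (ref_reads, alt_reads, ...) reported by pindel
        dp = sum(ad)
        record.info["DP"] = dp
        record.info["AF"] = tuple(alt / dp if dp > 0 else 0.0 for alt in ad[1:])
        out_vcf.write(record)
    out_vcf.close()
```

In the workflow this corresponds to the `pindel_processing_fix_af` rule, which reads `cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vcf` and writes `cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf`.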
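The artifact annotation (`workflow/scripts/pindel_processing_artifact_annotation.py` in this PR) adds three INFO fields per call: `Artifact` (number of normal-panel samples with the same chromosome, position and SVTYPE), `ArtifactMedian` (the median AF in the panel) and `ArtifactNrSD` (the distance between the sample AF and the panel median, expressed in panel standard deviations). A worked example with made-up numbers, combined with the condition used by the new `artifacts` soft filter in `config_soft_filter_pindel.yaml`:

```python
# Worked example of the ArtifactNrSD arithmetic from
# workflow/scripts/pindel_processing_artifact_annotation.py; all numbers are made up.
panel_median_af = 0.04  # ArtifactMedian: median AF for this chrom_pos_SVTYPE in the normal panel
panel_sd_af = 0.01      # sd_MAF from the artifact panel (1000 is used as a sentinel when < 4 samples)
observations = 6        # Artifact: number of normal-panel samples carrying this call
sample_af = 0.05        # AF of the call in the patient sample

nr_sd = (sample_af - panel_median_af) / panel_sd_af  # about 1 SD above the panel median

# The new "artifacts" soft filter, (INFO:Artifact > 3 and INFO:ArtifactNrSD < 5), flags
# calls seen in four or more normals whose AF stays close to the panel median:
is_flagged = observations > 3 and nr_sd < 5  # True, so FILTER gets the Artifact_gt_3 flag
print(round(nr_sd, 2), is_flagged)
```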
+ + + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__pindel_processing__pindel_processing_annotation_vep# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__pindel_processing__pindel_processing_annotation_vep# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__pindel_processing_annotation_vep# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__pindel_processing_annotation_vep# + + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__pindel_processing__pindel_processing_fix_af# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__pindel_processing__pindel_processing_fix_af# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__pindel_processing_fix_af# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__pindel_processing_fix_af# + + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__pindel_processing__pindel_processing_artifact_annotation# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__pindel_processing__pindel_processing_artifact_annotation# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__pindel_processing_artifact_annotation# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__pindel_processing_artifact_annotation# + + +## [svdb](https://github.com/J35P312/SVDB).smk +When running `svdb --merge` with the priority flag set, svdb cuts off the FORMAT column for cnvkit variants ([git issue]()). We therefore use a non-Hydra Genetics rule for the `svdb --merge` command. ### :snake: Rule -#SNAKEMAKE_RULE_SOURCE__annotation_vep_pindel__annotation_vep_pindel# +#SNAKEMAKE_RULE_SOURCE__svdb__svdb_merge_wo_priority# #### :left_right_arrow: input / output files -#SNAKEMAKE_RULE_TABLE__annotation_vep_pindel__annotation_vep_pindel# +#SNAKEMAKE_RULE_TABLE__svdb__svdb_merge_wo_priority# ### :wrench: Configuration #### Software settings (`config.yaml`) -#CONFIGSCHEMA__annotation_vep_pindel# +#CONFIGSCHEMA__svdb_merge# #### Resources settings (`resources.yaml`) -#RESOURCESSCHEMA__annotation_vep_pindel# +#RESOURCESSCHEMA__svdb_merge# + + +--- + +## reference_rules.smk +Software used specifically to create the reference files for Poppy. 
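The artifact panel consumed above is a plain TSV built by `reference_rules_create_artifact_file_pindel` (script `workflow/scripts/create_artifact_file_pindel.py` in this PR): calls from all normal-panel VCFs are grouped on `chromosome_position_SVTYPE`, and each key gets the median AF, the AF standard deviation (only computed when four or more samples carry the call, otherwise the sentinel value 1000) and the number of distinct samples. A condensed sketch of that aggregation on toy data (the shipped script reads the VCFs with pysam instead):

```python
# Condensed sketch of the aggregation in workflow/scripts/create_artifact_file_pindel.py;
# the toy tuples below stand in for records read from the normal-panel pindel VCFs.
import statistics
from collections import defaultdict

# (chrom, pos, SVTYPE, AF, samplename) for every pindel call in the normal panel
calls = [
    ("chr1", 10100, "DEL", 0.03, "normal1"),
    ("chr1", 10100, "DEL", 0.05, "normal2"),
    ("chr1", 10100, "DEL", 0.04, "normal3"),
    ("chr1", 10100, "DEL", 0.06, "normal4"),
    ("chr2", 20200, "INS", 0.02, "normal1"),
]

panel = defaultdict(lambda: ([], set()))
for chrom, pos, svtype, af, sample in calls:
    afs, samples = panel[f"{chrom}_{pos}_{svtype}"]
    afs.append(af)
    samples.add(sample)

print("Chromosome\tpos\tSV_type\tmedian_MAF\tsd_MAF\tnr_obs")
for key, (afs, samples) in panel.items():
    chrom, pos, svtype = key.split("_")
    sd = statistics.stdev(afs) if len(samples) >= 4 else 1000  # 1000 flags too few observations
    print(f"{chrom}\t{pos}\t{svtype}\t{statistics.median(afs)}\t{sd}\t{len(samples)}")
```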
+ +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__reference_rules__reference_rules_create_artefact_file_pindel# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__reference_rules__reference_rules_create_artifact_file_pindel# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__reference_rules_create_artifact_file_pindel# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__reference_rules_create_artefact_file_pindel# + + diff --git a/workflow/Snakefile b/workflow/Snakefile index 7b0f313..6e41ffe 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -7,12 +7,14 @@ __email__ = "arielle.munters@scilifelab.uu.se" __license__ = "GPL-3" -ruleorder: annotation_vep_pindel > annotation_vep +ruleorder: pindel_processing_annotation_vep > annotation_vep +ruleorder: pindel_processing_artifact_annotation > annotation_artifact_annotation +ruleorder: svdb_merge_wo_priority > cnv_sv_svdb_merge include: "rules/common.smk" -include: "rules/fix_af_pindel.smk" -include: "rules/annotation_vep_pindel.smk" +include: "rules/svdb.smk" +include: "rules/pindel_processing.smk" report: "report/workflow.rst" @@ -53,7 +55,7 @@ use rule * from annotation as annotation_* module cnv_sv: snakefile: - github("hydra-genetics/cnv_sv", path="workflow/Snakefile", tag="b549266") + github("hydra-genetics/cnv_sv", path="workflow/Snakefile", tag="1aa9a68") config: config diff --git a/workflow/Snakefile_references.smk b/workflow/Snakefile_references.smk index 119a666..eed301f 100644 --- a/workflow/Snakefile_references.smk +++ b/workflow/Snakefile_references.smk @@ -1,4 +1,5 @@ include: "rules/common_references.smk" +include: "rules/reference_rules.smk" rule all: diff --git a/workflow/rules/annotation_vep_pindel.smk b/workflow/rules/annotation_vep_pindel.smk deleted file mode 100644 index 9449a57..0000000 --- a/workflow/rules/annotation_vep_pindel.smk +++ /dev/null @@ -1,37 +0,0 @@ -__author__ = "Arielle R. Munters" -__copyright__ = "Copyright 2024, Arielle R. 
Munters" -__email__ = "arielle.munters@scilifelab.uu.se" -__license__ = "GPL-3" - - -rule annotation_vep_pindel: - input: - cache=config.get("vep", {}).get("vep_cache", ""), - fasta=config["reference"]["fasta"], - tabix="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.normalized.vcf.gz.tbi", - vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.normalized.vcf.gz", - output: - vcf=temp("cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf"), - params: - extra=config.get("vep", {}).get("extra", "--pick"), - mode=config.get("vep", {}).get("mode", "--offline --cache --merged "), - log: - "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.log", - benchmark: - repeat( - "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.benchmark.tsv", - config.get("vep", {}).get("benchmark_repeats", 1), - ) - threads: config.get("vep", {}).get("threads", config["default_resources"]["threads"]) - resources: - mem_mb=config.get("vep", {}).get("mem_mb", config["default_resources"]["mem_mb"]), - mem_per_cpu=config.get("vep", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), - partition=config.get("vep", {}).get("partition", config["default_resources"]["partition"]), - threads=config.get("vep", {}).get("threads", config["default_resources"]["threads"]), - time=config.get("vep", {}).get("time", config["default_resources"]["time"]), - container: - config.get("vep", {}).get("container", config["default_container"]) - message: - "{rule}: vep annotate {input.vcf}" - script: - "../scripts/annotation_vep_pindel.sh" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 6040a44..570533b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -86,10 +86,10 @@ config = load_resources(config, config["resources"]) validate(config, schema="../schemas/resources.schema.yaml") ### Read and validate samples file - samples = pd.read_table(config["samples"], comment="#").set_index("sample", drop=False) validate(samples, schema="../schemas/samples.schema.yaml") + ### Read and validate units file units = ( pandas.read_table(config["units"], dtype=str, comment="#") @@ -109,9 +109,8 @@ with open(config["output"], "r") as f: output_spec = yaml.safe_load(f.read()) validate(output_spec, schema="../schemas/output_files.schema.yaml", set_default=True) -### Set wildcard constraints - +### Set wildcard constraints wildcard_constraints: barcode="[A-Z+]+", chr="[^_]+", @@ -121,4 +120,23 @@ wildcard_constraints: type="N|T|R", +def get_vcfs_for_svdb_merge(wildcards, add_suffix=False): + vcf_dict = {} + for v in config.get("svdb_merge", {}).get("tc_method"): + tc_method = v["name"] + callers = v["cnv_caller"] + for caller in callers: + if add_suffix: + caller_suffix = f":{caller}" + else: + caller_suffix = "" + if tc_method in vcf_dict: + vcf_dict[tc_method].append( + f"cnv_sv/{caller}_vcf/{wildcards.sample}_{wildcards.type}.{tc_method}.vcf{caller_suffix}" + ) + else: + vcf_dict[tc_method] = [f"cnv_sv/{caller}_vcf/{wildcards.sample}_{wildcards.type}.{tc_method}.vcf{caller_suffix}"] + return vcf_dict[wildcards.tc_method] + + generate_copy_rules(output_spec) diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk index 0b66a77..9244820 100644 --- a/workflow/rules/common_references.smk +++ b/workflow/rules/common_references.smk @@ -8,6 +8,8 @@ import yaml from hydra_genetics.utils.resources import load_resources from hydra_genetics import min_version as hydra_min_version from hydra_genetics.utils.misc import replace_dict_variables +from 
hydra_genetics.utils.samples import * +from hydra_genetics.utils.units import * include: "results.smk" diff --git a/workflow/rules/fix_af_pindel.smk b/workflow/rules/fix_af_pindel.smk deleted file mode 100644 index 4b07a39..0000000 --- a/workflow/rules/fix_af_pindel.smk +++ /dev/null @@ -1,33 +0,0 @@ -__author__ = "Arielle R. Munters" -__copyright__ = "Copyright 2024, Arielle R. Munters" -__email__ = "arielle.munters@scilifelab.uu.se" -__license__ = "GPL-3" - - -rule fix_af_pindel: - input: - vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vcf", - output: - vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf", - params: - extra=config.get("fix_af_pindel", {}).get("extra", ""), - log: - "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf.log", - benchmark: - repeat( - "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf.benchmark.tsv", - config.get("fix_af_pindel", {}).get("benchmark_repeats", 1), - ) - threads: config.get("fix_af_pindel", {}).get("threads", config["default_resources"]["threads"]) - resources: - mem_mb=config.get("fix_af_pindel", {}).get("mem_mb", config["default_resources"]["mem_mb"]), - mem_per_cpu=config.get("fix_af_pindel", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), - partition=config.get("fix_af_pindel", {}).get("partition", config["default_resources"]["partition"]), - threads=config.get("fix_af_pindel", {}).get("threads", config["default_resources"]["threads"]), - time=config.get("fix_af_pindel", {}).get("time", config["default_resources"]["time"]), - container: - config.get("fix_af_pindel", {}).get("container", config["default_container"]) - message: - "{rule}: add af and dp to info field in {input.vcf}" - script: - "../scripts/fix_af_pindel.py" diff --git a/workflow/rules/pindel_processing.smk b/workflow/rules/pindel_processing.smk new file mode 100644 index 0000000..61f2e64 --- /dev/null +++ b/workflow/rules/pindel_processing.smk @@ -0,0 +1,102 @@ +__author__ = "Arielle R. Munters" +__copyright__ = "Copyright 2024, Arielle R. 
Munters" +__email__ = "arielle.munters@scilifelab.uu.se" +__license__ = "GPL-3" + + +rule pindel_processing_annotation_vep: + input: + cache=config.get("vep", {}).get("vep_cache", ""), + fasta=config["reference"]["fasta"], + tabix="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.normalized.vcf.gz.tbi", + vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.normalized.vcf.gz", + output: + vcf=temp("cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf"), + params: + extra=config.get("vep", {}).get("extra", "--pick"), + mode=config.get("vep", {}).get("mode", "--offline --cache --merged "), + log: + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.log", + benchmark: + repeat( + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.benchmark.tsv", + config.get("vep", {}).get("benchmark_repeats", 1), + ) + threads: config.get("vep", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("vep", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("vep", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("vep", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("vep", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("vep", {}).get("time", config["default_resources"]["time"]), + container: + config.get("vep", {}).get("container", config["default_container"]) + message: + "{rule}: vep annotate {input.vcf}" + script: + "../scripts/pindel_processing_annotation_vep.sh" + + +rule pindel_processing_artifact_annotation: + input: + vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.gz", + tbi="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.vcf.gz.tbi", + artifacts=config["reference"]["artifacts_pindel"], + output: + vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.artifact_annotated.vcf", + params: + extra=config.get("pindel_processing_artifact_annotation", {}).get("extra", ""), + log: + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.artifact_annotated.vcf.log", + benchmark: + repeat( + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vep_annotated.artifact_annotated.vcf.benchmark.tsv", + config.get("pindel_processing_artifact_annotation", {}).get("benchmark_repeats", 1), + ) + threads: config.get("pindel_processing_artifact_annotation", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("pindel_processing_artifact_annotation", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("pindel_processing_artifact_annotation", {}).get( + "mem_per_cpu", config["default_resources"]["mem_per_cpu"] + ), + partition=config.get("pindel_processing_artifact_annotation", {}).get( + "partition", config["default_resources"]["partition"] + ), + threads=config.get("pindel_processing_artifact_annotation", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("pindel_processing_artifact_annotation", {}).get("time", config["default_resources"]["time"]), + container: + config.get("pindel_processing_artifact_annotation", {}).get("container", config["default_container"]) + message: + "{rule}: add artifact annotation on {input.vcf}, based on arifact_panel_pindel.tsv " + localrule: True + script: + "../scripts/pindel_processing_artifact_annotation.py" + + +rule pindel_processing_fix_af: + input: + vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.vcf", + output: + vcf="cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf", + params: + 
extra=config.get("pindel_processing_fix_af", {}).get("extra", ""), + log: + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf.log", + benchmark: + repeat( + "cnv_sv/pindel_vcf/{sample}_{type}.no_tc.fix_af.vcf.benchmark.tsv", + config.get("pindel_processing_fix_af", {}).get("benchmark_repeats", 1), + ) + threads: config.get("pindel_processing_fix_af", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("pindel_processing_fix_af", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("pindel_processing_fix_af", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("pindel_processing_fix_af", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("pindel_processing_fix_af", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("pindel_processing_fix_af", {}).get("time", config["default_resources"]["time"]), + container: + config.get("pindel_processing_fix_af", {}).get("container", config["default_container"]) + message: + "{rule}: add af and dp to info field in {input.vcf}" + script: + "../scripts/pindel_processing_fix_af.py" diff --git a/workflow/rules/reference_rules.smk b/workflow/rules/reference_rules.smk new file mode 100644 index 0000000..94646f7 --- /dev/null +++ b/workflow/rules/reference_rules.smk @@ -0,0 +1,34 @@ +__author__ = "Arielle R. Munters" +__copyright__ = "Copyright 2024, Arielle R. Munters" +__email__ = "arielle.munters@scilifelab.uu.se" +__license__ = "GPL-3" + + +rule reference_rules_create_artifact_file_pindel: + input: + vcfs=set([f"cnv_sv/pindel_vcf/{t.sample}_{t.type}.no_tc.vep_annotated.vcf.gz" for t in units.itertuples()]), + tbis=set([f"cnv_sv/pindel_vcf/{t.sample}_{t.type}.no_tc.vep_annotated.vcf.gz.tbi" for t in units.itertuples()]), + output: + artifact_panel=temp("references/create_artifact_file_pindel/artifact_panel.tsv"), + params: + extra=config.get("create_artifact_file_pindel", {}).get("extra", ""), + log: + "references/create_artifact_file_pindel/artifact_panel.tsv.log", + benchmark: + repeat( + "references/create_artifact_file_pindel/artifact_panel.tsv.benchmark.tsv", + config.get("create_artifact_file_pindel", {}).get("benchmark_repeats", 1), + ) + threads: config.get("create_artifact_file_pindel", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("create_artifact_file_pindel", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("create_artifact_file_pindel", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("create_artifact_file_pindel", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("create_artifact_file_pindel", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("create_artifact_file_pindel", {}).get("time", config["default_resources"]["time"]), + container: + config.get("create_artifact_file_pindel", {}).get("container", config["default_container"]) + message: + "{rule}: create artifact PoN for pindel" + script: + "../scripts/create_artifact_file_pindel.py" diff --git a/workflow/rules/svdb.smk b/workflow/rules/svdb.smk new file mode 100644 index 0000000..6d6c3d3 --- /dev/null +++ b/workflow/rules/svdb.smk @@ -0,0 +1,40 @@ +__author__ = "Arielle R. Munters" +__copyright__ = "Copyright 2024, Arielle R. 
Munters" +__email__ = "arielle.munters@scilifelab.uu.se" +__license__ = "GPL-3" + + +rule svdb_merge_wo_priority: + input: + vcfs=get_vcfs_for_svdb_merge, + output: + vcf=temp("cnv_sv/svdb_merge/{sample}_{type}.{tc_method}.merged.vcf"), + params: + extra=config.get("svdb_merge", {}).get("extra", ""), + overlap=config.get("svdb_merge", {}).get("overlap", 0.6), + bnd_distance=config.get("svdb_merge", {}).get("bnd_distance", 10000), + log: + "cnv_sv/svdb_merge/{sample}_{type}.{tc_method}.merged.vcf.log", + benchmark: + repeat( + "cnv_sv/svdb_merge/{sample}_{type}.{tc_method}.merged.benchmark.tsv", + config.get("svdb_merge", {}).get("benchmark_repeats", 1), + ) + threads: config.get("svdb_merge", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("svdb_merge", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("svdb_merge", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("svdb_merge", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("svdb_merge", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("svdb_merge", {}).get("time", config["default_resources"]["time"]), + container: + config.get("svdb_merge", {}).get("container", config["default_container"]) + message: + "{rule}: merges vcf files from different cnv callers into {output.vcf}" + shell: + "(svdb --merge " + "--vcf {input.vcfs} " + "--bnd_distance {params.bnd_distance} " + "--overlap {params.overlap} " + "{params.extra} " + "> {output.vcf}) 2> {log}" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index e4fc4a7..a1eefb6 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -21,6 +21,16 @@ properties: Path to resources.yaml file format: uri-reference + pacbio_alignment: + type: boolean + description: if pacbio_alignment is used set to true + default: false + + ont_alignment: + type: boolean + description: if ont_alignment should be used set to true + default: false + reference: type: object description: > @@ -75,6 +85,12 @@ properties: tsv-file with artifact freq in normal samples, created with reference pipeline format: uri-reference + artifacts_pindel: + type: string + description: > + tsv-file with artifact freq for pindel calls in normal samples, created with reference pipeline + format: uri-reference + background: type: string description: tsv-file with background freq in normal samples, created with reference pipeline. 
@@ -87,6 +103,7 @@ properties: - design_intervals - design_intervals_gatk_cnv - artifacts + - artifacts_pindel - background samples: @@ -611,10 +628,66 @@ properties: required: - container + pindel_processing_annotation_vep: + type: object + description: parameters for annotation with vep on pindel vcf, most taken from vep rule to ensure that pindel result is processed identical to snvs + properties: + container: + type: string + description: > + From config["vep"]: Name of path to container containing the vep executable + format: uri-reference + vep_cache: + type: string + description: > + From config["vep"]: Path to offline VEP cache + format: uri-reference + mode: + type: string + description: > + From config["vep"]: VEP arguments for run mode + examples: + - "--offline --cache" + extra: + type: string + description: > + From config["vep"]: Additional command line arguments for VEP + benchmark_repeats: + type: integer + description: 'From config["vep"]: set number of times benchmark should be repeated' + + pindel_processing_artifact_annotation: + type: object + description: parameters for pindel_processing_artifact_annotation to add artifact annotation to pindel vcf + properties: + benchmark_repeats: + type: integer + description: set number of times benchmark should be repeated + container: + type: string + description: name or path to docker/singularity container + extra: + type: string + description: parameters that should be forwarded + + pindel_processing_fix_af: + type: object + description: parameters for pindel_processing_fix_af to add AF and DP values in INFO field of vcf + properties: + benchmark_repeats: + type: integer + description: set number of times benchmark should be repeated + container: + type: string + description: name or path to docker/singularity container + extra: + type: string + description: parameters that should be forwarded + svdb_merge: type: object description: > - Configuration for the svdb_merge rule + Configuration for the svdb_merge_wo_priority rule properties: container: type: string @@ -755,34 +828,6 @@ properties: required: - container - annotation_vep_pindel: - type: object - description: parameters for annotation with vep on pindel vcf, most taken from vep rule to ensure that pindel result is processed identical to snvs - properties: - container: - type: string - description: > - From config["vep"]: Name of path to container containing the vep executable - format: uri-reference - vep_cache: - type: string - description: > - From config["vep"]: Path to offline VEP cache - format: uri-reference - mode: - type: string - description: > - From config["vep"]: VEP arguments for run mode - examples: - - "--offline --cache" - extra: - type: string - description: > - From config["vep"]: Additional command line arguments for VEP - benchmark_repeats: - type: integer - description: 'From config["vep"]: set number of times benchmark should be repeated' - required: - bcbio_variation_recall_ensemble - bwa_mem diff --git a/workflow/schemas/config_references.schema.yaml b/workflow/schemas/config_references.schema.yaml index 914fa55..abf67d5 100644 --- a/workflow/schemas/config_references.schema.yaml +++ b/workflow/schemas/config_references.schema.yaml @@ -112,6 +112,20 @@ properties: required: - mappability + reference_rules_create_artifact_file_pindel: + type: object + description: parameters for reference_rules_create_artifact_file_pindel + properties: + benchmark_repeats: + type: integer + description: set number of times benchmark should be repeated + 
container: + type: string + description: name or path to docker/singularity container + extra: + type: string + description: parameters that should be forwarded + resources_references: type: string description: > diff --git a/workflow/schemas/resources.schema.yaml b/workflow/schemas/resources.schema.yaml index 69e2823..7b390b6 100644 --- a/workflow/schemas/resources.schema.yaml +++ b/workflow/schemas/resources.schema.yaml @@ -24,9 +24,69 @@ properties: required: - default_resources - annotation_vep_pindel: + pindel_processing_annotation_vep: type: object - description: resource definitions for annotation_vep_pindel + description: resource definitions for pindel_processing_annotation_vep, all resources taken from config["vep"] + properties: + mem_mb: + type: integer + description: max memory in MB to be available + mem_per_cpu: + type: integer + description: memory in MB used per cpu + partition: + type: string + description: partition to use on cluster + threads: + type: integer + description: number of threads to be available + time: + type: string + description: max execution time + + pindel_processing_artifact_annotation: + type: object + description: resource definitions for pindel_processing_artifact_annotation + properties: + mem_mb: + type: integer + description: max memory in MB to be available + mem_per_cpu: + type: integer + description: memory in MB used per cpu + partition: + type: string + description: partition to use on cluster + threads: + type: integer + description: number of threads to be available + time: + type: string + description: max execution time + + pindel_processing_fix_af: + type: object + description: resource definitions for pindel_processing_fix_af + properties: + mem_mb: + type: integer + description: max memory in MB to be available + mem_per_cpu: + type: integer + description: memory in MB used per cpu + partition: + type: string + description: partition to use on cluster + threads: + type: integer + description: number of threads to be available + time: + type: string + description: max execution time + + svdb_merge: + type: object + description: resource definitions for svdb_merge_wo_priority properties: mem_mb: type: integer diff --git a/workflow/schemas/rules.schema.yaml b/workflow/schemas/rules.schema.yaml index aa349c0..362e08d 100644 --- a/workflow/schemas/rules.schema.yaml +++ b/workflow/schemas/rules.schema.yaml @@ -2,9 +2,9 @@ $schema: "http://json-schema.org/draft-04/schema#" description: snakemake rule input and output files description file type: object properties: - annotation_vep_pindel: + pindel_processing_annotation_vep: type: object - description: input and output parameters for vep + description: input and output parameters for vep annotation of pindel vcf properties: input: type: object @@ -12,7 +12,7 @@ properties: properties: cache: type: string - description: path to vep cache directory + description: path to vep cache directory from config["vep"]["vep_cache"] fasta: type: string description: path to fasta reference genome @@ -29,3 +29,88 @@ properties: vcf: type: string description: annotated (or incase of empty just copied) vcf file + + pindel_processing_artifact_annotation: + type: object + description: input and output parameters for pindel_processing_artifact_annotation + properties: + input: + type: object + description: list of inputs + properties: + vcf: + type: string + description: gzipped vcf to be artifact annotated + tbi: + type: string + description: tbi index to input.vcf + artifacts: + type: string + description: 
tsv file with artifact pindel calls, created in reference pipeline + output: + type: object + description: list of outputs + properties: + vcf: + type: string + description: vcf with artifact annotation + + pindel_processing_fix_af: + type: object + description: input and output parameters for pindel_processing_fix_af + properties: + input: + type: object + description: list of inputs + properties: + vcf: + type: string + description: vcf where AF and DP is needed in INFO field + output: + type: object + description: list of outputs + properties: + vcf: + type: string + description: vcf with added AF and DP in INFO field + + reference_rules_create_artefact_file_pindel: + type: object + description: input and output parameters for reference_rules_create_artefact_file_pindel + properties: + input: + type: object + description: list of inputs + properties: + vcfs: + type: string + description: all (gzipped) vcfs to be used for artifact panel + tbis: + type: string + description: tbi index to all input vcfs + output: + type: object + description: list of outputs + properties: + artifact_panel: + type: string + description: tsv file with chr, pos, svtype, median, sd, num_obs of detected variants + + svdb_merge_wo_priority: + type: object + description: input and output parameters for svdb_merge_wo_priority + properties: + input: + type: object + description: list of inputs + properties: + vcfs: + type: string + description: a function get_vcfs_for_svdb_merge (common.smk) is used to list all files (eg. from different callers) that should be merge into a SVDB 'vcf' + output: + type: object + description: list of outputs + properties: + vcf: + type: string + description: a 'vcf' file containing the merged SV calls diff --git a/workflow/scripts/create_artifact_file_pindel.py b/workflow/scripts/create_artifact_file_pindel.py new file mode 100644 index 0000000..06dae05 --- /dev/null +++ b/workflow/scripts/create_artifact_file_pindel.py @@ -0,0 +1,43 @@ +import gzip +import statistics +from pysam import VariantFile + +vcf_files = snakemake.input.vcfs +artifact_panel = open(snakemake.output.artifact_panel, "w") + +artifact_call_dict = {} +for file_name in vcf_files: + in_vcf = VariantFile(file_name) + samplename = in_vcf.header.samples[0] + for record in in_vcf.fetch(): + key = record.contig + "_" + str(record.pos) + "_" + record.info["SVTYPE"] + af = float(record.info["AF"][0]) + + if key not in artifact_call_dict: + artifact_call_dict[key] = [[af], [samplename]] # Ar forsta 0 i jonas kod antal? call_dict[caller][key]=[0,[]] + else: + artifact_call_dict[key][0].append(af) + artifact_call_dict[key][1].append(samplename) + # vad gor FFPE_rm_dup_dict???? + + +artifact_panel.write("Chromosome\tpos\tSV_type\tmedian_MAF\tsd_MAF\tnr_obs\n") +for key in artifact_call_dict: + chrom = key.split("_")[0] + pos = key.split("_")[1] + svtype = key.split("_")[2] + + artifact_call_dict[key][0].sort() + artifact_call_dict[key][1] = list(dict.fromkeys(artifact_call_dict[key][1])) + + median_af = statistics.median(artifact_call_dict[key][0]) + sd_af = 1000 + nr_obs = len(artifact_call_dict[key][1]) # bara antal prover, varje prov kan ha flera. + if nr_obs >= 4: + """This is the sample variance s² with Bessel’s correction, also known as variance with N-1 degrees of freedom. + Provided that the data points are representative (e.g. 
independent and identically distributed), + the result should be an unbiased estimate of the true population variance.""" + sd_af = statistics.stdev(artifact_call_dict[key][0]) + artifact_panel.write( + chrom + "\t" + pos + "\t" + svtype + "\t" + str(median_af) + "\t" + str(sd_af) + "\t" + str(nr_obs) + "\n" + ) diff --git a/workflow/scripts/annotation_vep_pindel.sh b/workflow/scripts/pindel_processing_annotation_vep.sh similarity index 100% rename from workflow/scripts/annotation_vep_pindel.sh rename to workflow/scripts/pindel_processing_annotation_vep.sh diff --git a/workflow/scripts/pindel_processing_artifact_annotation.py b/workflow/scripts/pindel_processing_artifact_annotation.py new file mode 100644 index 0000000..d897706 --- /dev/null +++ b/workflow/scripts/pindel_processing_artifact_annotation.py @@ -0,0 +1,63 @@ +from pysam import VariantFile + + +def add_artifact_annotation_data(in_vcf_filename, artifacts_filename, out_vcf_filename): + + artifact_dict = {} + with open(artifacts_filename, "r") as artifacts: + next(artifacts) + for lline in artifacts: + line = lline.strip().split("\t") + chrom = line[0] + pos = line[1] + type = line[2] + median = line[3] + sd = line[4] + observations = line[5] + + artifact_dict[chrom + "_" + pos + "_" + type] = [median, sd, observations] + + # Create new vcf + in_vcf = VariantFile(in_vcf_filename) + new_header = in_vcf.header + new_header.info.add("Artifact", "1", "String", "Number of observations of variant in panel samples") + new_header.info.add("ArtifactMedian", "1", "String", "Artifact median MAFs in normal panel") + new_header.info.add("ArtifactNrSD", "1", "String", "Number of Standard Deviations from artifacts in panel median") + out_vcf = VariantFile(out_vcf_filename, "w", header=new_header) + + for record in in_vcf.fetch(): + key = record.contig + "_" + str(record.pos) + "_" + record.info["SVTYPE"] + AF = float(record.info["AF"][0]) + + Observations = 0 + Median = 0 + SD = 0 + if key in artifact_dict: + Median = artifact_dict[key][0] + SD = artifact_dict[key][1] + Observations = artifact_dict[key][2] + + record.info["Artifact"] = str(Observations) + record.info["ArtifactMedian"] = str(Median) + + nrsd = 1000 + if float(SD) == 1000.0 or float(SD) == 0.0: + if float(Median) >= AF: + nrsd = 0.0 + else: + nrsd = 1000 + else: + nrsd = (AF - float(Median)) / float(SD) + + record.info["ArtifactNrSD"] = str(nrsd) + out_vcf.write(record) + + +if __name__ == "__main__": + log = snakemake.log_fmt_shell(stdout=False, stderr=True) + + add_artifact_annotation_data( + snakemake.input.vcf, + snakemake.input.artifacts, + snakemake.output.vcf, + ) diff --git a/workflow/scripts/fix_af_pindel.py b/workflow/scripts/pindel_processing_fix_af.py similarity index 100% rename from workflow/scripts/fix_af_pindel.py rename to workflow/scripts/pindel_processing_fix_af.py
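For the `svdb_merge_wo_priority` rule added by this patch, the input VCF list comes from `get_vcfs_for_svdb_merge` in `workflow/rules/common.smk`. The snippet below is a standalone illustration of how the `svdb_merge.tc_method` configuration maps to the merged file names; the caller names, tc_method and sample used here are examples only, not necessarily Poppy's defaults.

```python
# Standalone illustration of the path construction done by get_vcfs_for_svdb_merge
# (workflow/rules/common.smk); the config values below are examples only.
config = {
    "svdb_merge": {
        "tc_method": [
            {"name": "pathology", "cnv_caller": ["cnvkit", "gatk_cnv"]},
        ]
    }
}

sample, sample_type, tc_method = "NA12878", "T", "pathology"

vcfs = [
    f"cnv_sv/{caller}_vcf/{sample}_{sample_type}.{method['name']}.vcf"
    for method in config["svdb_merge"]["tc_method"]
    if method["name"] == tc_method
    for caller in method["cnv_caller"]
]
print(vcfs)
# ['cnv_sv/cnvkit_vcf/NA12878_T.pathology.vcf', 'cnv_sv/gatk_cnv_vcf/NA12878_T.pathology.vcf']
# svdb_merge_wo_priority then runs `svdb --merge` on these files without the --priority
# flag, which avoids the truncated FORMAT column seen for cnvkit calls when priority is used.
```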