From de7069962888037b6836494b8e9433dfc3d07cf0 Mon Sep 17 00:00:00 2001
From: sahays
Date: Fri, 3 May 2024 17:53:32 +0200
Subject: [PATCH 01/11] Added support for demultiplexed samples with UMI
 barcodes, and for extracting consensus metrics

---
 README.md | 29 +-
 config/config.yaml | 28 ++
 config/test_config_exliquid.yaml | 53 ++++
 dag.pdf | Bin
 profile/config.yaml | 10 +-
 workflow/Snakefile | 25 +-
 workflow/envs/consensus.yaml | 6 +
 workflow/rules/adapter_trimming.smk | 167 +++++-----
 workflow/rules/alignment.smk | 158 ++++++++--
 workflow/rules/common.smk | 286 +++++++++++++-----
 workflow/rules/create_links.smk | 25 +-
 workflow/rules/duplicate_marking.smk | 17 +-
 workflow/rules/flagstatt.smk | 20 --
 workflow/rules/metric_hsmetrics.smk | 2 +-
 workflow/rules/recalibration.smk | 4 +-
 ...{umi_consensus.smk => umi_based_rules.smk} | 78 +++--
 workflow/scripts/getConsensusMetrics.py | 24 ++
 17 files changed, 669 insertions(+), 263 deletions(-)
 create mode 100644 config/test_config_exliquid.yaml
 create mode 100644 dag.pdf
 create mode 100644 workflow/envs/consensus.yaml
 rename workflow/rules/{umi_consensus.smk => umi_based_rules.smk} (66%)
 create mode 100644 workflow/scripts/getConsensusMetrics.py

diff --git a/README.md b/README.md
index 428ca3f..0943a3c 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The pipeline was tested and supported by Daniel Steiert.
 
 The pipeline is made for aligning UMI based WGS/WES and Panel Seq data and to compute the QC metrics associated with it.
 
-We require that sequencing is performed in paired-end mode and must contain R1 (forward read) R2 (UMI) and R3 (reverse read) for each lane and run
+We require that sequencing is performed in paired-end mode
 
 # Prerequisites
 
@@ -91,11 +91,36 @@ Please modify the entry for
 
 16. `dict_genome`: An absolute path to dict file for the given genomes, ignored if `SeqType` is `WGS`
 
+17. `group_allowed_edits`: Number of edits allowed when grouping reads by UMI. Defaults to 0; should be set to 0 if `correct_umi` is true
+18. `group_min_mapq`: Sets the `--min-map-q` of GroupReadsByUmi. Defaults to 20
+19. `group_strategy`: Sets the `--strategy` param of GroupReadsByUmi. Defaults to `Adjacency`
+
+20. `consensus_min_reads`: 1
+21. `consensus_min_base_qual`: 2
+22. `consensus_min_input_base_mapq`: 10
+23. `consensus_error_rate_pre_umi`: 45
+24. `consensus_error_rate_post_umi`: 30
+
+25. `filter_min_reads`: 3
+26. `filter_min_base_qual`: 2
+27. `filter_max_base_error_rate`: 0.1
+28. `filter_max_read_error_rate`: 0.05
+29. `filter_max_no_call_fraction`: 0.2
+
+30. `read_structure`: 8M143T 8M143T
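+    (In fgbio's read-structure notation, `8M143T` marks the first 8 bases of a read as the molecular barcode/UMI (`M`) and the remaining 143 bases as genomic template (`T`); one structure is given per read, so `8M143T 8M143T` describes a pair where both reads carry an 8 bp inline UMI.)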
+
+31. `correct_umi`: True
+32. `correct_umi_max_mismatches`: 3
+33. `correct_umi_min_distance`: 1
+34. `umi_file`:
+
 ## Metadata file
 
-Please create a metadata file with columns
+Please create a metadata file with the following columns; other columns can exist but are not required
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column containing information with regards to R1, R2 and R3 of the sequencing file. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present

diff --git a/config/config.yaml b/config/config.yaml
index 5919356..36a31a4 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,3 +42,31 @@ bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions f
 chrom_sizes: /Path/to/chrom_sizes.tsv # Tab separated file containing the chromosome names and their lengths should match the reference genome
 
 dict_genome: /Path/to/genome.dict # Dictionary file for the reference genome
+
+# Parameters for read grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+
+# Correct UMIs based on a list of valid UMIs
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /path/to/umi.txt
diff --git a/config/test_config_exliquid.yaml b/config/test_config_exliquid.yaml
new file mode 100644
index 0000000..ac9ba5e
--- /dev/null
+++ b/config/test_config_exliquid.yaml
@@ -0,0 +1,53 @@
+Adapter_R1: []
+Adapter_R3: []
+SeqType: Panel
+chrom_sizes: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/stats/hg19_chrTotalLength.tsv
+
+dbsnp: /applications/otp/ngs_share_complete/assemblies/hg19_GRCh37_1000genomes/databases/dbSNP/dbSNP_147/00-All.vcf.gz
+dict_genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.dict
+
+genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.fa
+
+library_prep_kit: IDT_xGen_cfDNA_FFPE
+
+log_dir: /dh-projects/exliquid/scratch/results_alignment_test/logs/
+
+metadata: /dh-projects/exliquid/scratch/input/bare_min_meta_data.csv
+
+pid: EXLIQUID_EX59
+sample: EXLIQUID_EX59-BUFFYCOAT_control
+
+target_regions: /dh-projects/exliquid/raw_data/reference/Targets-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+bait_regions: /dh-projects/exliquid/raw_data/reference/Probes-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+# trim_adapters: false
+
+work_dir: /dh-projects/exliquid/scratch/results_alignment_test/
+
+# bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions for the analysis only required if SeqType is Panel/WES, if not provided a bait file will be generated from the target file
+
+# Parameters for read grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+
+# Correct UMIs based on a list of valid UMIs
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /dh-projects/exliquid/raw_data/reference/umi_list.txt
diff --git a/dag.pdf b/dag.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profile/config.yaml b/profile/config.yaml
index b38fca9..692d38b 100644
--- a/profile/config.yaml
+++ b/profile/config.yaml
@@ -3,14 +3,14 @@ jobs: 10
 latency-wait: 60
 reason: True
-
+# default-resources:
+#     slurm_partition: compute
 keep-going: True
 printshellcmds: True
 rerun-incomplete: True
 restart-times: 2
-
 # delete-temp-output: True
-conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs
+conda-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/conda_envs
 # conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs_ohne_singularity
-singularity-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/Singularity
-singularity-args: "-B /dh-projects/richter_transformation:/dh-projects/richter_transformation,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy"
+singularity-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/Singularity
+singularity-args: "-B /dh-projects/ag-ishaque:/dh-projects/ag-ishaque,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy,/dh-projects/exliquid:/dh-projects/exliquid"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 9a8334e..716db10 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,4 +1,5 @@
 import os
+from datetime import datetime
 import pandas as pd
 from pathlib import Path
 from itertools import product
@@ -9,10 +10,10 @@ container: "docker://condaforge/mambaforge"
 
 include: "rules/common.smk" #done
 include: "rules/create_links.smk"
+include: "rules/umi_based_rules.smk" #done
 include: "rules/adapter_trimming.smk" #done
 include: "rules/alignment.smk" #done
-include: "rules/umi_consensus.smk" #done
-include: "rules/duplicate_marking.smk" #done
+# include: "rules/duplicate_marking.smk" #done
 include: "rules/flagstatt.smk" #done
 include: "rules/metric_hsmetrics.smk" #done
 include: "rules/metric_insert_size.smk" #done
@@ -35,11 +36,10 @@ rule all:
             wrkdir / "metrics" / "{sample}.mosdepth.global.dist.txt",
             sample=config["sample"],
         ),
-        # expand(wrkdir / "metrics" / "{sample}.flagstat", sample=config["sample"]),
         expand(
             wrkdir / "metrics" / "{sample}_{ext}.flagstat",
             sample=config["sample"],
-            ext=["dedup.recall.sorted", "merged_umi_annot"],
+            ext=["dedup.recall.sorted"],
         ),
         expand(
             wrkdir / "metrics" / "{sample}_insert_size_metrics.txt",
@@ -64,3 +64,20 @@ rule all:
         expand(wrkdir / "metrics" / "{sample}.hs_metrics.txt", sample=config["sample"])
         if config["SeqType"] in ["Panel", "WES"]
         else [],
+        expand(
+            wrkdir
+            / "metrics"
+            / "correct_umi"
+            / "{run_id}"
+            / "{sample}_{lane}_umi_metrics.txt",
+            filtered_product,
+            run_id=RUN_ID,
+            sample=config["sample"],
+            lane=LANE,
+        )
+        if correct_umi
+        else [],
+        expand(
+            wrkdir / "metrics" / "{sample}_consensus_metrics.tsv",
+            sample=config["sample"],
+        ),
diff --git a/workflow/envs/consensus.yaml b/workflow/envs/consensus.yaml
new file mode 100644
index 0000000..ff898d8
--- /dev/null
+++ b/workflow/envs/consensus.yaml
@@ -0,0 +1,6 @@
+name: consensus
+channels:
+  - conda-forge
+dependencies:
+  - pandas=2.2.2
+  - python=3.11
diff --git a/workflow/rules/adapter_trimming.smk b/workflow/rules/adapter_trimming.smk
index 0778088..3026e62 100755
--- a/workflow/rules/adapter_trimming.smk
+++ b/workflow/rules/adapter_trimming.smk
@@ -1,103 +1,78 @@
 # disabling adapter trimming when set to false,
 # To maintain the same input and output files we create links instead of running cutadapt
-if config["trim_adapters"]:
-
-    rule create_adapter_fastq:
-        params:
-            adapt_1=adapter_seq_r1,
-            adapt_3=adapter_seq_r3,
-        output:
-            adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
-            adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
-        threads: 1
-        resources:
-            mem_mb=1000,
-            runtime=20,
-            nodes=1,
-        message:
-            "Creating adapter fastq files"
-        run:
-            with open(output.adapt_1, "w") as handle:
-                count = 1
-                for i in params.adapt_1:
-                    handle.write(">adapter_" + str(count) + "\n")
-                    handle.write(i + "\n")
-                    count += 1
+rule create_adapter_fastq:
+    params:
+        adapt_1=adapter_seq_r1,
+        adapt_3=adapter_seq_r3,
+    output:
+        adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
+        adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
+    threads: 1
+    resources:
+        mem_mb=1000,
+        runtime=20,
+        nodes=1,
+    message:
+        "Creating adapter fastq files"
+    run:
+        with open(output.adapt_1, "w") as handle:
+            count = 1
+            for i in params.adapt_1:
+                handle.write(">adapter_" + str(count) + "\n")
+                handle.write(i + "\n")
+                count += 1
 
-            with open(output.adapt_3, "w") as handle:
-                count = 1
-                for i in params.adapt_3:
-                    handle.write(">adapter_" + str(count) + "\n")
-                    handle.write(i + "\n")
-                    count += 1
+        with open(output.adapt_3, "w") as handle:
+            count = 1
+            for i in params.adapt_3:
+                handle.write(">adapter_" + str(count) + "\n")
+                handle.write(i + "\n")
+                count += 1
 
-    rule cutadapt:
-        input:
-            adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq",
-            adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq",
-            fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
-            fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
-        output:
-            fastq_r1=temp(
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "cutadapt"
-                / "{sample}_R1_{lane}_trim.fastq.gz"
-            ),
-            fastq_r3=temp(
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "cutadapt"
-                / "{sample}_R3_{lane}_trim.fastq.gz"
-            ),
-        log:
-            logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
-        threads: 8
-        resources:
-            mem_mb=8000,
-            runtime=4 * 60,
-            nodes=1,
-        conda:
-            "../envs/cutadapt.yaml"
-        message:
-            "Trimming adapters using cutadapt"
-        shell:
-            "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}"
-else:
-
-    rule cutadapt:
-        input:
-            fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
-            fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
-        output:
-            fastq_r1=temp(
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "cutadapt"
-                / "{sample}_R1_{lane}_trim.fastq.gz"
-            ),
-            fastq_r3=temp(
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "cutadapt"
-                / "{sample}_R3_{lane}_trim.fastq.gz"
-            ),
-        log:
-            logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
-        threads: 1
-        resources:
-            mem_mb=1000,
-            runtime=60,
-            nodes=1,
-        message:
-            "Skipping adapter trimming, creating links instead of running cutadapt"
-        conda:
-            "../envs/cutadapt.yaml"
-        shell:
-            "(ln -s {input.fastq_r1} {output.fastq_r1} && ln -s {input.fastq_r3} {output.fastq_r3}) &> {log}"
+rule cutadapt:
+    input:
+        adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq",
+        adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq",
+        fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
+        fastq_r3=(
+            wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz"
+            if read_structure
+            else wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz"
+        ),
+    output:
+        fastq_r1=temp(
+            wrkdir
+            / "fastq"
+            / "{run_id}"
+            / "cutadapt"
+            / "{sample}_R1_{lane}_trim.fastq.gz"
+        ),
+        fastq_r3=temp(
+            wrkdir
+            / "fastq"
+            / "{run_id}"
+            / "cutadapt"
+            / "{sample}_R2_{lane}_trim.fastq.gz"
+            if read_structure
+            else wrkdir
+            / "fastq"
+            / "{run_id}"
+            / "cutadapt"
+            / "{sample}_R3_{lane}_trim.fastq.gz"
+        ),
+    log:
+        logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
+    threads: 8
+    resources:
+        mem_mb=8000,
+        runtime=72 * 60,
+        nodes=1,
+    conda:
+        "../envs/cutadapt.yaml"
+    message:
+        "Trimming adapters using cutadapt"
+    shell:
+        "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}"
diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk
index 86d1746..3b7a9f9 100755
--- a/workflow/rules/alignment.smk
+++ b/workflow/rules/alignment.smk
@@ -6,24 +6,51 @@ rule fastqbam:
     """
     input:
         genome=genome,
-        fastq_r1=wrkdir
-        / "fastq"
-        / "{run_id}"
-        / "cutadapt"
-        / "{sample}_R1_{lane}_trim.fastq.gz",
-        fastq_r3=wrkdir
-        / "fastq"
-        / "{run_id}"
-        / "cutadapt"
-        / "{sample}_R3_{lane}_trim.fastq.gz",
+        fastq_r1=(
+            (
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "cutadapt"
+                / "{sample}_R1_{lane}_trim.fastq.gz"
+            )
+            if config["trim_adapters"]
+            else (wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz")
+        ),
+        fastq_r3=(
+            (
+                (
+                    wrkdir
+                    / "fastq"
+                    / "{run_id}"
+                    / "cutadapt"
+                    / "{sample}_R2_{lane}_trim.fastq.gz"
+                )
+                if config["trim_adapters"]
+                else (wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz")
+            )
+            if read_structure
+            else (
+                (
+                    wrkdir
+                    / "fastq"
+                    / "{run_id}"
+                    / "cutadapt"
+                    / "{sample}_R3_{lane}_trim.fastq.gz"
+                )
+                if config["trim_adapters"]
+                else (wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz")
+            )
+        ),
     output:
         temp(wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"),
     params:
         library=library_prep_kit,
-    threads: 8
+        read_structure="--read-structures " + read_structure if read_structure else "",
+    threads: 1
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
     conda:
         "../envs/fgbio.yaml"
@@ -37,10 +64,83 @@ rule fastqbam:
         "--input {input.fastq_r1} {input.fastq_r3} "
         "--sample {wildcards.sample} "
         "--library {params.library} "
-        "--output {output} "
+        "--output {output} {params.read_structure} "
         ") &> {log}"
 
 
+if correct_umi:
+
+    rule CorrectUMI:
+        input:
+            bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam",
+            umi_file=umi_file,
+        output:
+            bam=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "{sample}_{lane}_unmapped_umi_corrected.bam"
+            ),
+            metrics=wrkdir
+            / "metrics"
+            / "correct_umi"
+            / "{run_id}"
+            / "{sample}_{lane}_umi_metrics.txt",
+        params:
+            max_mismatches=correct_umi_max_mismatches,
+            min_distance=correct_umi_min_distance,
+        threads: 1
+        resources:
+            mem_mb=2000,
+            runtime=72 * 60,
+            nodes=1,
+        conda:
+            "../envs/fgbio.yaml"
+        log:
+            logdir / "fgbio" / "correct_umi_{run_id}_{sample}_{lane}.log",
+        message:
+            "Correcting UMIs."
+        shell:
+            "("
+            "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io CorrectUmis "
+            "--input {input.bam} "
+            "--output {output.bam} "
+            "--max-mismatches {params.max_mismatches} "
+            "--min-distance {params.min_distance} "
+            "--umi-files {input.umi_file} "
+            "--metrics {output.metrics} "
+            "--dont-store-original-umis ) &> {log}"
+
+
+if not read_structure:
+
+    rule AnnotateUMI:
+        input:
+            alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
+            fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz",
+        output:
+            temp(
+                wrkdir
+                / "alignments"
+                / "{run_id}"
+                / "{sample}_aln_{lane}_umi_annot.bam"
+            ),
+        threads: 1
+        resources:
+            mem_mb=8000,
+            runtime=72 * 60,
+            nodes=1,
+        conda:
+            "../envs/fgbio.yaml"
+        log:
+            logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log",
+        message:
+            "Annotating BAM with UMIs from fastq file."
+        shell:
+            "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}"
+
+
 rule bwa_map:
     """
     First pass alignment
@@ -48,13 +148,20 @@
     input:
         genome=genome,
-        bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam",
+        bam=(
+            wrkdir
+            / "fastq"
+            / "{run_id}"
+            / "{sample}_{lane}_unmapped_umi_corrected.bam"
+            if correct_umi
+            else wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"
+        ),
     output:
         temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam"),
     threads: 8
     resources:
         mem_mb=12000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
     conda:
         "../envs/fgbio.yaml"
@@ -79,6 +186,14 @@ rule merge:
     """
     input:
         expand(
+            wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
+            filtered_product,
+            run_id=RUN_ID,
+            sample=config["sample"],
+            lane=LANE,
+        )
+        if read_structure
+        else expand(
             wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam",
             filtered_product,
             run_id=RUN_ID,
@@ -86,11 +201,11 @@ lane=LANE,
         ),
     output:
-        temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"),
+        bam=temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"),
     threads: 8
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
     conda:
         "../envs/samtools.yaml"
@@ -99,7 +214,7 @@ log:
         logdir / "samtools/{sample}_merge.log",
     shell:
-        "samtools merge -f {output} {input} &> {log}"
+        "(samtools merge -f {output.bam} {input}) &> {log} "
 
 
 rule realign:
@@ -114,10 +229,11 @@
         bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"),
     threads: 8
     resources:
-        mem_mb=16000,
-        runtime=24 * 60,
+        mem_mb=50000,
+        runtime=72 * 60,
         nodes=1,
         mem_fgbio=8000,
+        mem_samtools=8000,
     conda:
         "../envs/fgbio.yaml"
     log:
@@ -133,5 +249,5 @@
         "--ref {input.ref} "
         "--tags-to-reverse Consensus "
         "--tags-to-revcomp Consensus "
-        "| samtools sort --threads {threads} -o {output.bam}##idx##{output.bai} --write-index "
+        "| samtools sort --threads {threads} -m{resources.mem_samtools}m -o {output.bam}##idx##{output.bai} --write-index "
         ") &> {log} "
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index ef7d139..1df85ac 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,43 +1,32 @@
-if "library_prep_kit" in config:
-    library_prep_kit = config["library_prep_kit"]
-    if library_prep_kit == "":
-        library_prep_kit = "Unknown"
-else:
-    library_prep_kit = "Unknown"
+print(
+    """
+    \tAlignment pipeline for UMI based sequencing reads
+    \tAuthor: Shashwat Sahay
+    \tEmail: shashwat.sahay@charite.de
+    \tVersion: 0.2.0
 
-if "sample" not in config:
-    raise ValueError("Sample name not provided")
+    """
+)
 
-if "pid" not in config:
-    raise ValueError("Patient ID not provided")
 
-if "genome" not in config:
-    raise ValueError("Genome not provided")
-else:
-    genome = config["genome"]
+def get_date_time():
+    now = datetime.now()
 
-if "work_dir" not in config:
-    raise ValueError("Work directory not provided")
-else:
-    wrkdir = Path(config["work_dir"])
+    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
+    return "========= " + dt_string + " ========= "
 
-if "log_dir" not in config:
-    logdir = wrkdir / "logs"
-else:
-    logdir = Path(config["log_dir"])
 
-if "trim_adapters" not in config:
-    config["trim_adapters"] = False
+print(get_date_time(), "Starting run")
 
-if config["trim_adapters"]:
-    if "Adapter_R1" not in config:
-        raise ValueError("Adapter sequence for R1 not provided")
-    else:
-        adapter_seq_r1 = config["Adapter_R1"]
-    if "Adapter_R3" not in config:
-        raise ValueError("Adapter sequence for R3 not provided")
-    else:
-        adapter_seq_r3 = config["Adapter_R3"]
+#######################################################################
+################# Sanity check on the input data ######################
+#######################################################################
+
+if "sample" not in config:
+    raise ValueError("Sample name not provided")
+
+if "pid" not in config:
+    raise ValueError("Patient ID not provided")
 
 if "metadata" not in config:
     raise ValueError("Metadata file not provided")
@@ -48,7 +37,14 @@ else:
         & (metadata["PATIENT_ID"] == config["pid"])
     ]
     if metadata.shape[0] == 0:
-        raise ValueError("No metadata found for the given sample and patient ID")
+        raise ValueError(
+            "No metadata found for the given sample: %s and patient ID: %s"
+            % (config["sample"], config["pid"])
+        )
+
+##########################################################
+###### Setting Sequencing type and related parameters ####
+##########################################################
 
 if "SeqType" not in config:
     raise ValueError("Sequencing type not provided")
@@ -72,79 +68,231 @@ else:
     dict_genome = config["dict_genome"]
 
 if "bait_regions" not in config:
-    print("Bait regions not provided, will be calculated from target regions")
+    print(
+        get_date_time(),
+        "Bait regions not provided, will be calculated from target regions",
+    )
 else:
     bait_regions = config["bait_regions"]
 
+
+##########################################################
+################ Setting Library kit #####################
+##########################################################
+
+if "library_prep_kit" in config:
+    library_prep_kit = config["library_prep_kit"]
+    if library_prep_kit == "":
+        library_prep_kit = "Unknown"
+else:
+    library_prep_kit = "Unknown"
+
+
+##########################################################
+################## Setting Genome ########################
+##########################################################
+
+if "genome" not in config:
+    raise ValueError("Genome not provided")
+else:
+    genome = config["genome"]
+
 if "dbsnp" not in config:
     raise ValueError("dbSNP file not provided")
 else:
     dbsnp = config["dbsnp"]
 
-if "SORTING_COLLECTION_SIZE_RATIO" not in config:
-    print("Setting default value for SORTING_COLLECTION_SIZE_RATIO to 0.01")
-    SORTING_COLLECTION_SIZE_RATIO = 0.01
+
+##########################################################
+############# Setting working directory ##################
+##########################################################
+
+if "work_dir" not in config:
+    raise ValueError("Work directory not provided")
 else:
-    SORTING_COLLECTION_SIZE_RATIO = config["SORTING_COLLECTION_SIZE_RATIO"]
+    wrkdir = Path(config["work_dir"])
 
-if "allowed_edits" not in config:
-    print("Setting default value for allowed_edits to 1")
-    allowed_edits = 1
+if "log_dir" not in config:
+    logdir = wrkdir / "logs"
 else:
-    allowed_edits = config["allowed_edits"]
+    logdir = Path(config["log_dir"])
+
 
-if "strategy" not in config:
-    print("Setting default value for strategy to Adjacency")
-    strategy = "Adjacency"
+#########################################################################
+############# Check if UMIs have been demultiplexed #####################
+#########################################################################
+
+if "read_structure" in config:
+    read_structure = config["read_structure"]
+    if read_structure == "":
+        raise ValueError('"read_structure" is defined in the config but empty')
+    print(
+        get_date_time(),
+        "Read structure %s has been provided; expecting only R1 and R2 files for each sample and lane"
+        % read_structure,
+    )
 else:
-    strategy = config["strategy"]
+    read_structure = False
 
-if "consensus_min_reads" not in config:
-    if seq_type == "WGS":
-        print("Setting default value for consensus_min_reads to 3 as SeqType is WGS")
-        consensus_min_reads = 3
+
+#########################################################################
+############# Setting variables related to trimming #####################
+#########################################################################
+
+if "trim_adapters" not in config or (not config["trim_adapters"]):
+    print(get_date_time(), "Adapter trimming has been turned off")
+    config["trim_adapters"] = False
+    adapter_seq_r1 = None
+    adapter_seq_r3 = None
+else:
+    if config["trim_adapters"]:
+        if "Adapter_R1" not in config:
+            raise ValueError("Adapter sequence for R1 not provided")
+        else:
+            adapter_seq_r1 = config["Adapter_R1"]
+        if "Adapter_R3" not in config:
+            raise ValueError("Adapter sequence for R3 not provided")
+        else:
+            adapter_seq_r3 = config["Adapter_R3"]
+
+##########################################################################
+############# Setting variables related to UMI correction ################
+##########################################################################
+
+if "correct_umi" in config:
+    correct_umi = config["correct_umi"]
+else:
+    correct_umi = False
+
+if correct_umi:
+    print(get_date_time(), "UMI Correction is enabled")
+    if "umi_file" not in config:
+        raise ValueError("UMI file not provided")
+    else:
+        umi_file = config["umi_file"]
+    if "correct_umi_max_mismatches" in config:
+        correct_umi_max_mismatches = config["correct_umi_max_mismatches"]
     else:
         print(
-            "Setting default value for consensus_min_reads to 1 as SeqType is not WGS"
+            get_date_time(), "Setting default value for correct_umi_max_mismatches to 3"
         )
-    consensus_min_reads = 1
+        correct_umi_max_mismatches = 3
+    if "correct_umi_min_distance" in config:
+        correct_umi_min_distance = config["correct_umi_min_distance"]
+    else:
+        print(
+            get_date_time(), "Setting default value for correct_umi_min_distance to 1"
+        )
+        correct_umi_min_distance = 1
+else:
+    print(get_date_time(), "UMI Correction is disabled")
+
+
+##########################################################################
+########### Initialising parameters for grouping by fgbio ################
+##########################################################################
+
+if "group_allowed_edits" not in config:
+    print(get_date_time(), "Setting default value for group_allowed_edits to 1")
+    group_allowed_edits = 1
+else:
+    group_allowed_edits = config["group_allowed_edits"]
config["group_allowed_edits"] + +if "group_strategy" not in config: + print(get_data_time(), "Setting default value for strategy to Adjacency") + group_strategy = "Adjacency" +else: + group_strategy = config["group_strategy"] + +if "group_min_mapq" not in config: + print(get_data_time(), "Setting default value for group min mapq to 20") + group_min_mapq = 20 +else: + group_min_mapq = config["group_min_mapq"] + + +########################################################################## +############# Intialising parameters for consensus calling ############### +########################################################################## + +if "consensus_min_reads" not in config: + print(get_data_time(), "Setting default value for consensus_min_reads to 1") + consensus_min_reads = 1 else: consensus_min_reads = config["consensus_min_reads"] if "consensus_min_base_qual" not in config: - print("Setting default value for consensus_min_base_qual to 20") - consensus_min_base_qual = 20 + print(get_data_time(), "Setting default value for consensus_min_base_qual to 2") + consensus_min_base_qual = 2 else: consensus_min_base_qual = config["consensus_min_base_qual"] + +if "consensus_min_input_base_mapq" not in config: + print( + get_data_time(), "Setting default value for consensus_min_input_base_mapq to 10" + ) + consensus_min_input_base_mapq = 10 +else: + consensus_min_input_base_mapq = config["consensus_min_input_base_mapq"] + +if "consensus_error_rate_pre_umi" not in config: + print(get_data_time(), "Setting default value for consensus_error_rate to 45") + consensus_error_rate_pre_umi = 45 +else: + consensus_error_rate_pre_umi = config["consensus_error_rate_pre_umi"] + +if "consensus_error_rate_post_umi" not in config: + print(get_data_time(), "Setting default value for consensus_error_rate to 30") + consensus_error_rate_post_umi = 30 +else: + consensus_error_rate_post_umi = config["consensus_error_rate_post_umi"] + +############################################################################ +######## Initialising parameters for consensus filtering ################### +############################################################################ + if "filter_min_reads" not in config: - if seq_type == "WGS": - print("Setting default value for filter_min_reads to 3 as SeqType is WGS") - filter_min_reads = 3 - else: - print("Setting default value for filter_min_reads to 1 as SeqType is not WGS") - filter_min_reads = 1 + print(get_data_time(), "Setting default value for filter_min_reads") + filter_min_reads = 1 else: filter_min_reads = config["filter_min_reads"] if "filter_min_base_qual" not in config: - print("Setting default value for filter_min_base_qual to 40") + print(get_data_time(), "Setting default value for filter_min_base_qual to 40") filter_min_base_qual = 40 else: filter_min_base_qual = config["filter_min_base_qual"] -if "filter_max_error_rate" not in config: - print("Setting default value for filter_max_error_rate to 0.2") - filter_max_error_rate = 0.2 +if "filter_max_base_error_rate" not in config: + print( + get_data_time(), "Setting default value for filter_max_base_error_rate to 0.2" + ) + filter_max_base_error_rate = 0.2 else: - filter_max_error_rate = config["filter_max_error_rate"] + filter_max_base_error_rate = config["filter_max_base_error_rate"] +if "filter_max_read_error_rate" not in config: + print( + get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" + ) + filter_max_read_error_rate = 0.1 +else: + filter_max_read_error_rate = 
config["filter_max_read_error_rate"] -LANE = metadata["LANE_NO"].unique().tolist() -RUN_ID = metadata["RUN_ID"].unique().tolist() +if "filter_max_no_call_fraction" not in config: + print( + get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" + ) + filter_max_no_call_fraction = 0.1 +else: + filter_max_no_call_fraction = config["filter_max_no_call_fraction"] -# Create a function which checks if the input configuration is valid +# Create a set of valid combination for run lane and read ids + +LANE = metadata["LANE_NO"].unique().tolist() +RUN_ID = metadata["RUN_ID"].unique().tolist() def filter_combinator(combinator, allow_list): diff --git a/workflow/rules/create_links.smk b/workflow/rules/create_links.smk index 73773f2..bc21743 100644 --- a/workflow/rules/create_links.smk +++ b/workflow/rules/create_links.smk @@ -2,6 +2,7 @@ rule create_links_files: params: metadata=config["metadata"], fastq_dir=wrkdir / "fastq", + read_structure=read_structure, resources: mem_mb=1000, runtime=20, @@ -21,12 +22,16 @@ rule create_links_files: sample=config["sample"], lane=LANE, ), - fastq_r3=expand( - wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", - filtered_product, - run_id=RUN_ID, - sample=config["sample"], - lane=LANE, + fastq_r3=( + expand( + wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", + filtered_product, + run_id=RUN_ID, + sample=config["sample"], + lane=LANE, + ) + if not read_structure + else [] ), message: "Creating links to fastq files" @@ -36,6 +41,14 @@ rule create_links_files: (metadata["SAMPLE_NAME"] == config["sample"]) & (metadata["PATIENT_ID"] == config["pid"]) ] + if params.read_structure: + if metadata["READ"].nunique() != 2: + raise ValueError("Read structure is provided but R1 R2 and R3 provided") + else: + if metadata["READ"].nunique() != 3: + raise ValueError( + "Read structure is not provided but R1 R2 and R3 not provided" + ) for index, row in metadata.iterrows(): fastq_file = Path(row["FASTQ_FILE"]) diff --git a/workflow/rules/duplicate_marking.smk b/workflow/rules/duplicate_marking.smk index ff10227..3fe1bd5 100755 --- a/workflow/rules/duplicate_marking.smk +++ b/workflow/rules/duplicate_marking.smk @@ -1,10 +1,12 @@ +# Deprecated + rule duplicates: input: - wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam", + wrkdir / "alignments" / "{sample}_temp.bam", output: - bam=wrkdir / "alignments" / "{sample}_dedup.bam", - bai=wrkdir / "alignments" / "{sample}_dedup.bam.bai", + bam=temp(wrkdir / "alignments" / "{sample}_dedup.bam"), + # bai=temp(wrkdir / "alignments" / "{sample}_temp.bam.bai"), metric=wrkdir / "metrics" / "{sample}_marked_dup_metrics.txt", params: SORTING_COLLECTION_SIZE_RATIO=SORTING_COLLECTION_SIZE_RATIO, @@ -15,14 +17,13 @@ rule duplicates: logdir / "gatk/{sample}_dedup.log", resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, message: - "Marking duplicates on consensus reads." + "Marking duplicates on pre-consensus reads.: Decrapated slated for removal." 
     shell:
         " ( "
        " gatk --java-options '-Xmx{resources.mem_mb}m' MarkDuplicates -I {input} "
-        " -O {output.bam} -M {output.metric} "
-        " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} && "
-        " samtools index -b {output.bam} {output.bai} "
+        " -O {output.bam} -M {output.metric} --BARCODE_TAG RX "
+        " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} "
         " ) &> {log} "
diff --git a/workflow/rules/flagstatt.smk b/workflow/rules/flagstatt.smk
index da699de..dc55326 100755
--- a/workflow/rules/flagstatt.smk
+++ b/workflow/rules/flagstatt.smk
@@ -1,23 +1,3 @@
-# rule flagstatt:
-#     input:
-#         bam=wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam",
-#     output:
-#         wrkdir / "metrics" / "{sample}.flagstat",
-#     conda:
-#         "../envs/sambamba.yaml"
-#     threads: 1
-#     resources:
-#         mem_mb=8000,
-#         runtime=24 * 60,
-#         nodes=1,
-#     log:
-#         logdir / "sambamba/{sample}.log",
-#     message:
-#         "Running Flagstat"
-#     shell:
-#         "(sambamba flagstat {input.bam} > {output}) &> {log}"
-
-
 rule flagstatt:
     input:
         bam=wrkdir / "alignments" / "{sample}_{ext}.bam",
diff --git a/workflow/rules/metric_hsmetrics.smk b/workflow/rules/metric_hsmetrics.smk
index 687458a..29c480c 100755
--- a/workflow/rules/metric_hsmetrics.smk
+++ b/workflow/rules/metric_hsmetrics.smk
@@ -41,7 +41,7 @@ if seq_type in ["Panel", "WES"]:
         threads: 1
         resources:
             mem_mb=8000,
-            runtime=24 * 60,
+            runtime=4 * 60,
             nodes=1,
         log:
             logdir / "bedtools" / "{sample}_slop.log",
diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk
index f15ef7a..3ccf3b6 100755
--- a/workflow/rules/recalibration.smk
+++ b/workflow/rules/recalibration.smk
@@ -1,6 +1,6 @@
 rule baseRecalibrator:
     input:
-        bam=wrkdir / "alignments" / "{sample}_dedup.bam",
+        bam=wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam",
         dbsnp=dbsnp,
         genome=genome,
     output:
@@ -12,7 +12,7 @@
     threads: 8
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
     log:
         logdir / "gatk/{sample}_recal.log",
diff --git a/workflow/rules/umi_consensus.smk b/workflow/rules/umi_based_rules.smk
similarity index 66%
rename from workflow/rules/umi_consensus.smk
rename to workflow/rules/umi_based_rules.smk
index f190eb2..444914d 100755
--- a/workflow/rules/umi_consensus.smk
+++ b/workflow/rules/umi_based_rules.smk
@@ -1,22 +1,4 @@
-rule fgbio:
-    input:
-        alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
-        fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz",
-    output:
-        temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam"),
-    threads: 1
-    resources:
-        mem_mb=8000,
-        runtime=4 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log",
-    message:
-        "Annotating BAM with UMIs from fastq file."
-    shell:
-        "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}"
+
 
 
 rule fix_mate:
@@ -26,13 +8,17 @@
     This can cause problems in downstream analysis.
""" input: - bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", + bam=( + wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + if not read_structure + else wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + ), output: bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"), threads: 2 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -57,12 +43,13 @@ rule group_reads: ), stats=wrkdir / "metrics" / "{sample}.grouped-family-sizes.txt", params: - strategy=strategy, - allowed_edits=allowed_edits, + strategy=group_strategy, + allowed_edits=group_allowed_edits, + min_mapq=group_min_mapq, threads: 2 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -75,6 +62,7 @@ rule group_reads: "--input {input.bam} " "--strategy {params.strategy} " "--edits {params.allowed_edits} " + "--min-map-q {params.min_mapq} " "--output {output.bam} " "--family-size-histogram {output.stats} " "&> {log} " @@ -85,13 +73,17 @@ rule call_consensus_reads: bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam", output: bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.bam"), + metrics=logdir / "fgbio/call_consensus_reads.{sample}.log", params: min_reads=consensus_min_reads, min_base_qual=consensus_min_base_qual, + min_input_base_mapq=consensus_min_input_base_mapq, + error_rate_post_umi=consensus_error_rate_post_umi, + error_rate_pre_umi=consensus_error_rate_pre_umi, threads: 4 resources: mem_mb=4000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -104,7 +96,10 @@ rule call_consensus_reads: "--input {input.bam} " "--output {output.bam} " "--min-reads {params.min_reads} " - "--min-input-base-quality {params.min_base_qual} " + "--error-rate-pre-umi {params.error_rate_pre_umi} " + "--error-rate-post-umi {params.error_rate_post_umi} " + "--min-consensus-base-quality {params.min_base_qual} " + "--min-input-base-quality {params.min_input_base_mapq} " "--threads {threads} " "&> {log}" @@ -118,11 +113,13 @@ rule filter_consensus_reads: params: min_reads=filter_min_reads, min_base_qual=filter_min_base_qual, - max_error_rate=filter_max_error_rate, + max_base_error_rate=filter_max_base_error_rate, + max_read_error_rate=filter_max_read_error_rate, + max_no_call_fraction=filter_max_no_call_fraction, threads: 8 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -138,6 +135,29 @@ rule filter_consensus_reads: "--output {output.bam} " "--ref {input.ref} " "--min-reads {params.min_reads} " + "--max-read-error-rate {params.max_read_error_rate} " + "--max-base-error-rate {params.max_base_error_rate} " "--min-base-quality {params.min_base_qual} " - "--max-base-error-rate {params.max_error_rate} " + "--max-no-call-fraction {params.max_no_call_fraction} " + ") &> {log} " + + +rule gatherConsensusMetrics: + input: + logdir / "fgbio/call_consensus_reads.{sample}.log", + output: + wrkdir / "metrics" / "{sample}_consensus_metrics.tsv", + params: + sample=str(config["sample"]), + threads: 1 + resources: + mem_mb=1000, + runtime=60, + nodes=1, + log: + logdir / "metrics" / "{sample}_consensus_metrics.log", + conda: + "../envs/consensus.yaml" + script: + "../scripts/getConsensusMetrics.py" diff --git a/workflow/scripts/getConsensusMetrics.py b/workflow/scripts/getConsensusMetrics.py new file mode 100644 index 0000000..3706538 --- /dev/null +++ b/workflow/scripts/getConsensusMetrics.py @@ -0,0 +1,24 @@ +import os +import 
+
+column_set1 = ['Raw Reads Filtered Due to Insufficient Support', 'Raw Reads Filtered Due to Mismatching Cigars', 'Raw Reads Filtered Due to Low Base Quality',
+               'Raw Reads Filtered Due to Orphan Consensus Created', 'Consensus reads emitted', 'Total Raw Reads Considered']
+column_set2 = ['overlapping templates', 'corrected templates', 'overlapping bases', 'corrected bases']
+consensus_df = pd.DataFrame(columns=column_set1 + column_set2, index=[snakemake.params['sample']])
+with open(snakemake.input[0]) as handle:
+    for line in handle:
+        for i in consensus_df.columns:
+            if i in line:
+                if i in column_set1:
+                    line = line.split(':')[-1].split('(')[0].replace('.', '').replace(',', '').replace(' ', '').strip()
+                else:
+                    line = line.split(']')[-1].split('(')[0].replace('.', '').replace(i, '').replace(',', '').replace(' ', '').strip()
+                line = int(line)
+                consensus_df.loc[snakemake.params['sample'], i] = line
+                break
+
+consensus_df.to_csv(snakemake.output[0], sep='\t', index=True)

From d67ac4a647bcfa9b795a727fcd44f5e23d3f06af Mon Sep 17 00:00:00 2001
From: sahays
Date: Wed, 19 Jun 2024 12:04:38 +0200
Subject: [PATCH 02/11] Aligning metadata information according to the otp
 folder

---
 .pre-commit-config.yaml | 0
 README.md | 2 +-
 workflow/rules/common.smk | 2 +-
 workflow/rules/create_links.smk | 4 ++--
 4 files changed, 4 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
index 0943a3c..38fe433 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,7 @@ Please create a metadata file with the following columns; other columns can exi
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column containing information with regards to R1, R2 and R3 of the sequencing file. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present
-4. SAMPLE_NAME: Containing the sample name which is entered in the config file. Please note if the metadata file consists of multiple samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
+4. SAMPLE_TYPE: Containing the sample name which is entered in the config file. Please note if the metadata file consists of multiple samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
 5. PATIENT_ID: Containing the ```pid``` which is entered in the config file. Please note if the metadata file consists of multiple PIDs only the ```sample``` ```pid``` combination mentioned in the sample and pid directive of the config.yaml will be run.
 7. RUN_ID: Please mention the run id for the sequencing run for the sample
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 1df85ac..cddc73b 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -33,7 +33,7 @@ if "metadata" not in config:
 else:
     metadata = pd.read_csv(config["metadata"])
     metadata = metadata[
-        (metadata["SAMPLE_NAME"] == config["sample"])
+        (metadata["SAMPLE_TYPE"] == config["sample"])
         & (metadata["PATIENT_ID"] == config["pid"])
     ]
     if metadata.shape[0] == 0:
diff --git a/workflow/rules/create_links.smk b/workflow/rules/create_links.smk
index bc21743..277a8a2 100644
--- a/workflow/rules/create_links.smk
+++ b/workflow/rules/create_links.smk
@@ -38,7 +38,7 @@ rule create_links_files:
     run:
         metadata = pd.read_csv(params.metadata)
         metadata = metadata[
-            (metadata["SAMPLE_NAME"] == config["sample"])
+            (metadata["SAMPLE_TYPE"] == config["sample"])
             & (metadata["PATIENT_ID"] == config["pid"])
         ]
         if params.read_structure:
@@ -59,7 +59,7 @@
                 params.fastq_dir
                 / row["RUN_ID"]
                 / (
-                    row["SAMPLE_NAME"]
+                    row["SAMPLE_TYPE"]
                     + "_"
                     + row["READ"]
                     + "_"

From 80c33d97522e7a1db4e56e82ab095b76860fd44b Mon Sep 17 00:00:00 2001
From: sahays
Date: Wed, 19 Jun 2024 12:11:21 +0200
Subject: [PATCH 03/11] strange

---
 .gitignore | 2 +
 LICENSE | 0
 README.md | 33 +-
 config/config.yaml | 28 --
 config/test_config_exliquid.yaml | 53 ----
 dag.pdf | Bin
 profile/config.yaml | 10 +-
 workflow/Snakefile | 25 +-
 workflow/envs/consensus.yaml | 6 -
 workflow/envs/coveragePlot.yaml | 0
 workflow/envs/cutadapt.yaml | 0
 workflow/envs/fgbio.yaml | 0
 workflow/envs/gatk.yaml | 0
 workflow/envs/mosdepth.yaml | 0
 workflow/envs/sambamba.yaml | 0
 workflow/envs/samtools.yaml | 0
 workflow/rules/adapter_trimming.smk | 167 +++++-----
 workflow/rules/alignment.smk | 158 ++--------
 workflow/rules/common.smk | 288 +++++-------------
 workflow/rules/create_links.smk | 29 +-
 workflow/rules/duplicate_marking.smk | 17 +-
 workflow/rules/flagstatt.smk | 20 ++
 workflow/rules/metric_hsmetrics.smk | 2 +-
 workflow/rules/recalibration.smk | 4 +-
 ...{umi_based_rules.smk => umi_consensus.smk} | 78 ++---
 workflow/scripts/getConsensusMetrics.py | 24 --
 workflow/scripts/getCoveragePlot_snakemake.R | 0
 27 files changed, 271 insertions(+), 673 deletions(-)
 mode change 100644 => 100755 .gitignore
 mode change 100644 => 100755 LICENSE
 mode change 100644 => 100755 README.md
 mode change 100644 => 100755 config/config.yaml
 delete mode 100644 config/test_config_exliquid.yaml
 delete mode 100644 dag.pdf
 mode change 100644 => 100755 profile/config.yaml
 mode change 100644 => 100755 workflow/Snakefile
 delete mode 100644 workflow/envs/consensus.yaml
 mode change 100644 => 100755 workflow/envs/coveragePlot.yaml
 mode change 100644 => 100755 workflow/envs/cutadapt.yaml
 mode change 100644 => 100755 workflow/envs/fgbio.yaml
 mode change 100644 => 100755 workflow/envs/gatk.yaml
 mode change 100644 => 100755 workflow/envs/mosdepth.yaml
 mode change 100644 => 100755 workflow/envs/sambamba.yaml
 mode change 100644 => 100755 workflow/envs/samtools.yaml
 mode change 100644 => 100755 workflow/rules/common.smk
 mode change 100644 => 100755 workflow/rules/create_links.smk
 rename workflow/rules/{umi_based_rules.smk => umi_consensus.smk} (66%)
 delete mode 100644 workflow/scripts/getConsensusMetrics.py
 mode change 100644 => 100755 workflow/scripts/getCoveragePlot_snakemake.R

diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
index 0b9ea32..945d63d
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ config/S*.yaml
 .snakemake/
 Singularity/
 conda_envs/
+conda_envs.bkp/
+
 conda_envs_ohne_singularity/
 profile/config.yaml.bkp
diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 38fe433..699aa83
--- a/README.md
+++ b/README.md
@@ -9,7 +9,9 @@ The pipeline was tested and supported by Daniel Steiert.
 
 The pipeline is made for aligning UMI based WGS/WES and Panel Seq data and to compute the QC metrics associated with it.
 
-We require that sequencing is performed in paired-end mode
+We require that sequencing is performed in paired-end mode and must contain R1 (forward read) R2 (UMI) and R3 (reverse read) for each lane and run
+
+Currently the ability to provide support is limited.
 
 # Prerequisites
 
@@ -91,40 +93,15 @@ Please modify the entry for
 
 16. `dict_genome`: An absolute path to dict file for the given genomes, ignored if `SeqType` is `WGS`
 
-17. `group_allowed_edits`: Number of edits allowed when grouping reads by UMI. Defaults to 0; should be set to 0 if `correct_umi` is true
-18. `group_min_mapq`: Sets the `--min-map-q` of GroupReadsByUmi. Defaults to 20
-19. `group_strategy`: Sets the `--strategy` param of GroupReadsByUmi. Defaults to `Adjacency`
-
-20. `consensus_min_reads`: 1
-21. `consensus_min_base_qual`: 2
-22. `consensus_min_input_base_mapq`: 10
-23. `consensus_error_rate_pre_umi`: 45
-24. `consensus_error_rate_post_umi`: 30
-
-25. `filter_min_reads`: 3
-26. `filter_min_base_qual`: 2
-27. `filter_max_base_error_rate`: 0.1
-28. `filter_max_read_error_rate`: 0.05
-29. `filter_max_no_call_fraction`: 0.2
-
-30. `read_structure`: 8M143T 8M143T
-    (In fgbio's read-structure notation, `8M143T` marks the first 8 bases of a read as the molecular barcode/UMI (`M`) and the remaining 143 bases as genomic template (`T`); one structure is given per read, so `8M143T 8M143T` describes a pair where both reads carry an 8 bp inline UMI.)
-
-31. `correct_umi`: True
-32. `correct_umi_max_mismatches`: 3
-33. `correct_umi_min_distance`: 1
-34. `umi_file`:
-
 ## Metadata file
 
-Please create a metadata file with the following columns; other columns can exist but are not required
+Please create a metadata file with columns
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column containing information with regards to R1, R2 and R3 of the sequencing file. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present
-4. SAMPLE_TYPE: Containing the sample name which is entered in the config file. Please note if the metadata file consists of multiple samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
+4. SAMPLE_NAME: Containing the sample name which is entered in the config file. Please note if the metadata file consists of multiple samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
 5. PATIENT_ID: Containing the ```pid``` which is entered in the config file. Please note if the metadata file consists of multiple PIDs only the ```sample``` ```pid``` combination mentioned in the sample and pid directive of the config.yaml will be run.
 7. RUN_ID: Please mention the run id for the sequencing run for the sample
diff --git a/config/config.yaml b/config/config.yaml
old mode 100644
new mode 100755
index 36a31a4..5919356
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,31 +42,3 @@ bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions f
 chrom_sizes: /Path/to/chrom_sizes.tsv # Tab separated file containing the chromosome names and their lengths should match the reference genome
 
 dict_genome: /Path/to/genome.dict # Dictionary file for the reference genome
-
-# Parameters for read grouping
-group_allowed_edits: 0
-group_min_mapq: 20
-group_strategy: Adjacency
-
-# Configuration for consensus calling
-consensus_min_reads: 1
-consensus_min_base_qual: 2
-consensus_min_input_base_mapq: 10
-consensus_error_rate_pre_umi: 45
-consensus_error_rate_post_umi: 30
-
-# Configuration for consensus filter
-filter_min_reads: 3
-filter_min_base_qual: 2
-filter_max_base_error_rate: 0.1
-filter_max_read_error_rate: 0.05
-filter_max_no_call_fraction: 0.2
-
-# Configuration when UMI present in R1 and R2 files
-read_structure: 8M143T 8M143T
-
-# Correct UMIs based on a list of valid UMIs
-correct_umi: True
-correct_umi_max_mismatches: 3
-correct_umi_min_distance: 1
-umi_file: /path/to/umi.txt
diff --git a/config/test_config_exliquid.yaml b/config/test_config_exliquid.yaml
deleted file mode 100644
index ac9ba5e..0000000
--- a/config/test_config_exliquid.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-Adapter_R1: []
-Adapter_R3: []
-SeqType: Panel
-chrom_sizes: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/stats/hg19_chrTotalLength.tsv
-
-dbsnp: /applications/otp/ngs_share_complete/assemblies/hg19_GRCh37_1000genomes/databases/dbSNP/dbSNP_147/00-All.vcf.gz
-dict_genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.dict
-
-genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.fa
-
-library_prep_kit: IDT_xGen_cfDNA_FFPE
-
-log_dir: /dh-projects/exliquid/scratch/results_alignment_test/logs/
-
-metadata: /dh-projects/exliquid/scratch/input/bare_min_meta_data.csv
-
-pid: EXLIQUID_EX59
-sample: EXLIQUID_EX59-BUFFYCOAT_control
-
-target_regions: /dh-projects/exliquid/raw_data/reference/Targets-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
-bait_regions: /dh-projects/exliquid/raw_data/reference/Probes-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
-# trim_adapters: false
-
-work_dir: /dh-projects/exliquid/scratch/results_alignment_test/
-
-# bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions for the analysis only required if SeqType is Panel/WES, if not provided a bait file will be generated from the target file
-
-# Parameters for read grouping
-group_allowed_edits: 0
-group_min_mapq: 20
-group_strategy: Adjacency
-
-# Configuration for consensus calling
-consensus_min_reads: 1
-consensus_min_base_qual: 2
-consensus_min_input_base_mapq: 10
-consensus_error_rate_pre_umi: 45
-consensus_error_rate_post_umi: 30
-
-# Configuration for consensus filter
-filter_min_reads: 3
-filter_min_base_qual: 2
-filter_max_base_error_rate: 0.1
-filter_max_read_error_rate: 0.05
-filter_max_no_call_fraction: 0.2
-
-# Configuration when UMI present in R1 and R2 files
-read_structure: 8M143T 8M143T
-
-# Correct UMIs based on a list of valid UMIs
-correct_umi: True
-correct_umi_max_mismatches: 3
-correct_umi_min_distance: 1
-umi_file: /dh-projects/exliquid/raw_data/reference/umi_list.txt
diff --git a/dag.pdf b/dag.pdf
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/profile/config.yaml b/profile/config.yaml
old mode 100644
new mode 100755
index 692d38b..b38fca9
--- a/profile/config.yaml
+++ b/profile/config.yaml
@@ -3,14 +3,14 @@ jobs: 10
 latency-wait: 60
 reason: True
-# default-resources:
-#     slurm_partition: compute
+
 keep-going: True
 printshellcmds: True
 rerun-incomplete: True
 restart-times: 2
+
 # delete-temp-output: True
-conda-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/conda_envs
+conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs
 # conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs_ohne_singularity
-singularity-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/Singularity
-singularity-args: "-B /dh-projects/ag-ishaque:/dh-projects/ag-ishaque,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy,/dh-projects/exliquid:/dh-projects/exliquid"
+singularity-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/Singularity
+singularity-args: "-B /dh-projects/richter_transformation:/dh-projects/richter_transformation,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy"
diff --git a/workflow/Snakefile b/workflow/Snakefile
old mode 100644
new mode 100755
index 716db10..9a8334e
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,5 +1,4 @@
 import os
-from datetime import datetime
 import pandas as pd
 from pathlib import Path
 from itertools import product
@@ -9,10 +10,10 @@ container: "docker://condaforge/mambaforge"
 
 include: "rules/common.smk" #done
 include: "rules/create_links.smk"
-include: "rules/umi_based_rules.smk" #done
 include: "rules/adapter_trimming.smk" #done
 include: "rules/alignment.smk" #done
+include: "rules/umi_consensus.smk" #done
+include: "rules/duplicate_marking.smk" #done
-# include: "rules/duplicate_marking.smk" #done
 include: "rules/flagstatt.smk" #done
 include: "rules/metric_hsmetrics.smk" #done
 include: "rules/metric_insert_size.smk" #done
@@ -35,11 +36,10 @@ rule all:
             wrkdir / "metrics" / "{sample}.mosdepth.global.dist.txt",
             sample=config["sample"],
         ),
+        # expand(wrkdir / "metrics" / "{sample}.flagstat", sample=config["sample"]),
         expand(
             wrkdir / "metrics" / "{sample}_{ext}.flagstat",
             sample=config["sample"],
-            ext=["dedup.recall.sorted"],
+            ext=["dedup.recall.sorted", "merged_umi_annot"],
         ),
         expand(
             wrkdir / "metrics" / "{sample}_insert_size_metrics.txt",
@@ -64,20 +64,3 @@ rule all:
         expand(wrkdir / "metrics" / "{sample}.hs_metrics.txt", sample=config["sample"])
         if config["SeqType"] in ["Panel", "WES"]
         else [],
-        expand(
-            wrkdir
-            / "metrics"
-            / "correct_umi"
-            / "{run_id}"
-            / "{sample}_{lane}_umi_metrics.txt",
-            filtered_product,
-            run_id=RUN_ID,
-            sample=config["sample"],
-            lane=LANE,
-        )
-        if correct_umi
-        else [],
-        expand(
-            wrkdir / "metrics" / "{sample}_consensus_metrics.tsv",
-            sample=config["sample"],
-        ),
diff --git a/workflow/envs/consensus.yaml b/workflow/envs/consensus.yaml
deleted file mode 100644
index ff898d8..0000000
--- a/workflow/envs/consensus.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-name: consensus
-channels:
-  - conda-forge
-dependencies:
-  - pandas=2.2.2
-  - python=3.11
diff --git a/workflow/envs/coveragePlot.yaml b/workflow/envs/coveragePlot.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/cutadapt.yaml b/workflow/envs/cutadapt.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/fgbio.yaml b/workflow/envs/fgbio.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/gatk.yaml b/workflow/envs/gatk.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/mosdepth.yaml b/workflow/envs/mosdepth.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/sambamba.yaml b/workflow/envs/sambamba.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/samtools.yaml b/workflow/envs/samtools.yaml
old mode 100644
new mode 100755
diff --git a/workflow/rules/adapter_trimming.smk b/workflow/rules/adapter_trimming.smk
index 3026e62..0778088 100755
--- a/workflow/rules/adapter_trimming.smk
+++ b/workflow/rules/adapter_trimming.smk
@@ -1,78 +1,103 @@
 # disabling adapter trimming when set to false,
 # To maintain the same input and output files we create links instead of running cutadapt
+if config["trim_adapters"]:
+
-rule create_adapter_fastq:
-    params:
-        adapt_1=adapter_seq_r1,
-        adapt_3=adapter_seq_r3,
-    output:
-        adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
-        adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
-    threads: 1
-    resources:
-        mem_mb=1000,
-        runtime=20,
-        nodes=1,
-    message:
-        "Creating adapter fastq files"
-    run:
-        with open(output.adapt_1, "w") as handle:
-            count = 1
-            for i in params.adapt_1:
-                handle.write(">adapter_" + str(count) + "\n")
-                handle.write(i + "\n")
-                count += 1
+    rule create_adapter_fastq:
+        params:
+            adapt_1=adapter_seq_r1,
+            adapt_3=adapter_seq_r3,
+        output:
+            adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
+            adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
+        threads: 1
+        resources:
+            mem_mb=1000,
+            runtime=20,
+            nodes=1,
+        message:
+            "Creating adapter fastq files"
+        run:
+            with open(output.adapt_1, "w") as handle:
+                count = 1
+                for i in params.adapt_1:
+                    handle.write(">adapter_" + str(count) + "\n")
+                    handle.write(i + "\n")
+                    count += 1
 
-        with open(output.adapt_3, "w") as handle:
-            count = 1
-            for i in params.adapt_3:
-                handle.write(">adapter_" + str(count) + "\n")
-                handle.write(i + "\n")
-                count += 1
+            with open(output.adapt_3, "w") as handle:
+                count = 1
+                for i in params.adapt_3:
+                    handle.write(">adapter_" + str(count) + "\n")
+                    handle.write(i + "\n")
+                    count += 1
 
+    rule cutadapt:
+        input:
+            adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq",
+            adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq",
+            fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
+            fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
+        output:
+            fastq_r1=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "cutadapt"
+                / "{sample}_R1_{lane}_trim.fastq.gz"
+            ),
+            fastq_r3=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "cutadapt"
+                / "{sample}_R3_{lane}_trim.fastq.gz"
+            ),
+        log:
+            logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
+        threads: 8
+        resources:
+            mem_mb=8000,
+            runtime=4 * 60,
+            nodes=1,
+        conda:
+            "../envs/cutadapt.yaml"
+        message:
+            "Trimming adapters using cutadapt"
+        shell:
+            "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}"
-rule cutadapt:
-    input:
-        adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq",
-        adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq",
-        fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
-        fastq_r3=(
-            wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz"
-            if read_structure
-            else wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz"
-        ),
-    output:
-        fastq_r1=temp(
-            wrkdir
-            / "fastq"
-            / "{run_id}"
-            / "cutadapt"
-            / "{sample}_R1_{lane}_trim.fastq.gz"
-        ),
-        fastq_r3=temp(
-            wrkdir
-            / "fastq"
-            / "{run_id}"
-            / "cutadapt"
-            / "{sample}_R2_{lane}_trim.fastq.gz"
-            if read_structure
-            else wrkdir
-            / "fastq"
-            / "{run_id}"
-            / "cutadapt"
-            / "{sample}_R3_{lane}_trim.fastq.gz"
-        ),
-    log:
-        logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
-    threads: 8
-    resources:
-        mem_mb=8000,
-        runtime=72 * 60,
-        nodes=1,
-    conda:
-        "../envs/cutadapt.yaml"
-    message:
-        "Trimming adapters using cutadapt"
-    shell:
-        "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}"
+else:
+
+    rule cutadapt:
+        input:
+            fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz",
+            fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
+        output:
+            fastq_r1=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "cutadapt"
+                / "{sample}_R1_{lane}_trim.fastq.gz"
+            ),
+            fastq_r3=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "cutadapt"
+                / "{sample}_R3_{lane}_trim.fastq.gz"
+            ),
+        log:
+            logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log",
+        threads: 1
+        resources:
+            mem_mb=1000,
+            runtime=60,
+            nodes=1,
+        message:
+            "Skipping adapter trimming, creating links instead of running cutadapt"
+        conda:
+            "../envs/cutadapt.yaml"
+        shell:
+            "(ln -s {input.fastq_r1} {output.fastq_r1} && ln -s {input.fastq_r3} {output.fastq_r3}) &> {log}"
diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk
index 3b7a9f9..86d1746 100755
--- a/workflow/rules/alignment.smk
+++ b/workflow/rules/alignment.smk
@@ -6,51 +6,24 @@ rule fastqbam:
     """
     input:
         genome=genome,
-        fastq_r1=(
-            (
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "cutadapt"
-                / "{sample}_R1_{lane}_trim.fastq.gz"
-            )
-            if config["trim_adapters"]
-            else (wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz")
-        ),
-        fastq_r3=(
-            (
-                (
-                    wrkdir
-                    / "fastq"
-                    / "{run_id}"
-                    / "cutadapt"
-                    / "{sample}_R2_{lane}_trim.fastq.gz"
-                )
-                if config["trim_adapters"]
-                else (wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz")
-            )
-            if read_structure
-            else (
-                (
-                    wrkdir
-                    / "fastq"
-                    / "{run_id}"
-                    / "cutadapt"
-                    / "{sample}_R3_{lane}_trim.fastq.gz"
-                )
-                if config["trim_adapters"]
-                else (wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz")
-            )
-        ),
+        fastq_r1=wrkdir
+        / "fastq"
+        / "{run_id}"
+        / "cutadapt"
+        / "{sample}_R1_{lane}_trim.fastq.gz",
+        fastq_r3=wrkdir
+        / "fastq"
+        / "{run_id}"
+        / "cutadapt"
+        / "{sample}_R3_{lane}_trim.fastq.gz",
     output:
         temp(wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"),
     params:
         library=library_prep_kit,
-        read_structure="--read-structures " + read_structure if read_structure else "",
-    threads: 1
+    threads: 8
     resources:
         mem_mb=8000,
-        runtime=72 * 60,
+        runtime=24 * 60,
         nodes=1,
     conda:
         "../envs/fgbio.yaml"
@@ -64,83 +37,10 @@ rule fastqbam:
         "--input {input.fastq_r1} {input.fastq_r3} "
         "--sample {wildcards.sample} "
         "--library {params.library} "
-        "--output {output} {params.read_structure} "
+        "--output {output} "
         ") &> {log}"
 
 
-if correct_umi:
-
-    rule CorrectUMI:
-        input:
-            bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam",
-            umi_file=umi_file,
-        output:
-            bam=temp(
-                wrkdir
-                / "fastq"
-                / "{run_id}"
-                / "{sample}_{lane}_unmapped_umi_corrected.bam"
-            ),
-            metrics=wrkdir
-            / "metrics"
-            / "correct_umi"
-            / "{run_id}"
-            / "{sample}_{lane}_umi_metrics.txt",
-        params:
-
max_mismatches=correct_umi_max_mismatches, - min_distance=correct_umi_min_distance, - threads: 1 - resources: - mem_mb=2000, - runtime=72 * 60, - nodes=1, - conda: - "../envs/fgbio.yaml" - log: - logdir / "fgbio" / "correct_umi_{run_id}_{sample}_{lane}.log", - message: - "Correcting UMIs." - shell: - "(" - "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io CorrectUmis " - "--input {input.bam} " - "--output {output.bam} " - "--max-mismatches 3 " - "--min-distance 2 " - "--umi-files {input.umi_file} " - "--metrics {output.metrics} " - "--dont-store-original-umis ) &> {log}" - - -if not read_structure: - print("Hi") - - rule AnnotateUMI: - input: - alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", - fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz", - output: - temp( - wrkdir - / "alignments" - / "{run_id}" - / "{sample}_aln_{lane}_umi_annot.bam" - ), - threads: 1 - resources: - mem_mb=8000, - runtime=72 * 60, - nodes=1, - conda: - "../envs/fgbio.yaml" - log: - logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log", - message: - "Annotating BAM with UMIs from fastq file." - shell: - "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}" - - rule bwa_map: """ First pass alignemnt @@ -148,20 +48,13 @@ rule bwa_map: """ input: genome=genome, - bam=( - wrkdir - / "fastq" - / "{run_id}" - / "{sample}_{lane}_unmapped_umi_corrected.bam" - if correct_umi - else wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam" - ), + bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam", output: temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam"), threads: 8 resources: mem_mb=12000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -186,14 +79,6 @@ rule merge: """ input: expand( - wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", - filtered_product, - run_id=RUN_ID, - sample=config["sample"], - lane=LANE, - ) - if read_structure - else expand( wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam", filtered_product, run_id=RUN_ID, @@ -201,11 +86,11 @@ rule merge: lane=LANE, ), output: - bam=temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"), + temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"), threads: 8 resources: mem_mb=8000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/samtools.yaml" @@ -214,7 +99,7 @@ rule merge: log: logdir / "samtools/{sample}_merge.log", shell: - "(samtools merge -f {output.bam} {input}) &> {log} " + "samtools merge -f {output} {input} &> {log}" rule realign: @@ -229,11 +114,10 @@ rule realign: bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"), threads: 8 resources: - mem_mb=50000, - runtime=72 * 60, + mem_mb=16000, + runtime=24 * 60, nodes=1, mem_fgbio=8000, - mem_samtools=8000, conda: "../envs/fgbio.yaml" log: @@ -249,5 +133,5 @@ rule realign: "--ref {input.ref} " "--tags-to-reverse Consensus " "--tags-to-revcomp Consensus " - "| samtools sort --threads {threads} -m{resources.mem_samtools}m -o {output.bam}##idx##{output.bai} --write-index " + "| samtools sort --threads {threads} -o {output.bam}##idx##{output.bai} --write-index " ") &> {log} " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk old mode 100644 new mode 100755 index cddc73b..ef7d139 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,50 +1,54 @@ -print( - """ - \tAlignment pipeline for UMI based sequencing reads - 
\tAuthor: Shashwat Sahay - \tEmail: shashwat.sahay@charite.de - \tVersion: 0.2.0 - - """ -) - +if "library_prep_kit" in config: + library_prep_kit = config["library_prep_kit"] + if library_prep_kit == "": + library_prep_kit = "Unknown" +else: + library_prep_kit = "Unknown" -def get_data_time(): - now = datetime.now() +if "sample" not in config: + raise ValueError("Sample name not provided") - dt_string = now.strftime("%d/%m/%Y %H:%M:%S") - return "========= " + dt_string + " ========= " +if "pid" not in config: + raise ValueError("Patient ID not provided") +if "genome" not in config: + raise ValueError("Genome not provided") +else: + genome = config["genome"] -print(get_data_time(), "Starting run") +if "work_dir" not in config: + raise ValueError("Work directory not provided") +else: + wrkdir = Path(config["work_dir"]) -####################################################################### -################# Sanity check on the input data ###################### -####################################################################### +if "log_dir" not in config: + logdir = wrkdir / "logs" +else: + logdir = Path(config["log_dir"]) -if "sample" not in config: - raise ValueError("Sample name not provided") +if "trim_adapters" not in config: + config["trim_adapters"] = False -if "pid" not in config: - raise ValueError("Patient ID not provided") +if config["trim_adapters"]: + if "Adapter_R1" not in config: + raise ValueError("Adapter sequence for R1 not provided") + else: + adapter_seq_r1 = config["Adapter_R1"] + if "Adapter_R3" not in config: + raise ValueError("Adapter sequence for R3 not provided") + else: + adapter_seq_r3 = config["Adapter_R3"] if "metadata" not in config: raise ValueError("Metadata file not provided") else: metadata = pd.read_csv(config["metadata"]) metadata = metadata[ - (metadata["SAMPLE_TYPE"] == config["sample"]) + (metadata["SAMPLE_NAME"] == config["sample"]) & (metadata["PATIENT_ID"] == config["pid"]) ] if metadata.shape[0] == 0: - raise ValueError( - "No metadata found for the given sample: %s and patient ID: %s" - % (config["sample"], config["pid"]) - ) - -########################################################## -###### Setting Sequencing type and related parameters #### -########################################################## + raise ValueError("No metadata found for the given sample and patient ID") if "SeqType" not in config: raise ValueError("Sequencing type not provided") @@ -68,233 +72,81 @@ else: dict_genome = config["dict_genome"] if "bait_regions" not in config: - print( - get_data_time(), - "Bait regions not provided will be calculated from target regions", - ) + print("Bait regions not provided will be calculated from target regions") else: bait_regions = config["bait_regions"] - -########################################################## -################ Setting Library kit ##################### -########################################################## - -if "library_prep_kit" in config: - library_prep_kit = config["library_prep_kit"] - if library_prep_kit == "": - library_prep_kit = "Unknown" -else: - library_prep_kit = "Unknown" - - -########################################################## -################## Setting Genome ######################## -########################################################## - -if "genome" not in config: - raise ValueError("Genome not provided") -else: - genome = config["genome"] - if "dbsnp" not in config: raise ValueError("dbSNP file not provided") else: dbsnp = config["dbsnp"] - 
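As a quick aside on the validation logic reworked in this hunk: the sample selection is a plain pandas filter over the metadata sheet. A minimal standalone sketch of the behaviour, assuming the column names described in the README (the CSV path and the sample/pid values below are hypothetical):

```python
# Standalone sketch of the metadata selection done in common.smk; the
# path and sample/pid values here are hypothetical placeholders.
import pandas as pd

config = {
    "metadata": "bare_min_meta_data.csv",
    "sample": "SAMPLE_A-BUFFYCOAT_control",
    "pid": "PATIENT_01",
}

metadata = pd.read_csv(config["metadata"])
# Keep only rows belonging to the requested sample/pid combination.
metadata = metadata[
    (metadata["SAMPLE_TYPE"] == config["sample"])
    & (metadata["PATIENT_ID"] == config["pid"])
]
if metadata.shape[0] == 0:
    raise ValueError(
        "No metadata found for the given sample: %s and patient ID: %s"
        % (config["sample"], config["pid"])
    )

# The run/lane wildcards used by expand() are derived from the surviving rows.
LANE = metadata["LANE_NO"].unique().tolist()
RUN_ID = metadata["RUN_ID"].unique().tolist()
```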
-########################################################## -############# Setting working directory ################## -########################################################## - -if "work_dir" not in config: - raise ValueError("Work directory not provided") -else: - wrkdir = Path(config["work_dir"]) - -if "log_dir" not in config: - logdir = wrkdir / "logs" +if "SORTING_COLLECTION_SIZE_RATIO" not in config: + print("Setting default value for SORTING_COLLECTION_SIZE_RATIO to 0.01") + SORTING_COLLECTION_SIZE_RATIO = 0.01 else: - logdir = Path(config["log_dir"]) - - -######################################################################### -############# Check if UMIs have been demultiplexed ##################### -######################################################################### - -if "read_structure" in config: - print( - get_data_time(), - "Read structure has been provided expecting only R1 and R2 files for each sample and lane", - ) - read_structure = config["read_structure"] - print(read_structure) - if read_structure == "": - raise ValueError("Read structure defined but not provided") -else: - read_structure = False - + SORTING_COLLECTION_SIZE_RATIO = config["SORTING_COLLECTION_SIZE_RATIO"] -######################################################################### -############# Setting variables related to trimming ##################### -######################################################################### - -if "trim_adapters" not in config or (not config["trim_adapters"]): - print(get_data_time(), "Adapter trimming has been turned off") - config["trim_adapters"] = False - adapter_seq_r1 = None - adapter_seq_r3 = None +if "allowed_edits" not in config: + print("Setting default value for allowed_edits to 1") + allowed_edits = 1 else: - if config["trim_adapters"]: - if "Adapter_R1" not in config: - raise ValueError("Adapter sequence for R1 not provided") - else: - adapter_seq_r1 = config["Adapter_R1"] - if "Adapter_R3" not in config: - raise ValueError("Adapter sequence for R3 not provided") - else: - adapter_seq_r3 = config["Adapter_R3"] + allowed_edits = config["allowed_edits"] -########################################################################## -############# Setting variables related to UMI correction ################ -########################################################################## - -if "correct_umi" in config: - correct_umi = config["correct_umi"] +if "strategy" not in config: + print("Setting default value for strategy to Adjacency") + strategy = "Adjacency" else: - correct_umi = False + strategy = config["strategy"] -if correct_umi: - print(get_data_time(), "UMI Correction is enabled") - if "umi_file" not in config: - raise ValueError("UMI file not provided") - else: - umi_file = config["umi_file"] - if "correct_umi_max_mismatches" in config: - correct_umi_max_mismatches = config["correct_umi_max_mismatches"] - else: - print( - get_data_time(), "Setting default value for correct_umi_max_mismatches to 3" - ) - correct_umi_max_mismatches = 3 - if "correct_umi_min_distance" in config: - correct_umi_min_distance = config["correct_umi_min_distance"] +if "consensus_min_reads" not in config: + if seq_type == "WGS": + print("Setting default value for consensus_min_reads to 3 as SeqType is WGS") + consensus_min_reads = 3 else: print( - get_data_time(), "Setting default value for correct_umi_min_distance to 1" + "Setting default value for consensus_min_reads to 1 as SeqType is not WGS" ) - correct_umi_min_distance = 1 -else: - print(get_data_time(), "UMI 
Correction is disabled") - - -########################################################################## -########### Initialising parameters for grouping by fgbio ################ -########################################################################## - -if "group_allowed_edits" not in config: - print(get_data_time(), "Setting default value for allowed_edits to 1") - group_allowed_edits = 1 -else: - group_allowed_edits = config["group_allowed_edits"] - -if "group_strategy" not in config: - print(get_data_time(), "Setting default value for strategy to Adjacency") - group_strategy = "Adjacency" -else: - group_strategy = config["group_strategy"] - -if "group_min_mapq" not in config: - print(get_data_time(), "Setting default value for group min mapq to 20") - group_min_mapq = 20 -else: - group_min_mapq = config["group_min_mapq"] - - -########################################################################## -############# Intialising parameters for consensus calling ############### -########################################################################## - -if "consensus_min_reads" not in config: - print(get_data_time(), "Setting default value for consensus_min_reads to 1") - consensus_min_reads = 1 + consensus_min_reads = 1 else: consensus_min_reads = config["consensus_min_reads"] if "consensus_min_base_qual" not in config: - print(get_data_time(), "Setting default value for consensus_min_base_qual to 2") - consensus_min_base_qual = 2 + print("Setting default value for consensus_min_base_qual to 20") + consensus_min_base_qual = 20 else: consensus_min_base_qual = config["consensus_min_base_qual"] - -if "consensus_min_input_base_mapq" not in config: - print( - get_data_time(), "Setting default value for consensus_min_input_base_mapq to 10" - ) - consensus_min_input_base_mapq = 10 -else: - consensus_min_input_base_mapq = config["consensus_min_input_base_mapq"] - -if "consensus_error_rate_pre_umi" not in config: - print(get_data_time(), "Setting default value for consensus_error_rate to 45") - consensus_error_rate_pre_umi = 45 -else: - consensus_error_rate_pre_umi = config["consensus_error_rate_pre_umi"] - -if "consensus_error_rate_post_umi" not in config: - print(get_data_time(), "Setting default value for consensus_error_rate to 30") - consensus_error_rate_post_umi = 30 -else: - consensus_error_rate_post_umi = config["consensus_error_rate_post_umi"] - -############################################################################ -######## Initialising parameters for consensus filtering ################### -############################################################################ - if "filter_min_reads" not in config: - print(get_data_time(), "Setting default value for filter_min_reads") - filter_min_reads = 1 + if seq_type == "WGS": + print("Setting default value for filter_min_reads to 3 as SeqType is WGS") + filter_min_reads = 3 + else: + print("Setting default value for filter_min_reads to 1 as SeqType is not WGS") + filter_min_reads = 1 else: filter_min_reads = config["filter_min_reads"] if "filter_min_base_qual" not in config: - print(get_data_time(), "Setting default value for filter_min_base_qual to 40") + print("Setting default value for filter_min_base_qual to 40") filter_min_base_qual = 40 else: filter_min_base_qual = config["filter_min_base_qual"] -if "filter_max_base_error_rate" not in config: - print( - get_data_time(), "Setting default value for filter_max_base_error_rate to 0.2" - ) - filter_max_base_error_rate = 0.2 +if "filter_max_error_rate" not in config: + 
print("Setting default value for filter_max_error_rate to 0.2") + filter_max_error_rate = 0.2 else: - filter_max_base_error_rate = config["filter_max_base_error_rate"] + filter_max_error_rate = config["filter_max_error_rate"] -if "filter_max_read_error_rate" not in config: - print( - get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" - ) - filter_max_read_error_rate = 0.1 -else: - filter_max_read_error_rate = config["filter_max_read_error_rate"] - -if "filter_max_no_call_fraction" not in config: - print( - get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" - ) - filter_max_no_call_fraction = 0.1 -else: - filter_max_no_call_fraction = config["filter_max_no_call_fraction"] - - -# Create a set of valid combination for run lane and read ids LANE = metadata["LANE_NO"].unique().tolist() RUN_ID = metadata["RUN_ID"].unique().tolist() +# Create a function which checks if the input configuration is valid + + def filter_combinator(combinator, allow_list): def filtered_combinator(*args, **kwargs): for wc_comb in combinator(*args, **kwargs): diff --git a/workflow/rules/create_links.smk b/workflow/rules/create_links.smk old mode 100644 new mode 100755 index 277a8a2..73773f2 --- a/workflow/rules/create_links.smk +++ b/workflow/rules/create_links.smk @@ -2,7 +2,6 @@ rule create_links_files: params: metadata=config["metadata"], fastq_dir=wrkdir / "fastq", - read_structure=read_structure, resources: mem_mb=1000, runtime=20, @@ -22,33 +21,21 @@ rule create_links_files: sample=config["sample"], lane=LANE, ), - fastq_r3=( - expand( - wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", - filtered_product, - run_id=RUN_ID, - sample=config["sample"], - lane=LANE, - ) - if not read_structure - else [] + fastq_r3=expand( + wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", + filtered_product, + run_id=RUN_ID, + sample=config["sample"], + lane=LANE, ), message: "Creating links to fastq files" run: metadata = pd.read_csv(params.metadata) metadata = metadata[ - (metadata["SAMPLE_TYPE"] == config["sample"]) + (metadata["SAMPLE_NAME"] == config["sample"]) & (metadata["PATIENT_ID"] == config["pid"]) ] - if params.read_structure: - if metadata["READ"].nunique() != 2: - raise ValueError("Read structure is provided but R1 R2 and R3 provided") - else: - if metadata["READ"].nunique() != 3: - raise ValueError( - "Read structure is not provided but R1 R2 and R3 not provided" - ) for index, row in metadata.iterrows(): fastq_file = Path(row["FASTQ_FILE"]) @@ -59,7 +46,7 @@ rule create_links_files: params.fastq_dir / row["RUN_ID"] / ( - row["SAMPLE_TYPE"] + row["SAMPLE_NAME"] + "_" + row["READ"] + "_" diff --git a/workflow/rules/duplicate_marking.smk b/workflow/rules/duplicate_marking.smk index 3fe1bd5..ff10227 100755 --- a/workflow/rules/duplicate_marking.smk +++ b/workflow/rules/duplicate_marking.smk @@ -1,12 +1,10 @@ -# Deprecated - rule duplicates: input: - wrkdir / "alignments" / "{sample}_temp.bam", + wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam", output: - bam=temp(wrkdir / "alignments" / "{sample}_dedup.bam"), - # bai=temp(wrkdir / "alignments" / "{sample}_temp.bam.bai"), + bam=wrkdir / "alignments" / "{sample}_dedup.bam", + bai=wrkdir / "alignments" / "{sample}_dedup.bam.bai", metric=wrkdir / "metrics" / "{sample}_marked_dup_metrics.txt", params: SORTING_COLLECTION_SIZE_RATIO=SORTING_COLLECTION_SIZE_RATIO, @@ -17,13 +15,14 @@ rule duplicates: logdir / "gatk/{sample}_dedup.log", resources: mem_mb=8000, - runtime=72 * 60, + 
runtime=24 * 60, nodes=1, message: - "Marking duplicates on pre-consensus reads.: Decrapated slated for removal." + "Marking duplicates on consensus reads." shell: " ( " " gatk --java-options '-Xmx{resources.mem_mb}m' MarkDuplicates -I {input} " - " -O {output.bam} -M {output.metric} --BARCODE_TAG RX " - " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} " + " -O {output.bam} -M {output.metric} " + " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} && " + " samtools index -b {output.bam} {output.bai} " " ) &> {log} " diff --git a/workflow/rules/flagstatt.smk b/workflow/rules/flagstatt.smk index dc55326..da699de 100755 --- a/workflow/rules/flagstatt.smk +++ b/workflow/rules/flagstatt.smk @@ -1,3 +1,23 @@ +# rule flagstatt: +# input: +# bam=wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam", +# output: +# wrkdir / "metrics" / "{sample}.flagstat", +# conda: +# "../envs/sambamba.yaml" +# threads: 1 +# resources: +# mem_mb=8000, +# runtime=24 * 60, +# nodes=1, +# log: +# logdir / "sambamba/{sample}.log", +# message: +# "Running Flagstat" +# shell: +# "(sambamba flagstat {input.bam} > {output}) &> {log}" + + rule flagstatt: input: bam=wrkdir / "alignments" / "{sample}_{ext}.bam", diff --git a/workflow/rules/metric_hsmetrics.smk b/workflow/rules/metric_hsmetrics.smk index 29c480c..687458a 100755 --- a/workflow/rules/metric_hsmetrics.smk +++ b/workflow/rules/metric_hsmetrics.smk @@ -41,7 +41,7 @@ if seq_type in ["Panel", "WES"]: threads: 1 resources: mem_mb=8000, - runtime=4 * 60, + runtime=24 * 60, nodes=1, log: logdir / "bedtools" / "{sample}_slop.log", diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk index 3ccf3b6..f15ef7a 100755 --- a/workflow/rules/recalibration.smk +++ b/workflow/rules/recalibration.smk @@ -1,6 +1,6 @@ rule baseRecalibrator: input: - bam=wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam", + bam=wrkdir / "alignments" / "{sample}_dedup.bam", dbsnp=dbsnp, genome=genome, output: @@ -12,7 +12,7 @@ rule baseRecalibrator: threads: 8 resources: mem_mb=8000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, log: logdir / "gatk/{sample}_recal.log", diff --git a/workflow/rules/umi_based_rules.smk b/workflow/rules/umi_consensus.smk similarity index 66% rename from workflow/rules/umi_based_rules.smk rename to workflow/rules/umi_consensus.smk index 444914d..f190eb2 100755 --- a/workflow/rules/umi_based_rules.smk +++ b/workflow/rules/umi_consensus.smk @@ -1,4 +1,22 @@ - +rule fgbio: + input: + alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", + fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz", + output: + temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam"), + threads: 1 + resources: + mem_mb=8000, + runtime=4 * 60, + nodes=1, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log", + message: + "Annotating BAM with UMIs from fastq file." + shell: + "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}" rule fix_mate: @@ -8,17 +26,13 @@ rule fix_mate: This can cause problems in downstream analysis. 
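For readers unfamiliar with fgbio's AnnotateBamWithUmis, used in the rule above: it matches each alignment to its UMI read by query name and stores the UMI bases in the RX tag (with their qualities in UQ). A toy pysam sketch of the effect, not the pipeline's implementation (file names are hypothetical, and unlike fgbio this version naively holds all UMIs in memory):

```python
# Toy illustration of UMI annotation: copy each read's UMI (taken from the
# R2 fastq) into the RX tag of the matching alignment, keyed by read name.
import pysam

umis = {}
with pysam.FastxFile("sample_R2_L001.fastq.gz") as r2:
    for rec in r2:
        umis[rec.name] = rec.sequence

with pysam.AlignmentFile("sample_aln_L001.bam", "rb") as bam_in, pysam.AlignmentFile(
    "sample_aln_L001_umi_annot.bam", "wb", template=bam_in
) as bam_out:
    for read in bam_in:
        umi = umis.get(read.query_name)
        if umi is not None:
            read.set_tag("RX", umi, value_type="Z")  # UMI bases as a string tag
        bam_out.write(read)
```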
""" input: - bam=( - wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" - if not read_structure - else wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" - ), + bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", output: bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"), threads: 2 resources: mem_mb=8000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -43,13 +57,12 @@ rule group_reads: ), stats=wrkdir / "metrics" / "{sample}.grouped-family-sizes.txt", params: - strategy=group_strategy, - allowed_edits=group_allowed_edits, - min_mapq=group_min_mapq, + strategy=strategy, + allowed_edits=allowed_edits, threads: 2 resources: mem_mb=8000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -62,7 +75,6 @@ rule group_reads: "--input {input.bam} " "--strategy {params.strategy} " "--edits {params.allowed_edits} " - "--min-map-q {params.min_mapq} " "--output {output.bam} " "--family-size-histogram {output.stats} " "&> {log} " @@ -73,17 +85,13 @@ rule call_consensus_reads: bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam", output: bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.bam"), - metrics=logdir / "fgbio/call_consensus_reads.{sample}.log", params: min_reads=consensus_min_reads, min_base_qual=consensus_min_base_qual, - min_input_base_mapq=consensus_min_input_base_mapq, - error_rate_post_umi=consensus_error_rate_post_umi, - error_rate_pre_umi=consensus_error_rate_pre_umi, threads: 4 resources: mem_mb=4000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -96,10 +104,7 @@ rule call_consensus_reads: "--input {input.bam} " "--output {output.bam} " "--min-reads {params.min_reads} " - "--error-rate-pre-umi {params.error_rate_pre_umi} " - "--error-rate-post-umi {params.error_rate_post_umi} " - "--min-consensus-base-quality {params.min_base_qual} " - "--min-input-base-quality {params.min_input_base_mapq} " + "--min-input-base-quality {params.min_base_qual} " "--threads {threads} " "&> {log}" @@ -113,13 +118,11 @@ rule filter_consensus_reads: params: min_reads=filter_min_reads, min_base_qual=filter_min_base_qual, - max_base_error_rate=filter_max_base_error_rate, - max_read_error_rate=filter_max_read_error_rate, - max_no_call_fraction=filter_max_no_call_fraction, + max_error_rate=filter_max_error_rate, threads: 8 resources: mem_mb=8000, - runtime=72 * 60, + runtime=24 * 60, nodes=1, conda: "../envs/fgbio.yaml" @@ -135,29 +138,6 @@ rule filter_consensus_reads: "--output {output.bam} " "--ref {input.ref} " "--min-reads {params.min_reads} " - "--max-read-error-rate {params.max_read_error_rate} " - "--max-base-error-rate {params.max_base_error_rate} " "--min-base-quality {params.min_base_qual} " - "--max-no-call-fraction {params.max_no_call_fraction} " - + "--max-base-error-rate {params.max_error_rate} " ") &> {log} " - - -rule gatherConsensusMetrics: - input: - logdir / "fgbio/call_consensus_reads.{sample}.log", - output: - wrkdir / "metrics" / "{sample}_consensus_metrics.tsv", - params: - sample=str(config["sample"]), - threads: 1 - resources: - mem_mb=1000, - runtime=60, - nodes=1, - log: - logdir / "metrics" / "{sample}_consensus_metrics.log", - conda: - "../envs/consensus.yaml" - script: - "../scripts/getConsensusMetrics.py" diff --git a/workflow/scripts/getConsensusMetrics.py b/workflow/scripts/getConsensusMetrics.py deleted file mode 100644 index 3706538..0000000 --- a/workflow/scripts/getConsensusMetrics.py +++ /dev/null @@ -1,24 +0,0 @@ -import os 
-import pandas as pd
-
-
-
-column_set1=['Raw Reads Filtered Due to Insufficient Support', 'Raw Reads Filtered Due to Mismatching Cigars', 'Raw Reads Filtered Due to Low Base Quality',
- 'Raw Reads Filtered Due to Orphan Consensus Created','Consensus reads emitted', 'Total Raw Reads Considered']
-column_set2=['overlapping templates', 'corrected templates', 'overlapping bases', 'corrected bases' ]
-consensus_df=pd.DataFrame(columns=column_set1+column_set2, index=[snakemake.params['sample']])
-# print(consensus_df)
-with open(snakemake.input[0]) as handle:
-    for line in handle:
-        for i in consensus_df.columns:
-            if i in line:
-                # print(line)
-                if i in column_set1:
-                    line=line.split(':')[-1].split('(')[0].replace('.', '').replace(',','').replace(' ', '').strip()
-                else:
-                    line=line.split(']')[-1].split('(')[0].replace('.', '').replace(i, '').replace(',','').replace(' ', '').strip()
-                line=int(line)
-                consensus_df.loc['T-NHL_98_malign', i]=line
-                break
-
-consensus_df.to_csv(snakemake.output[0], sep='\t', index=True)
diff --git a/workflow/scripts/getCoveragePlot_snakemake.R b/workflow/scripts/getCoveragePlot_snakemake.R
old mode 100644
new mode 100755

From 801803b12ed68d64653c2ed8f1dde37a6d53ec50 Mon Sep 17 00:00:00 2001
From: sahays
Date: Wed, 19 Jun 2024 23:25:15 +0200
Subject: [PATCH 04/11] Added support for setting tmp dir as scratch_dir

Fixes and closes #28 and #24
---
 README.md                                     |  33 +-
 config/config.yaml                            |  28 ++
 config/test_config_exliquid.yaml              |  53 ++++
 profile/config.yaml                           |  11 +-
 workflow/Snakefile                            |  30 +-
 workflow/envs/consensus.yaml                  |   6 +
 workflow/rules/adapter_trimming.smk           | 169 +++++-----
 workflow/rules/alignment.smk                  | 172 ++++++++--
 workflow/rules/common.smk                     | 299 ++++++++++++++----
 workflow/rules/create_links.smk               |  29 +-
 workflow/rules/duplicate_marking.smk          |  17 +-
 workflow/rules/flagstatt.smk                  |  21 +-
 workflow/rules/metric_hsmetrics.smk           |   4 +-
 workflow/rules/metric_insert_size.smk         |   1 +
 workflow/rules/metric_mosdepth.smk            |   2 +
 workflow/rules/metric_wgs_coverage.smk        |   1 +
 workflow/rules/recalibration.smk              |   7 +-
 ...{umi_consensus.smk => umi_based_rules.smk} |  91 ++++--
 workflow/scripts/getConsensusMetrics.py       |  24 ++
 19 files changed, 721 insertions(+), 277 deletions(-)
 create mode 100755 config/test_config_exliquid.yaml
 create mode 100755 workflow/envs/consensus.yaml
 rename workflow/rules/{umi_consensus.smk => umi_based_rules.smk} (57%)
 create mode 100755 workflow/scripts/getConsensusMetrics.py

diff --git a/README.md b/README.md
index 699aa83..38fe433 100755
--- a/README.md
+++ b/README.md
@@ -9,9 +9,7 @@ The pipeline was tested and supported by Daniel Steiert.
 
 The pipeline is made for aligning UMI based WGS/WES and Panel Seq data and to compute the QC metrics associated with it.
 
-We require the sequencing is performed in paired end mode and must contain R1 (forward read) R2 (UMI) and R3 (reverse read) for each lane and run
-
-Currently the ability to provide support is limited.
+We require that the sequencing is performed in paired-end mode
 
 # Prerequisites
 
@@ -93,15 +91,40 @@ Please modify the entry for
 
 16. `dict_genome`: An absolute path to dict file for the given genomes, ignored if `SeqType` is `WGS`
 
+17. `group_allowed_edits`: Number of edits allowed when grouping reads by UMI. Defaults to 0; should be set to zero if `correct_umi` is true
+18. `group_min_mapq`: Sets `--min-map-q` of GroupReadsByUmi, defaults to 20
+19. `group_strategy`: Sets the `--strategy` param of GroupReadsByUmi, defaults to `Adjacency`
+
+
+20. `consensus_min_reads`: 1
+21. `consensus_min_base_qual`: 2
+22. `consensus_min_input_base_mapq`: 10
+23. `consensus_error_rate_pre_umi`: 45
+24. `consensus_error_rate_post_umi`: 30
+
+
+25. `filter_min_reads`: 3
+26. `filter_min_base_qual`: 2
+27. `filter_max_base_error_rate`: 0.1
+28. `filter_max_read_error_rate`: 0.05
+29. `filter_max_no_call_fraction`: 0.2
+
+
+30. `read_structure`: 8M143T 8M143T
+
+31. `correct_umi`: True
+32. `correct_umi_max_mismatches`: 3
+33. `correct_umi_min_distance`: 1
+34. `umi_file`:
 
 ## Metadata file
 
-Please create a metadata file with columns
+Please create a metadata file with the following columns; other columns may exist but are not required
 
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column contain information with regards to R1, R2 and R3 of the sequencing file. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present
-4. SAMPLE_NAME: Containing the sample name which is inputed in the config file. Please note if the metadata file consists of multipe samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
+4. SAMPLE_TYPE: Containing the sample name that is given in the config file. Please note that if the metadata file contains multiple samples, only the sample/pid combination given in the `sample` and `pid` directives of the config.yaml will be run
 5. PATIENT_ID :Containing the ```pid``` which is inputed in the config file. Please note if the metadata file consists of multipe PIDs only the ```sample``` ```pid``` combination mentioned in the sample and pid directive of the config.yaml will be run.
 7. RUN_ID: Please mention the run id for the squencing run for the sample
diff --git a/config/config.yaml b/config/config.yaml
index 5919356..36a31a4 100755
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,3 +42,31 @@ bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions f
 chrom_sizes: /Path/to/chrom_sizes.tsv # Tab separated file containing the chromosome names and their lengths should match the reference genome
 dict_genome: /Path/to/genome.dict # Dictionary file for the reference genome
+
+#Parameters for Read_Grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+
+# Correct UMI based on list of UMI's
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /path/to/umi.txt
diff --git a/config/test_config_exliquid.yaml b/config/test_config_exliquid.yaml
new file mode 100755
index 0000000..ac9ba5e
--- /dev/null
+++ b/config/test_config_exliquid.yaml
@@ -0,0 +1,53 @@
+Adapter_R1: []
+Adapter_R3: []
+SeqType: Panel
+chrom_sizes: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/stats/hg19_chrTotalLength.tsv
+
+dbsnp: /applications/otp/ngs_share_complete/assemblies/hg19_GRCh37_1000genomes/databases/dbSNP/dbSNP_147/00-All.vcf.gz
+dict_genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.dict
+
+genome: 
/applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.fa + +library_prep_kit: IDT_xGen_cfDNA_FFPE + +log_dir: /dh-projects/exliquid/scratch/results_alignment_test/logs/ + +metadata: /dh-projects/exliquid/scratch/input/bare_min_meta_data.csv + +pid: EXLIQUID_EX59 +sample: EXLIQUID_EX59-BUFFYCOAT_control + +target_regions: /dh-projects/exliquid/raw_data/reference/Targets-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed +bait_regions: /dh-projects/exliquid/raw_data/reference/Probes-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed +# trim_adapters: false + +work_dir: /dh-projects/exliquid/scratch/results_alignment_test/ + +# bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions for the analysis only required if SeqType is Panel/WES, if not provided a bait file will be generated from the target file + +#Parameters for Read_Grouping +group_allowed_edits: 0 +group_min_mapq: 20 +group_strategy: Adjacency + +# Configuration for consensus calling +consensus_min_reads: 1 +consensus_min_base_qual: 2 +consensus_min_input_base_mapq: 10 +consensus_error_rate_pre_umi: 45 +consensus_error_rate_post_umi: 30 + +# Configuration for consensus filter +filter_min_reads: 3 +filter_min_base_qual: 2 +filter_max_base_error_rate: 0.1 +filter_max_read_error_rate: 0.05 +filter_max_no_call_fraction: 0.2 + +# Configuration when UMI present in R1 and R2 files +read_structure: 8M143T 8M143T +# Correct UMI based on list of UMI's +correct_umi: True +correct_umi_max_mismatches: 3 +correct_umi_min_distance: 1 +umi_file: /dh-projects/exliquid/raw_data/reference/umi_list.txt diff --git a/profile/config.yaml b/profile/config.yaml index b38fca9..dbfda83 100755 --- a/profile/config.yaml +++ b/profile/config.yaml @@ -3,14 +3,15 @@ jobs: 10 latency-wait: 60 reason: True - +# default-resources: +# slurm_partition: master-fasttrack keep-going: True printshellcmds: True rerun-incomplete: True restart-times: 2 - # delete-temp-output: True -conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs +conda-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/conda_envs # conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs_ohne_singularity -singularity-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/Singularity -singularity-args: "-B /dh-projects/richter_transformation:/dh-projects/richter_transformation,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy" +singularity-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/Singularity +singularity-args: "-B /dh-projects/ag-ishaque:/dh-projects/ag-ishaque,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy,/dh-projects/otp:/dh-projects/otp" +# /dh-projects/exliquid:/dh-projects/exliquid, diff --git a/workflow/Snakefile b/workflow/Snakefile index 9a8334e..de314d4 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,7 +1,9 @@ import os +from datetime import datetime import pandas as pd from pathlib import Path from itertools import product +import tempfile container: "docker://condaforge/mambaforge" @@ -9,10 +11,10 @@ container: "docker://condaforge/mambaforge" include: "rules/common.smk" #done include: "rules/create_links.smk" +include: "rules/umi_based_rules.smk" #done include: "rules/adapter_trimming.smk" #done include: "rules/alignment.smk" #done -include: "rules/umi_consensus.smk" #done -include: "rules/duplicate_marking.smk" #done +# 
include: "rules/duplicate_marking.smk" #done include: "rules/flagstatt.smk" #done include: "rules/metric_hsmetrics.smk" #done include: "rules/metric_insert_size.smk" #done @@ -23,6 +25,10 @@ include: "rules/recalibration.smk" #done rule all: input: + expand( + wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", + sample=config["sample"], + ), expand( wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam", sample=config["sample"], @@ -35,11 +41,10 @@ rule all: wrkdir / "metrics" / "{sample}.mosdepth.global.dist.txt", sample=config["sample"], ), - # expand(wrkdir / "metrics" / "{sample}.flagstat", sample=config["sample"]), expand( wrkdir / "metrics" / "{sample}_{ext}.flagstat", sample=config["sample"], - ext=["dedup.recall.sorted", "merged_umi_annot"], + ext=["dedup.recall.sorted"], ), expand( wrkdir / "metrics" / "{sample}_insert_size_metrics.txt", @@ -64,3 +69,20 @@ rule all: expand(wrkdir / "metrics" / "{sample}.hs_metrics.txt", sample=config["sample"]) if config["SeqType"] in ["Panel", "WES"] else [], + expand( + wrkdir + / "metrics" + / "correct_umi" + / "{run_id}" + / "{sample}_{lane}_umi_metrics.txt", + filtered_product, + run_id=RUN_ID, + sample=config["sample"], + lane=LANE, + ) + if correct_umi + else [], + expand( + wrkdir / "metrics" / "{sample}_consensus_metrics.tsv", + sample=config["sample"], + ), diff --git a/workflow/envs/consensus.yaml b/workflow/envs/consensus.yaml new file mode 100755 index 0000000..ff898d8 --- /dev/null +++ b/workflow/envs/consensus.yaml @@ -0,0 +1,6 @@ +name: consensus +channels: + - conda-forge +dependencies: + - pandas=2.2.2 + - python=3.11 diff --git a/workflow/rules/adapter_trimming.smk b/workflow/rules/adapter_trimming.smk index 0778088..4395e46 100755 --- a/workflow/rules/adapter_trimming.smk +++ b/workflow/rules/adapter_trimming.smk @@ -1,103 +1,80 @@ # disabling adapter trimming when when set to false, # To maintain the same input and output files we create links instead of running cutadapt -if config["trim_adapters"]: - rule create_adapter_fastq: - params: - adapt_1=adapter_seq_r1, - adapt_3=adapter_seq_r3, - output: - adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"), - adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"), - threads: 1 - resources: - mem_mb=1000, - runtime=20, - nodes=1, - message: - "Creating adapter fastq files" - run: - with open(output.adapt_1, "w") as handle: - count = 1 - for i in params.adapt_1: - handle.write(">adapter_" + str(count) + "\n") - handle.write(i + "\n") - count += 1 +rule create_adapter_fastq: + params: + adapt_1=adapter_seq_r1, + adapt_3=adapter_seq_r3, + output: + adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"), + adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"), + threads: 1 + resources: + mem_mb=1000, + runtime=20, + nodes=1, + tmpdir=scratch_dir, + message: + "Creating adapter fastq files" + run: + with open(output.adapt_1, "w") as handle: + count = 1 + for i in params.adapt_1: + handle.write(">adapter_" + str(count) + "\n") + handle.write(i + "\n") + count += 1 - with open(output.adapt_3, "w") as handle: - count = 1 - for i in params.adapt_3: - handle.write(">adapter_" + str(count) + "\n") - handle.write(i + "\n") - count += 1 + with open(output.adapt_3, "w") as handle: + count = 1 + for i in params.adapt_3: + handle.write(">adapter_" + str(count) + "\n") + handle.write(i + "\n") + count += 1 - rule cutadapt: - input: - adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq", - adapt_3=wrkdir / "{sample}" / "cutadapt" / 
"adapt_3.fastq", - fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz", - fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", - output: - fastq_r1=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R1_{lane}_trim.fastq.gz" - ), - fastq_r3=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R3_{lane}_trim.fastq.gz" - ), - log: - logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log", - threads: 8 - resources: - mem_mb=8000, - runtime=4 * 60, - nodes=1, - conda: - "../envs/cutadapt.yaml" - message: - "Trimming adapters using cutadapt" - shell: - "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}" -else: - - rule cutadapt: - input: - fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz", - fastq_r3=wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", - output: - fastq_r1=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R1_{lane}_trim.fastq.gz" - ), - fastq_r3=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R3_{lane}_trim.fastq.gz" - ), - log: - logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log", - threads: 1 - resources: - mem_mb=1000, - runtime=60, - nodes=1, - message: - "Skipping adapter trimming, creating links instead of running cutadapt" - conda: - "../envs/cutadapt.yaml" - shell: - "(ln -s {input.fastq_r1} {output.fastq_r1} && ln -s {input.fastq_r3} {output.fastq_r3}) &> {log}" +rule cutadapt: + input: + adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq", + adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq", + fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz", + fastq_r3=( + wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz" + if read_structure + else wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz" + ), + output: + fastq_r1=temp( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R1_{lane}_trim.fastq.gz" + ), + fastq_r3=temp( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R2_{lane}_trim.fastq.gz" + if read_structure + else wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R3_{lane}.fastq.gz" + ), + log: + logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log", + threads: 8 + resources: + mem_mb=8000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/cutadapt.yaml" + message: + "Trimming adapters using cutadapt" + shell: + "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}" diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index 86d1746..1b68346 100755 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -6,25 +6,53 @@ rule fastqbam: """ input: genome=genome, - fastq_r1=wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R1_{lane}_trim.fastq.gz", - fastq_r3=wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R3_{lane}_trim.fastq.gz", + fastq_r1=( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R1_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz") + ), + fastq_r3=( + ( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R2_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / 
"{sample}_R2_{lane}.fastq.gz") + ) + if read_structure + else ( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R3_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz") + ) + ), output: temp(wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"), params: library=library_prep_kit, - threads: 8 + read_structure="--read-structures " + read_structure if read_structure else "", + threads: 1 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -33,14 +61,89 @@ rule fastqbam: "Converting fastq to bam to assign read group and library information." shell: "(" - "fgbio -Xmx{resources.mem_mb}m --compression 1 FastqToBam " + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 FastqToBam " "--input {input.fastq_r1} {input.fastq_r3} " "--sample {wildcards.sample} " "--library {params.library} " - "--output {output} " + "--output {output} {params.read_structure} " ") &> {log}" +if correct_umi: + + rule CorrectUMI: + input: + bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam", + umi_file=umi_file, + output: + bam=temp( + wrkdir + / "fastq" + / "{run_id}" + / "{sample}_{lane}_unmapped_umi_corrected.bam" + ), + metrics=wrkdir + / "metrics" + / "correct_umi" + / "{run_id}" + / "{sample}_{lane}_umi_metrics.txt", + params: + max_mismatches=correct_umi_max_mismatches, + min_distance=correct_umi_min_distance, + threads: 1 + resources: + mem_mb=2000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio" / "correct_umi_{run_id}_{sample}_{lane}.log", + message: + "Correcting UMIs." + shell: + "(" + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io CorrectUmis " + "--input {input.bam} " + "--output {output.bam} " + "--max-mismatches 3 " + "--min-distance 2 " + "--umi-files {input.umi_file} " + "--metrics {output.metrics} " + "--dont-store-original-umis ) &> {log}" + + +if not read_structure: + print("Hi") + + rule AnnotateUMI: + input: + alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", + fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz", + output: + temp( + wrkdir + / "alignments" + / "{run_id}" + / "{sample}_aln_{lane}_umi_annot.bam" + ), + threads: 1 + resources: + mem_mb=8000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log", + message: + "Annotating BAM with UMIs from fastq file." 
+ shell: + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}" + + rule bwa_map: """ First pass alignemnt @@ -48,14 +151,22 @@ rule bwa_map: """ input: genome=genome, - bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam", + bam=( + wrkdir + / "fastq" + / "{run_id}" + / "{sample}_{lane}_unmapped_umi_corrected.bam" + if correct_umi + else wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam" + ), output: temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam"), threads: 8 resources: - mem_mb=12000, - runtime=24 * 60, + mem_mb=15000, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" message: @@ -66,7 +177,7 @@ rule bwa_map: "(" "samtools fastq {input.bam} " "| bwa mem -Y -K 150000000 -t {threads} -p {input.genome} - " - "| fgbio -Xmx4G --compression 1 --async-io ZipperBams " + "| fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx4G --compression 1 --async-io ZipperBams " "--unmapped {input.bam} " "--ref {input.genome} " "--output {output} " @@ -79,6 +190,14 @@ rule merge: """ input: expand( + wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", + filtered_product, + run_id=RUN_ID, + sample=config["sample"], + lane=LANE, + ) + if read_structure + else expand( wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam", filtered_product, run_id=RUN_ID, @@ -86,12 +205,13 @@ rule merge: lane=LANE, ), output: - temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"), + bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", threads: 8 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/samtools.yaml" message: @@ -99,7 +219,7 @@ rule merge: log: logdir / "samtools/{sample}_merge.log", shell: - "samtools merge -f {output} {input} &> {log}" + "(samtools merge -f {output.bam} {input}) &> {log} " rule realign: @@ -114,10 +234,12 @@ rule realign: bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"), threads: 8 resources: - mem_mb=16000, - runtime=24 * 60, + mem_mb=26000, + runtime=72 * 60, nodes=1, mem_fgbio=8000, + mem_samtools=1000, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -128,10 +250,10 @@ rule realign: "(" "samtools fastq {input.bam} " "| bwa mem -K 150000000 -Y -t {threads} -p {input.ref} - " - "| fgbio -Xmx{resources.mem_fgbio}m --compression 0 --async-io ZipperBams " + "| fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_fgbio}m --compression 0 --async-io ZipperBams " "--unmapped {input.bam} " "--ref {input.ref} " "--tags-to-reverse Consensus " "--tags-to-revcomp Consensus " - "| samtools sort --threads {threads} -o {output.bam}##idx##{output.bai} --write-index " + "| samtools sort --threads {threads} -m{resources.mem_samtools}m -o {output.bam}##idx##{output.bai} --write-index " ") &> {log} " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ef7d139..b5862a6 100755 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,54 +1,50 @@ -if "library_prep_kit" in config: - library_prep_kit = config["library_prep_kit"] - if library_prep_kit == "": - library_prep_kit = "Unknown" -else: - library_prep_kit = "Unknown" +print( + """ + \tAlignment pipeline for UMI based sequencing reads + \tAuthor: Shashwat Sahay + \tEmail: shashwat.sahay@charite.de + \tVersion: 0.2.0 -if "sample" not in config: - raise ValueError("Sample name not provided") + """ +) -if 
"pid" not in config: - raise ValueError("Patient ID not provided") -if "genome" not in config: - raise ValueError("Genome not provided") -else: - genome = config["genome"] +def get_data_time(): + now = datetime.now() -if "work_dir" not in config: - raise ValueError("Work directory not provided") -else: - wrkdir = Path(config["work_dir"]) + dt_string = now.strftime("%d/%m/%Y %H:%M:%S") + return "========= " + dt_string + " ========= " -if "log_dir" not in config: - logdir = wrkdir / "logs" -else: - logdir = Path(config["log_dir"]) -if "trim_adapters" not in config: - config["trim_adapters"] = False +print(get_data_time(), "Starting run") -if config["trim_adapters"]: - if "Adapter_R1" not in config: - raise ValueError("Adapter sequence for R1 not provided") - else: - adapter_seq_r1 = config["Adapter_R1"] - if "Adapter_R3" not in config: - raise ValueError("Adapter sequence for R3 not provided") - else: - adapter_seq_r3 = config["Adapter_R3"] +####################################################################### +################# Sanity check on the input data ###################### +####################################################################### + +if "sample" not in config: + raise ValueError("Sample name not provided") + +if "pid" not in config: + raise ValueError("Patient ID not provided") if "metadata" not in config: raise ValueError("Metadata file not provided") else: metadata = pd.read_csv(config["metadata"]) metadata = metadata[ - (metadata["SAMPLE_NAME"] == config["sample"]) + (metadata["SAMPLE_TYPE"] == config["sample"]) & (metadata["PATIENT_ID"] == config["pid"]) ] if metadata.shape[0] == 0: - raise ValueError("No metadata found for the given sample and patient ID") + raise ValueError( + "No metadata found for the given sample: %s and patient ID: %s" + % (config["sample"], config["pid"]) + ) + +########################################################## +###### Setting Sequencing type and related parameters #### +########################################################## if "SeqType" not in config: raise ValueError("Sequencing type not provided") @@ -72,79 +68,242 @@ else: dict_genome = config["dict_genome"] if "bait_regions" not in config: - print("Bait regions not provided will be calculated from target regions") + print( + get_data_time(), + "Bait regions not provided will be calculated from target regions", + ) else: bait_regions = config["bait_regions"] + +########################################################## +################ Setting Library kit ##################### +########################################################## + +if "library_prep_kit" in config: + library_prep_kit = config["library_prep_kit"] + if library_prep_kit == "": + library_prep_kit = "Unknown" +else: + library_prep_kit = "Unknown" + + +########################################################## +################## Setting Genome ######################## +########################################################## + +if "genome" not in config: + raise ValueError("Genome not provided") +else: + genome = config["genome"] + if "dbsnp" not in config: raise ValueError("dbSNP file not provided") else: dbsnp = config["dbsnp"] -if "SORTING_COLLECTION_SIZE_RATIO" not in config: - print("Setting default value for SORTING_COLLECTION_SIZE_RATIO to 0.01") - SORTING_COLLECTION_SIZE_RATIO = 0.01 + +########################################################## +############# Setting working directory ################## +########################################################## + +if "work_dir" not in config: 
+ raise ValueError("Work directory not provided") else: - SORTING_COLLECTION_SIZE_RATIO = config["SORTING_COLLECTION_SIZE_RATIO"] + wrkdir = Path(config["work_dir"]) -if "allowed_edits" not in config: - print("Setting default value for allowed_edits to 1") - allowed_edits = 1 +if "log_dir" not in config: + logdir = wrkdir / "logs" else: - allowed_edits = config["allowed_edits"] + logdir = Path(config["log_dir"]) -if "strategy" not in config: - print("Setting default value for strategy to Adjacency") - strategy = "Adjacency" +if "scratch_dir" not in config: + scratch_dir = tempfile.gettempdir() else: - strategy = config["strategy"] + scratch_dir = config[ + "scratch_dir" + ] # snakemake doesnt accept path in resources has to be a string -if "consensus_min_reads" not in config: - if seq_type == "WGS": - print("Setting default value for consensus_min_reads to 3 as SeqType is WGS") - consensus_min_reads = 3 +print(get_data_time(), "Setting working directory to %s" % wrkdir) +print(get_data_time(), "Setting log directory to %s" % logdir) +print(get_data_time(), "Setting temp directory to %s" % scratch_dir) + + +######################################################################### +############# Check if UMIs have been demultiplexed ##################### +######################################################################### + +if "read_structure" in config: + print( + get_data_time(), + "Read structure has been provided expecting only R1 and R2 files for each sample and lane", + ) + read_structure = config["read_structure"] + print(read_structure) + if read_structure == "": + raise ValueError("Read structure defined but not provided") +else: + read_structure = False + + +######################################################################### +############# Setting variables related to trimming ##################### +######################################################################### + +if "trim_adapters" not in config or (not config["trim_adapters"]): + print(get_data_time(), "Adapter trimming has been turned off") + config["trim_adapters"] = False + adapter_seq_r1 = None + adapter_seq_r3 = None +else: + if config["trim_adapters"]: + if "Adapter_R1" not in config: + raise ValueError("Adapter sequence for R1 not provided") + else: + adapter_seq_r1 = config["Adapter_R1"] + if "Adapter_R3" not in config: + raise ValueError("Adapter sequence for R3 not provided") + else: + adapter_seq_r3 = config["Adapter_R3"] + +########################################################################## +############# Setting variables related to UMI correction ################ +########################################################################## + +if "correct_umi" in config: + correct_umi = config["correct_umi"] +else: + correct_umi = False + +if correct_umi: + print(get_data_time(), "UMI Correction is enabled") + if "umi_file" not in config: + raise ValueError("UMI file not provided") + else: + umi_file = config["umi_file"] + if "correct_umi_max_mismatches" in config: + correct_umi_max_mismatches = config["correct_umi_max_mismatches"] else: print( - "Setting default value for consensus_min_reads to 1 as SeqType is not WGS" + get_data_time(), "Setting default value for correct_umi_max_mismatches to 3" ) - consensus_min_reads = 1 + correct_umi_max_mismatches = 3 + if "correct_umi_min_distance" in config: + correct_umi_min_distance = config["correct_umi_min_distance"] + else: + print( + get_data_time(), "Setting default value for correct_umi_min_distance to 1" + ) + correct_umi_min_distance = 1 
+else: + print(get_data_time(), "UMI Correction is disabled") + + +########################################################################## +########### Initialising parameters for grouping by fgbio ################ +########################################################################## + +if "group_allowed_edits" not in config: + print(get_data_time(), "Setting default value for allowed_edits to 1") + group_allowed_edits = 1 +else: + group_allowed_edits = config["group_allowed_edits"] + +if "group_strategy" not in config: + print(get_data_time(), "Setting default value for strategy to Adjacency") + group_strategy = "Adjacency" +else: + group_strategy = config["group_strategy"] + +if "group_min_mapq" not in config: + print(get_data_time(), "Setting default value for group min mapq to 20") + group_min_mapq = 20 +else: + group_min_mapq = config["group_min_mapq"] + + +########################################################################## +############# Intialising parameters for consensus calling ############### +########################################################################## + +if "consensus_min_reads" not in config: + print(get_data_time(), "Setting default value for consensus_min_reads to 1") + consensus_min_reads = 1 else: consensus_min_reads = config["consensus_min_reads"] if "consensus_min_base_qual" not in config: - print("Setting default value for consensus_min_base_qual to 20") - consensus_min_base_qual = 20 + print(get_data_time(), "Setting default value for consensus_min_base_qual to 2") + consensus_min_base_qual = 2 else: consensus_min_base_qual = config["consensus_min_base_qual"] + +if "consensus_min_input_base_mapq" not in config: + print( + get_data_time(), "Setting default value for consensus_min_input_base_mapq to 10" + ) + consensus_min_input_base_mapq = 10 +else: + consensus_min_input_base_mapq = config["consensus_min_input_base_mapq"] + +if "consensus_error_rate_pre_umi" not in config: + print(get_data_time(), "Setting default value for consensus_error_rate to 45") + consensus_error_rate_pre_umi = 45 +else: + consensus_error_rate_pre_umi = config["consensus_error_rate_pre_umi"] + +if "consensus_error_rate_post_umi" not in config: + print(get_data_time(), "Setting default value for consensus_error_rate to 30") + consensus_error_rate_post_umi = 30 +else: + consensus_error_rate_post_umi = config["consensus_error_rate_post_umi"] + +############################################################################ +######## Initialising parameters for consensus filtering ################### +############################################################################ + if "filter_min_reads" not in config: - if seq_type == "WGS": - print("Setting default value for filter_min_reads to 3 as SeqType is WGS") - filter_min_reads = 3 - else: - print("Setting default value for filter_min_reads to 1 as SeqType is not WGS") - filter_min_reads = 1 + print(get_data_time(), "Setting default value for filter_min_reads") + filter_min_reads = 1 else: filter_min_reads = config["filter_min_reads"] if "filter_min_base_qual" not in config: - print("Setting default value for filter_min_base_qual to 40") + print(get_data_time(), "Setting default value for filter_min_base_qual to 40") filter_min_base_qual = 40 else: filter_min_base_qual = config["filter_min_base_qual"] -if "filter_max_error_rate" not in config: - print("Setting default value for filter_max_error_rate to 0.2") - filter_max_error_rate = 0.2 +if "filter_max_base_error_rate" not in config: + print( + get_data_time(), 
"Setting default value for filter_max_base_error_rate to 0.2" + ) + filter_max_base_error_rate = 0.2 else: - filter_max_error_rate = config["filter_max_error_rate"] + filter_max_base_error_rate = config["filter_max_base_error_rate"] +if "filter_max_read_error_rate" not in config: + print( + get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" + ) + filter_max_read_error_rate = 0.1 +else: + filter_max_read_error_rate = config["filter_max_read_error_rate"] -LANE = metadata["LANE_NO"].unique().tolist() -RUN_ID = metadata["RUN_ID"].unique().tolist() +if "filter_max_no_call_fraction" not in config: + print( + get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" + ) + filter_max_no_call_fraction = 0.1 +else: + filter_max_no_call_fraction = config["filter_max_no_call_fraction"] -# Create a function which checks if the input configuration is valid +# Create a set of valid combination for run lane and read ids + +LANE = metadata["LANE_NO"].unique().tolist() +RUN_ID = metadata["RUN_ID"].unique().tolist() def filter_combinator(combinator, allow_list): diff --git a/workflow/rules/create_links.smk b/workflow/rules/create_links.smk index 73773f2..277a8a2 100755 --- a/workflow/rules/create_links.smk +++ b/workflow/rules/create_links.smk @@ -2,6 +2,7 @@ rule create_links_files: params: metadata=config["metadata"], fastq_dir=wrkdir / "fastq", + read_structure=read_structure, resources: mem_mb=1000, runtime=20, @@ -21,21 +22,33 @@ rule create_links_files: sample=config["sample"], lane=LANE, ), - fastq_r3=expand( - wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", - filtered_product, - run_id=RUN_ID, - sample=config["sample"], - lane=LANE, + fastq_r3=( + expand( + wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz", + filtered_product, + run_id=RUN_ID, + sample=config["sample"], + lane=LANE, + ) + if not read_structure + else [] ), message: "Creating links to fastq files" run: metadata = pd.read_csv(params.metadata) metadata = metadata[ - (metadata["SAMPLE_NAME"] == config["sample"]) + (metadata["SAMPLE_TYPE"] == config["sample"]) & (metadata["PATIENT_ID"] == config["pid"]) ] + if params.read_structure: + if metadata["READ"].nunique() != 2: + raise ValueError("Read structure is provided but R1 R2 and R3 provided") + else: + if metadata["READ"].nunique() != 3: + raise ValueError( + "Read structure is not provided but R1 R2 and R3 not provided" + ) for index, row in metadata.iterrows(): fastq_file = Path(row["FASTQ_FILE"]) @@ -46,7 +59,7 @@ rule create_links_files: params.fastq_dir / row["RUN_ID"] / ( - row["SAMPLE_NAME"] + row["SAMPLE_TYPE"] + "_" + row["READ"] + "_" diff --git a/workflow/rules/duplicate_marking.smk b/workflow/rules/duplicate_marking.smk index ff10227..3fe1bd5 100755 --- a/workflow/rules/duplicate_marking.smk +++ b/workflow/rules/duplicate_marking.smk @@ -1,10 +1,12 @@ +# Deprecated + rule duplicates: input: - wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam", + wrkdir / "alignments" / "{sample}_temp.bam", output: - bam=wrkdir / "alignments" / "{sample}_dedup.bam", - bai=wrkdir / "alignments" / "{sample}_dedup.bam.bai", + bam=temp(wrkdir / "alignments" / "{sample}_dedup.bam"), + # bai=temp(wrkdir / "alignments" / "{sample}_temp.bam.bai"), metric=wrkdir / "metrics" / "{sample}_marked_dup_metrics.txt", params: SORTING_COLLECTION_SIZE_RATIO=SORTING_COLLECTION_SIZE_RATIO, @@ -15,14 +17,13 @@ rule duplicates: logdir / "gatk/{sample}_dedup.log", resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, 
nodes=1, message: - "Marking duplicates on consensus reads." + "Marking duplicates on pre-consensus reads.: Decrapated slated for removal." shell: " ( " " gatk --java-options '-Xmx{resources.mem_mb}m' MarkDuplicates -I {input} " - " -O {output.bam} -M {output.metric} " - " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} && " - " samtools index -b {output.bam} {output.bai} " + " -O {output.bam} -M {output.metric} --BARCODE_TAG RX " + " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} " " ) &> {log} " diff --git a/workflow/rules/flagstatt.smk b/workflow/rules/flagstatt.smk index da699de..d783037 100755 --- a/workflow/rules/flagstatt.smk +++ b/workflow/rules/flagstatt.smk @@ -1,23 +1,3 @@ -# rule flagstatt: -# input: -# bam=wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam", -# output: -# wrkdir / "metrics" / "{sample}.flagstat", -# conda: -# "../envs/sambamba.yaml" -# threads: 1 -# resources: -# mem_mb=8000, -# runtime=24 * 60, -# nodes=1, -# log: -# logdir / "sambamba/{sample}.log", -# message: -# "Running Flagstat" -# shell: -# "(sambamba flagstat {input.bam} > {output}) &> {log}" - - rule flagstatt: input: bam=wrkdir / "alignments" / "{sample}_{ext}.bam", @@ -30,6 +10,7 @@ rule flagstatt: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, log: logdir / "sambamba/{sample}_{ext}.log", message: diff --git a/workflow/rules/metric_hsmetrics.smk b/workflow/rules/metric_hsmetrics.smk index 687458a..8adfb11 100755 --- a/workflow/rules/metric_hsmetrics.smk +++ b/workflow/rules/metric_hsmetrics.smk @@ -41,8 +41,9 @@ if seq_type in ["Panel", "WES"]: threads: 1 resources: mem_mb=8000, - runtime=24 * 60, + runtime=4 * 60, nodes=1, + tmpdir=scratch_dir, log: logdir / "bedtools" / "{sample}_slop.log", message: @@ -68,6 +69,7 @@ if seq_type in ["Panel", "WES"]: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, message: "Calculating HS metrics for Panel and WES data" log: diff --git a/workflow/rules/metric_insert_size.smk b/workflow/rules/metric_insert_size.smk index ecb892d..ab0a1bc 100755 --- a/workflow/rules/metric_insert_size.smk +++ b/workflow/rules/metric_insert_size.smk @@ -12,6 +12,7 @@ rule InsertSize: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, log: logdir / "picard/{sample}_insert_size.log", message: diff --git a/workflow/rules/metric_mosdepth.smk b/workflow/rules/metric_mosdepth.smk index 4939aeb..89fef0c 100755 --- a/workflow/rules/metric_mosdepth.smk +++ b/workflow/rules/metric_mosdepth.smk @@ -16,6 +16,7 @@ if seq_type in ["Panel", "WES"]: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/mosdepth.yaml" log: @@ -40,6 +41,7 @@ else: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/mosdepth.yaml" message: diff --git a/workflow/rules/metric_wgs_coverage.smk b/workflow/rules/metric_wgs_coverage.smk index ab52c8f..4fe5378 100755 --- a/workflow/rules/metric_wgs_coverage.smk +++ b/workflow/rules/metric_wgs_coverage.smk @@ -10,6 +10,7 @@ rule coveragePlot: mem_mb=10000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/coveragePlot.yaml" log: diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk index f15ef7a..9353c33 100755 --- a/workflow/rules/recalibration.smk +++ b/workflow/rules/recalibration.smk @@ -1,19 +1,21 @@ rule baseRecalibrator: input: - bam=wrkdir / "alignments" / "{sample}_dedup.bam", + bam=wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam", dbsnp=dbsnp, genome=genome, output: 
table=wrkdir / "metrics" / "{sample}_recal_data.table", bam=temp(wrkdir / "alignments" / "{sample}_dedup.recall.bam"), + bai=temp(wrkdir / "alignments" / "{sample}_dedup.recall.bai"), analyse_covariates=wrkdir / "metrics" / "{sample}_covariates.pdf", conda: "../envs/gatk.yaml" threads: 8 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, log: logdir / "gatk/{sample}_recal.log", message: @@ -43,6 +45,7 @@ rule sort_index: mem_mb=8000, runtime=24 * 60, nodes=1, + tmpdir=scratch_dir, log: logdir / "samtools/{sample}_sort.log", message: diff --git a/workflow/rules/umi_consensus.smk b/workflow/rules/umi_based_rules.smk similarity index 57% rename from workflow/rules/umi_consensus.smk rename to workflow/rules/umi_based_rules.smk index f190eb2..f6d089e 100755 --- a/workflow/rules/umi_consensus.smk +++ b/workflow/rules/umi_based_rules.smk @@ -1,22 +1,4 @@ -rule fgbio: - input: - alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam", - fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz", - output: - temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam"), - threads: 1 - resources: - mem_mb=8000, - runtime=4 * 60, - nodes=1, - conda: - "../envs/fgbio.yaml" - log: - logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log", - message: - "Annotating BAM with UMIs from fastq file." - shell: - "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}" + rule fix_mate: @@ -26,14 +8,19 @@ rule fix_mate: This can cause problems in downstream analysis. """ input: - bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", + bam=( + wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + if not read_structure + else wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + ), output: bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"), threads: 2 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -41,7 +28,7 @@ rule fix_mate: message: "Fixing mate information if required" shell: - "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation " + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation " "--input {input.bam} " "--output {output.bam} " "--allow-missing-mates true" @@ -57,13 +44,15 @@ rule group_reads: ), stats=wrkdir / "metrics" / "{sample}.grouped-family-sizes.txt", params: - strategy=strategy, - allowed_edits=allowed_edits, + strategy=group_strategy, + allowed_edits=group_allowed_edits, + min_mapq=group_min_mapq, threads: 2 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -71,10 +60,11 @@ rule group_reads: message: "Grouping reads by UMI and position for consensus calling." 
shell: - "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io GroupReadsByUmi " + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io GroupReadsByUmi " "--input {input.bam} " "--strategy {params.strategy} " "--edits {params.allowed_edits} " + "--min-map-q {params.min_mapq} " "--output {output.bam} " "--family-size-histogram {output.stats} " "&> {log} " @@ -85,14 +75,19 @@ rule call_consensus_reads: bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam", output: bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.bam"), + metrics=logdir / "fgbio/call_consensus_reads.{sample}.log", params: min_reads=consensus_min_reads, min_base_qual=consensus_min_base_qual, + min_input_base_mapq=consensus_min_input_base_mapq, + error_rate_post_umi=consensus_error_rate_post_umi, + error_rate_pre_umi=consensus_error_rate_pre_umi, threads: 4 resources: mem_mb=4000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -100,11 +95,14 @@ rule call_consensus_reads: message: "Calling consensus reads from grouped reads." shell: - "fgbio -Xmx{resources.mem_mb}m --compression 0 CallMolecularConsensusReads " + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 0 CallMolecularConsensusReads " "--input {input.bam} " "--output {output.bam} " "--min-reads {params.min_reads} " - "--min-input-base-quality {params.min_base_qual} " + "--error-rate-pre-umi {params.error_rate_pre_umi} " + "--error-rate-post-umi {params.error_rate_post_umi} " + "--min-consensus-base-quality {params.min_base_qual} " + "--min-input-base-quality {params.min_input_base_mapq} " "--threads {threads} " "&> {log}" @@ -118,12 +116,15 @@ rule filter_consensus_reads: params: min_reads=filter_min_reads, min_base_qual=filter_min_base_qual, - max_error_rate=filter_max_error_rate, + max_base_error_rate=filter_max_base_error_rate, + max_read_error_rate=filter_max_read_error_rate, + max_no_call_fraction=filter_max_no_call_fraction, threads: 8 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -133,11 +134,35 @@ rule filter_consensus_reads: shell: "(" "samtools sort -n -u {input.bam} | " - "fgbio -Xmx{resources.mem_mb}m --compression 1 FilterConsensusReads " + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 FilterConsensusReads " "--input /dev/stdin " "--output {output.bam} " "--ref {input.ref} " "--min-reads {params.min_reads} " + "--max-read-error-rate {params.max_read_error_rate} " + "--max-base-error-rate {params.max_base_error_rate} " "--min-base-quality {params.min_base_qual} " - "--max-base-error-rate {params.max_error_rate} " + "--max-no-call-fraction {params.max_no_call_fraction} " + ") &> {log} " + + +rule gatherConsensusMetrics: + input: + logdir / "fgbio/call_consensus_reads.{sample}.log", + output: + wrkdir / "metrics" / "{sample}_consensus_metrics.tsv", + params: + sample=str(config["sample"]), + threads: 1 + resources: + mem_mb=1000, + runtime=60, + nodes=1, + tmpdir=scratch_dir, + log: + logdir / "metrics" / "{sample}_consensus_metrics.log", + conda: + "../envs/consensus.yaml" + script: + "../scripts/getConsensusMetrics.py" diff --git a/workflow/scripts/getConsensusMetrics.py b/workflow/scripts/getConsensusMetrics.py new file mode 100755 index 0000000..3706538 --- /dev/null +++ b/workflow/scripts/getConsensusMetrics.py @@ -0,0 +1,24 @@ +import os +import pandas as pd + + + 
+column_set1=['Raw Reads Filtered Due to Insufficient Support', 'Raw Reads Filtered Due to Mismatching Cigars', 'Raw Reads Filtered Due to Low Base Quality', + 'Raw Reads Filtered Due to Orphan Consensus Created','Consensus reads emitted', 'Total Raw Reads Considered'] +column_set2=['overlapping templates', 'corrected templates', 'overlapping bases', 'corrected bases' ] +consensus_df=pd.DataFrame(columns=column_set1+column_set2, index=[snakemake.params['sample']]) +# print(consensus_df) +with open(snakemake.input[0]) as handle: + for line in handle: + for i in consensus_df.columns: + if i in line: + # print(line) + if i in column_set1: + line=line.split(':')[-1].split('(')[0].replace('.', '').replace(',','').replace(' ', '').strip() + else: + line=line.split(']')[-1].split('(')[0].replace('.', '').replace(i, '').replace(',','').replace(' ', '').strip() + line=int(line) + consensus_df.loc['T-NHL_98_malign', i]=line + break + +consensus_df.to_csv(snakemake.output[0], sep='\t', index=True) From b6212e93c2310f325ee32765d905b6f81bc59a7b Mon Sep 17 00:00:00 2001 From: sahays Date: Sat, 22 Jun 2024 12:39:44 +0200 Subject: [PATCH 05/11] Added threads Fixes and closes #29 --- workflow/rules/alignment.smk | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index 1b68346..f70bd15 100755 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -232,14 +232,17 @@ rule realign: output: bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam"), bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"), - threads: 8 + threads: 12 resources: - mem_mb=26000, + mem_mb=22000, # 8GB for BWA, 4GB for fgbio, 8GB for samtools sort and an overhead memory of 2GB runtime=72 * 60, nodes=1, - mem_fgbio=8000, - mem_samtools=1000, + mem_fgbio=4000, + mem_samtools=2000, tmpdir=scratch_dir, + params: + samtools_threads=4, + bwa_threads=8, conda: "../envs/fgbio.yaml" log: @@ -249,11 +252,11 @@ rule realign: shell: "(" "samtools fastq {input.bam} " - "| bwa mem -K 150000000 -Y -t {threads} -p {input.ref} - " + "| bwa mem -K 150000000 -Y -t {params.bwa_threads} -p {input.ref} - " "| fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_fgbio}m --compression 0 --async-io ZipperBams " "--unmapped {input.bam} " "--ref {input.ref} " "--tags-to-reverse Consensus " "--tags-to-revcomp Consensus " - "| samtools sort --threads {threads} -m{resources.mem_samtools}m -o {output.bam}##idx##{output.bai} --write-index " + "| samtools sort --threads {params.samtools_threads} -m{resources.mem_samtools}m -T {resources.tmpdir} -o {output.bam}##idx##{output.bai} --write-index " ") &> {log} " From aa992398b5cec2f27b3e5a2fc4f84b98e9524095 Mon Sep 17 00:00:00 2001 From: sahays Date: Sat, 22 Jun 2024 12:41:01 +0200 Subject: [PATCH 06/11] added a tmp dir argument to the sort index --- workflow/rules/recalibration.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk index 9353c33..9ab2905 100755 --- a/workflow/rules/recalibration.smk +++ b/workflow/rules/recalibration.smk @@ -52,5 +52,5 @@ rule sort_index: "Sorting and indexing recalibrated bam file" shell: " ( " - " samtools sort --threads 8 -o {output.bam}##idx##{output.bai} {input.bam} --write-index" + " samtools sort --threads 8 -o {output.bam}##idx##{output.bai} {input.bam} -T {resources.tmpdir} --write-index" " ) &> {log} " From 
4ea38c6192116fee6be10a47f0ebfbe217ec9a8a Mon Sep 17 00:00:00 2001 From: sahays Date: Sat, 22 Jun 2024 12:42:50 +0200 Subject: [PATCH 07/11] Split the sort and and filter rules as sort takes a lot more time and could not work with 8 threads in parallel leading to large performance loss --- workflow/rules/umi_based_rules.smk | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/workflow/rules/umi_based_rules.smk b/workflow/rules/umi_based_rules.smk index f6d089e..5cc2bfe 100755 --- a/workflow/rules/umi_based_rules.smk +++ b/workflow/rules/umi_based_rules.smk @@ -107,9 +107,32 @@ rule call_consensus_reads: "&> {log}" -rule filter_consensus_reads: +rule sort_name_index: input: bam=wrkdir / "alignments" / "{sample}.cons.unmapped.bam", + output: + bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.sorted.bam"), + conda: + "../envs/samtools.yaml" + threads: 8 + resources: + mem_mb=8000, + runtime=24 * 60, + nodes=1, + tmpdir=scratch_dir, + log: + logdir / "samtools/{sample}_sort_name.log", + message: + "Sorting and indexing concensus bam file" + shell: + " ( " + " samtools sort --threads 8 -n -u -T {resources.tmpdir} -o {output.bam} {input.bam} " + " ) &> {log} " + + +rule filter_consensus_reads: + input: + bam=wrkdir / "alignments" / "{sample}.cons.unmapped.sorted.bam", ref=genome, output: bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.bam"), @@ -133,9 +156,8 @@ rule filter_consensus_reads: "Filtering consensus reads and sorting into coordinate order." shell: "(" - "samtools sort -n -u {input.bam} | " "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 FilterConsensusReads " - "--input /dev/stdin " + "--input {input.bam} " "--output {output.bam} " "--ref {input.ref} " "--min-reads {params.min_reads} " From 5b077a39050b8869037ac5137def28321310d9e3 Mon Sep 17 00:00:00 2001 From: sahays Date: Sun, 14 Jul 2024 19:37:50 +0200 Subject: [PATCH 08/11] Removed checkpoint file after merge, causes greater load on the filesystem, run with --notemp to keep it --- workflow/Snakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index de314d4..1cd7a26 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -25,10 +25,10 @@ include: "rules/recalibration.smk" #done rule all: input: - expand( - wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", - sample=config["sample"], - ), + # expand( + # wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", + # sample=config["sample"], + # ), expand( wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam", sample=config["sample"], From 93c7891f8aaa72bc7b820d9d5334837459111403 Mon Sep 17 00:00:00 2001 From: sahays Date: Sun, 14 Jul 2024 19:41:11 +0200 Subject: [PATCH 09/11] Increased memory required for each rule for faster processing, merged consensus calling and filtering to reduce load on the file system, removed sort_name_index as a rule it was redundant an not required, --- workflow/rules/umi_based_rules.smk | 56 +++++++++++++++++++----------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/workflow/rules/umi_based_rules.smk b/workflow/rules/umi_based_rules.smk index 5cc2bfe..bf6e389 100755 --- a/workflow/rules/umi_based_rules.smk +++ b/workflow/rules/umi_based_rules.smk @@ -15,9 +15,9 @@ rule fix_mate: ), output: bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"), - threads: 2 + threads: 1 resources: - mem_mb=8000, + mem_mb=50000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, @@ 
-28,11 +28,11 @@ rule fix_mate: message: "Fixing mate information if required" shell: - "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation " + "(fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation " "--input {input.bam} " "--output {output.bam} " - "--allow-missing-mates true" - "&> {log} " + "--allow-missing-mates true )" + " &> {log} " rule group_reads: @@ -49,7 +49,7 @@ rule group_reads: min_mapq=group_min_mapq, threads: 2 resources: - mem_mb=8000, + mem_mb=50000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, @@ -70,21 +70,28 @@ rule group_reads: "&> {log} " -rule call_consensus_reads: +rule call_filter_consensus_reads: input: bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam", + ref=genome, output: - bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.bam"), + bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.bam"), metrics=logdir / "fgbio/call_consensus_reads.{sample}.log", params: min_reads=consensus_min_reads, - min_base_qual=consensus_min_base_qual, min_input_base_mapq=consensus_min_input_base_mapq, error_rate_post_umi=consensus_error_rate_post_umi, error_rate_pre_umi=consensus_error_rate_pre_umi, - threads: 4 + filter_min_reads=filter_min_reads, + min_base_qual=filter_min_base_qual, + max_base_error_rate=filter_max_base_error_rate, + max_read_error_rate=filter_max_read_error_rate, + max_no_call_fraction=filter_max_no_call_fraction, + memory_consensus=25000, + memory_filter=25000, + threads: 24 resources: - mem_mb=4000, + mem_mb=50000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, @@ -95,15 +102,22 @@ rule call_consensus_reads: message: "Calling consensus reads from grouped reads." shell: - "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 0 CallMolecularConsensusReads " + "( fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{params.memory_consensus}m --compression 0 CallMolecularConsensusReads " "--input {input.bam} " - "--output {output.bam} " + "--output /dev/stdout " "--min-reads {params.min_reads} " "--error-rate-pre-umi {params.error_rate_pre_umi} " "--error-rate-post-umi {params.error_rate_post_umi} " - "--min-consensus-base-quality {params.min_base_qual} " "--min-input-base-quality {params.min_input_base_mapq} " - "--threads {threads} " + "--threads {threads} | fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{params.memory_filter}m --compression 1 FilterConsensusReads " + "--input /dev/stdin " + "--output {output.bam} " + "--ref {input.ref} " + "--min-reads {params.filter_min_reads} " + "--max-read-error-rate {params.max_read_error_rate} " + "--max-base-error-rate {params.max_base_error_rate} " + "--min-base-quality {params.min_base_qual} " + "--max-no-call-fraction {params.max_no_call_fraction} ) " "&> {log}" @@ -115,8 +129,10 @@ rule sort_name_index: conda: "../envs/samtools.yaml" threads: 8 + params: + mem_thread=8000, resources: - mem_mb=8000, + mem_mb=8 * 8000, runtime=24 * 60, nodes=1, tmpdir=scratch_dir, @@ -126,16 +142,16 @@ rule sort_name_index: "Sorting and indexing concensus bam file" shell: " ( " - " samtools sort --threads 8 -n -u -T {resources.tmpdir} -o {output.bam} {input.bam} " + " samtools sort --threads 8 -m{params.mem_thread}m -n -u -T {resources.tmpdir} -o {output.bam} {input.bam} " " ) &> {log} " rule filter_consensus_reads: input: - bam=wrkdir / "alignments" / "{sample}.cons.unmapped.sorted.bam", + bam=wrkdir / "alignments" / "{sample}.cons.unmapped#.bam", ref=genome, 
output: - bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.bam"), + bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered#.bam"), params: min_reads=filter_min_reads, min_base_qual=filter_min_base_qual, @@ -144,7 +160,7 @@ rule filter_consensus_reads: max_no_call_fraction=filter_max_no_call_fraction, threads: 8 resources: - mem_mb=8000, + mem_mb=50000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, From 4b5af2c424827b82c9c12699645e934f79df91f2 Mon Sep 17 00:00:00 2001 From: sahays Date: Sun, 14 Jul 2024 19:51:23 +0200 Subject: [PATCH 10/11] Changed defaults for filtering reads --- workflow/rules/alignment.smk | 20 ++++++++++---------- workflow/rules/common.smk | 14 ++++++++++---- workflow/rules/metric_wgs_coverage.smk | 2 +- workflow/rules/recalibration.smk | 8 +++++--- workflow/scripts/getConsensusMetrics.py | 2 +- 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index f70bd15..2224d6e 100755 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -161,9 +161,9 @@ rule bwa_map: ), output: temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam"), - threads: 8 + threads: 24 resources: - mem_mb=15000, + mem_mb=24000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, @@ -205,8 +205,8 @@ rule merge: lane=LANE, ), output: - bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam", - threads: 8 + bam=temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"), + threads: 24 resources: mem_mb=8000, runtime=72 * 60, @@ -219,7 +219,7 @@ rule merge: log: logdir / "samtools/{sample}_merge.log", shell: - "(samtools merge -f {output.bam} {input}) &> {log} " + "(samtools merge --threads {threads} -f {output.bam} {input}) &> {log} " rule realign: @@ -232,17 +232,17 @@ rule realign: output: bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam"), bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"), - threads: 12 + threads: 28 resources: - mem_mb=22000, # 8GB for BWA, 4GB for fgbio, 8GB for samtools sort and an overhead memory of 2GB + mem_mb=80000, # 8GB for BWA, 4GB for fgbio, 64GB for samtools sort and an overhead memory of 2GB runtime=72 * 60, nodes=1, mem_fgbio=4000, - mem_samtools=2000, + mem_samtools=8000, tmpdir=scratch_dir, params: - samtools_threads=4, - bwa_threads=8, + samtools_threads=8, + bwa_threads=24, conda: "../envs/fgbio.yaml" log: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b5862a6..8999674 100755 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -123,6 +123,9 @@ else: scratch_dir = config[ "scratch_dir" ] # snakemake doesnt accept path in resources has to be a string + if not os.path.exists(scratch_dir): + print(get_data_time(), "Scratch directory does not exist") + os.makedirs(scratch_dir) print(get_data_time(), "Setting working directory to %s" % wrkdir) print(get_data_time(), "Setting log directory to %s" % logdir) @@ -270,11 +273,12 @@ else: filter_min_reads = config["filter_min_reads"] if "filter_min_base_qual" not in config: - print(get_data_time(), "Setting default value for filter_min_base_qual to 40") - filter_min_base_qual = 40 + filter_min_base_qual = 10 else: filter_min_base_qual = config["filter_min_base_qual"] +print(get_data_time(), "Setting filter_min_base_qual to %s" % filter_min_base_qual) + if "filter_max_base_error_rate" not in config: print( get_data_time(), "Setting default value for filter_max_base_error_rate to 0.2" @@ -284,10 +288,12 @@ else: 
filter_max_base_error_rate = config["filter_max_base_error_rate"] if "filter_max_read_error_rate" not in config: + filter_max_read_error_rate = 0.1 print( - get_data_time(), "Setting default value for filter_max_base_error_rate to 0.1" + get_data_time(), + "Setting default value for filter_max_base_error_rate to %s " + % filter_max_read_error_rate, ) - filter_max_read_error_rate = 0.1 else: filter_max_read_error_rate = config["filter_max_read_error_rate"] diff --git a/workflow/rules/metric_wgs_coverage.smk b/workflow/rules/metric_wgs_coverage.smk index 4fe5378..8666453 100755 --- a/workflow/rules/metric_wgs_coverage.smk +++ b/workflow/rules/metric_wgs_coverage.smk @@ -7,7 +7,7 @@ rule coveragePlot: plot=wrkdir / "metrics" / "{sample}_coverage.png", threads: 20 resources: - mem_mb=10000, + mem_mb=60000, runtime=24 * 60, nodes=1, tmpdir=scratch_dir, diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk index 9ab2905..fd33af2 100755 --- a/workflow/rules/recalibration.smk +++ b/workflow/rules/recalibration.smk @@ -12,7 +12,7 @@ rule baseRecalibrator: "../envs/gatk.yaml" threads: 8 resources: - mem_mb=8000, + mem_mb=50000, runtime=72 * 60, nodes=1, tmpdir=scratch_dir, @@ -41,8 +41,10 @@ rule sort_index: conda: "../envs/samtools.yaml" threads: 8 + params: + mem_thread=8000, resources: - mem_mb=8000, + mem_mb=8 * 8000, runtime=24 * 60, nodes=1, tmpdir=scratch_dir, @@ -52,5 +54,5 @@ rule sort_index: "Sorting and indexing recalibrated bam file" shell: " ( " - " samtools sort --threads 8 -o {output.bam}##idx##{output.bai} {input.bam} -T {resources.tmpdir} --write-index" + " samtools sort --threads 8 -m{params.mem_thread}m -o {output.bam}##idx##{output.bai} {input.bam} -T {resources.tmpdir} --write-index" " ) &> {log} " diff --git a/workflow/scripts/getConsensusMetrics.py b/workflow/scripts/getConsensusMetrics.py index 3706538..81c2128 100755 --- a/workflow/scripts/getConsensusMetrics.py +++ b/workflow/scripts/getConsensusMetrics.py @@ -18,7 +18,7 @@ else: line=line.split(']')[-1].split('(')[0].replace('.', '').replace(i, '').replace(',','').replace(' ', '').strip() line=int(line) - consensus_df.loc['T-NHL_98_malign', i]=line + consensus_df.loc[snakemake.params['sample'], i]=line break consensus_df.to_csv(snakemake.output[0], sep='\t', index=True) From 65dbd6bc9bb2d40d765b8bdaebc3058a4536945d Mon Sep 17 00:00:00 2001 From: sahays Date: Tue, 23 Jul 2024 15:42:21 +0200 Subject: [PATCH 11/11] Added max coverage as an parameter for hsmetrics --- .pre-commit-config.yaml | 2 ++ workflow/rules/common.smk | 7 +++++++ workflow/rules/metric_hsmetrics.smk | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7baf2b1..9f1cdd9 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,3 +13,5 @@ repos: rev: v0.10.0 # Replace by any tag/version ≥0.2.4 : https://github.com/snakemake/snakefmt/releases hooks: - id: snakefmt +ci: + autoupdate_schedule: quarterly diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 8999674..6cf7050 100755 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -306,6 +306,13 @@ else: filter_max_no_call_fraction = config["filter_max_no_call_fraction"] +if "max_coverage" not in config: + print(get_data_time(), "Setting default value for max_coverage to 10000") + max_coverage = 10000 +else: + max_coverage = config["max_coverage"] + + # Create a set of valid combination for run lane and read ids LANE = metadata["LANE_NO"].unique().tolist() diff 
--git a/workflow/rules/metric_hsmetrics.smk b/workflow/rules/metric_hsmetrics.smk index 8adfb11..cfc8c9c 100755 --- a/workflow/rules/metric_hsmetrics.smk +++ b/workflow/rules/metric_hsmetrics.smk @@ -62,6 +62,8 @@ if seq_type in ["Panel", "WES"]: genome=genome, output: target=wrkdir / "metrics" / "{sample}.hs_metrics.txt", + params: + max_coverage=max_coverage, conda: "../envs/gatk.yaml" threads: 1 @@ -75,4 +77,4 @@ if seq_type in ["Panel", "WES"]: log: logdir / "picard/{sample}.hs_metrics.log", shell: - "gatk CollectHsMetrics -I {input.bam} -O {output.target} -R {input.genome} -TI {input.target_intervals} -BI {input.bait_intervals} &> {log}" + "gatk CollectHsMetrics -I {input.bam} -O {output.target} -R {input.genome} -TI {input.target_intervals} -BI {input.bait_intervals} --COVERAGE_CAP {params.max_coverage} &> {log}"