diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
index 0b9ea32..945d63d
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ config/S*.yaml
 .snakemake/
 Singularity/
 conda_envs/
+conda_envs.bkp/
+
 conda_envs_ohne_singularity/
 profile/config.yaml.bkp
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
old mode 100644
new mode 100755
index 7baf2b1..9f1cdd9
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,3 +13,5 @@ repos:
     rev: v0.10.0 # Replace by any tag/version ≥0.2.4 : https://github.com/snakemake/snakefmt/releases
     hooks:
       - id: snakefmt
+ci:
+  autoupdate_schedule: quarterly
diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 699aa83..cc6c102
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The pipeline was tested and supported by Daniel Steiert.
 
 The pipeline is made for aligning UMI based WGS/WES and Panel Seq data and to compute the QC metrics associated with it.
 
-We require the sequencing is performed in paired end mode and must contain R1 (forward read) R2 (UMI) and R3 (reverse read) for each lane and run
+We require that sequencing be performed in paired-end mode.
 
 Currently the ability to provide support is limited.
 
@@ -93,15 +93,40 @@ Please modify the entry for
 16. `dict_genome`: An absolute path to dict file for the given genomes, ignored if `SeqType` is `WGS`
 
+17. `group_allowed_edits`: Number of edits allowed when grouping reads by UMI. Defaults to 0; should be set to 0 if `correct_umi` is true
+18. `group_min_mapq`: Sets `--min-map-q` of fgbio `GroupReadsByUmi`. Defaults to 20
+19. `group_strategy`: Sets the `--strategy` parameter of fgbio `GroupReadsByUmi`. Defaults to `Adjacency`
+20. `consensus_min_reads`: 1
+21. `consensus_min_base_qual`: 2
+22. `consensus_min_input_base_mapq`: 10
+23. `consensus_error_rate_pre_umi`: 45
+24. `consensus_error_rate_post_umi`: 30
+
+
+25. `filter_min_reads`: 3
+26. `filter_min_base_qual`: 2
+27. `filter_max_base_error_rate`: 0.1
+28. `filter_max_read_error_rate`: 0.05
+29. `filter_max_no_call_fraction`: 0.2
+
+
+30. `read_structure`: Read structure passed to fgbio when the UMI is contained in R1 and R2 (e.g. `8M143T 8M143T`, i.e. 8 UMI bases followed by 143 template bases in each read). Leave unset when the UMI is delivered as a separate R2 file
+
+
+31. `correct_umi`: True
+32. `correct_umi_max_mismatches`: 3
+33. `correct_umi_min_distance`: 1
+34. `umi_file`: Absolute path to a file of expected UMI sequences; required if `correct_umi` is true
+
 ## Metadata file
 
-Please create a metadata file with columns
+Please create a metadata file with the following columns; other columns may exist but are not required:
 
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column containing information with regards to R1, R2 and R3 of the sequencing files. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present
-4. SAMPLE_NAME: Containing the sample name which is inputed in the config file. Please note if the metadata file consists of multipe samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
+4. SAMPLE_TYPE: Containing the sample name which is entered in the config file. Please note that if the metadata file contains multiple samples, only the sample/pid combination mentioned in the `sample` and `pid` directives of the config.yaml will be run
 5. PATIENT_ID: Containing the ```pid``` which is entered in the config file. Please note that if the metadata file contains multiple PIDs, only the ```sample``` ```pid``` combination mentioned in the sample and pid directives of the config.yaml will be run.
 6. RUN_ID: Please mention the run ID for the sequencing run for the sample
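+
+As an illustration, a minimal metadata file (all paths and names below are invented) could look like:
+
+```csv
+FASTQ_FILE,READ,LANE_NO,SAMPLE_TYPE,PATIENT_ID,RUN_ID
+/data/run_0001/S1_R1_L001.fastq.gz,R1,L_001,SAMPLE1-TUMOR,PID_1,run_0001
+/data/run_0001/S1_R2_L001.fastq.gz,R2,L_001,SAMPLE1-TUMOR,PID_1,run_0001
+/data/run_0001/S1_R3_L001.fastq.gz,R3,L_001,SAMPLE1-TUMOR,PID_1,run_0001
+```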
diff --git a/config/config.yaml b/config/config.yaml
old mode 100644
new mode 100755
index 5919356..36a31a4
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,3 +42,31 @@ bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions f
 chrom_sizes: /Path/to/chrom_sizes.tsv # Tab separated file containing the chromosome names and their lengths should match the reference genome
 dict_genome: /Path/to/genome.dict # Dictionary file for the reference genome
+
+# Parameters for read grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+
+# Correct UMIs based on a list of expected UMIs
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /path/to/umi.txt
diff --git a/config/test_config_exliquid.yaml b/config/test_config_exliquid.yaml
new file mode 100755
index 0000000..ac9ba5e
--- /dev/null
+++ b/config/test_config_exliquid.yaml
@@ -0,0 +1,53 @@
+Adapter_R1: []
+Adapter_R3: []
+SeqType: Panel
+chrom_sizes: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/stats/hg19_chrTotalLength.tsv
+
+dbsnp: /applications/otp/ngs_share_complete/assemblies/hg19_GRCh37_1000genomes/databases/dbSNP/dbSNP_147/00-All.vcf.gz
+dict_genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.dict
+
+genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.fa
+
+library_prep_kit: IDT_xGen_cfDNA_FFPE
+
+log_dir: /dh-projects/exliquid/scratch/results_alignment_test/logs/
+
+metadata: /dh-projects/exliquid/scratch/input/bare_min_meta_data.csv
+
+pid: EXLIQUID_EX59
+sample: EXLIQUID_EX59-BUFFYCOAT_control
+
+target_regions: /dh-projects/exliquid/raw_data/reference/Targets-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+bait_regions: /dh-projects/exliquid/raw_data/reference/Probes-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+# trim_adapters: false
+
+work_dir: /dh-projects/exliquid/scratch/results_alignment_test/
+
+# bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions for the analysis only required if SeqType is Panel/WES, if not provided a bait file will be generated from the target file
+
+# Parameters for read grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+# Correct UMIs based on a list of expected UMIs
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /dh-projects/exliquid/raw_data/reference/umi_list.txt
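Both configs above point `umi_file` at the UMI list consumed by fgbio's `CorrectUmis` (via `--umi-files`, see the rule later in this diff): a plain-text file of expected UMI sequences, one per line. An illustrative example matching the 8-base `8M...` read structure above (sequences invented):

```text
AATCGGCT
CCTGATGA
GGAACTGT
```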
diff --git a/profile/config.yaml b/profile/config.yaml
old mode 100644
new mode 100755
index b38fca9..dbfda83
--- a/profile/config.yaml
+++ b/profile/config.yaml
@@ -3,14 +3,15 @@
 jobs: 10
 latency-wait: 60
 reason: True
-
+# default-resources:
+#   slurm_partition: master-fasttrack
 keep-going: True
 printshellcmds: True
 rerun-incomplete: True
 restart-times: 2
-
 # delete-temp-output: True
-conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs
+conda-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/conda_envs
 # conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs_ohne_singularity
-singularity-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/Singularity
-singularity-args: "-B /dh-projects/richter_transformation:/dh-projects/richter_transformation,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy"
+singularity-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/Singularity
+singularity-args: "-B /dh-projects/ag-ishaque:/dh-projects/ag-ishaque,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy,/dh-projects/otp:/dh-projects/otp"
+# /dh-projects/exliquid:/dh-projects/exliquid,
diff --git a/workflow/Snakefile b/workflow/Snakefile
old mode 100644
new mode 100755
index 9a8334e..1cd7a26
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,7 +1,9 @@
 import os
+from datetime import datetime
 import pandas as pd
 from pathlib import Path
 from itertools import product
+import tempfile
 
 
 container: "docker://condaforge/mambaforge"
@@ -9,10 +11,10 @@ container: "docker://condaforge/mambaforge"
 
 include: "rules/common.smk" #done
 include: "rules/create_links.smk"
+include: "rules/umi_based_rules.smk" #done
 include: "rules/adapter_trimming.smk" #done
 include: "rules/alignment.smk" #done
-include: "rules/umi_consensus.smk" #done
-include: "rules/duplicate_marking.smk" #done
+# include: "rules/duplicate_marking.smk" #done
 include: "rules/flagstatt.smk" #done
 include: "rules/metric_hsmetrics.smk" #done
 include: "rules/metric_insert_size.smk" #done
@@ -23,6 +25,10 @@ include: "rules/recalibration.smk" #done
 
 rule all:
     input:
+        # expand(
+        #     wrkdir / "alignments" / "{sample}_merged_umi_annot.bam",
+        #     sample=config["sample"],
+        # ),
         expand(
             wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam",
             sample=config["sample"],
@@ -35,11 +41,10 @@
             wrkdir / "metrics" / "{sample}.mosdepth.global.dist.txt",
             sample=config["sample"],
         ),
-        # expand(wrkdir / "metrics" / "{sample}.flagstat", sample=config["sample"]),
         expand(
             wrkdir / "metrics" / "{sample}_{ext}.flagstat",
             sample=config["sample"],
-            ext=["dedup.recall.sorted", "merged_umi_annot"],
+            ext=["dedup.recall.sorted"],
        ),
        expand(
            wrkdir / "metrics" / "{sample}_insert_size_metrics.txt",
@@ -64,3 +69,20 @@
        expand(wrkdir / "metrics" / "{sample}.hs_metrics.txt", sample=config["sample"])
        if config["SeqType"] in ["Panel", "WES"]
        else [],
+        expand(
+            wrkdir
+            / "metrics"
+            / "correct_umi"
+            / "{run_id}"
+            / "{sample}_{lane}_umi_metrics.txt",
+            filtered_product,
+            run_id=RUN_ID,
+            sample=config["sample"],
+            lane=LANE,
+        )
+        if correct_umi
+        else [],
+        expand(
+            wrkdir / "metrics" / "{sample}_consensus_metrics.tsv",
+            sample=config["sample"],
+        ),
diff --git a/workflow/envs/consensus.yaml b/workflow/envs/consensus.yaml
new file mode 100755
index 0000000..ff898d8
--- /dev/null
+++ b/workflow/envs/consensus.yaml
@@ -0,0 +1,6 @@
+name: consensus
+channels:
+  - conda-forge
+dependencies:
+  - pandas=2.2.2
+  - python=3.11
diff --git a/workflow/envs/coveragePlot.yaml b/workflow/envs/coveragePlot.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/cutadapt.yaml b/workflow/envs/cutadapt.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/fgbio.yaml b/workflow/envs/fgbio.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/gatk.yaml b/workflow/envs/gatk.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/mosdepth.yaml b/workflow/envs/mosdepth.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/sambamba.yaml b/workflow/envs/sambamba.yaml
old mode 100644
new mode 100755
diff --git a/workflow/envs/samtools.yaml b/workflow/envs/samtools.yaml
old mode 100644
new mode 100755
diff --git a/workflow/rules/adapter_trimming.smk b/workflow/rules/adapter_trimming.smk
index 0778088..4395e46 100755
--- a/workflow/rules/adapter_trimming.smk
+++ b/workflow/rules/adapter_trimming.smk
@@ -1,103 +1,80 @@
 # disabling adapter trimming when set to false,
 # To maintain the same input and output files we create links instead of running cutadapt
-if config["trim_adapters"]:
-    rule create_adapter_fastq:
-        params:
-            adapt_1=adapter_seq_r1,
-            adapt_3=adapter_seq_r3,
-        output:
-            adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
-            adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
-        threads: 1
-        resources:
-            mem_mb=1000,
-            runtime=20,
-            nodes=1,
-        message:
-            "Creating adapter fastq files"
-        run:
-            with open(output.adapt_1, "w") as handle:
-                count = 1
-                for i in params.adapt_1:
-                    handle.write(">adapter_" + str(count) + "\n")
-                    handle.write(i + "\n")
-                    count += 1
+rule create_adapter_fastq:
+    params:
+        adapt_1=adapter_seq_r1,
+        adapt_3=adapter_seq_r3,
+    output:
+        adapt_1=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq"),
+        adapt_3=temp(wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq"),
+    threads: 1
+    resources:
+        mem_mb=1000,
+        runtime=20,
+        nodes=1,
+        tmpdir=scratch_dir,
+    message:
+        "Creating adapter fastq files"
+    run:
+        with open(output.adapt_1, "w") as handle:
+            count = 1
+            for i in params.adapt_1:
+                handle.write(">adapter_" + str(count) + "\n")
+                handle.write(i + "\n")
+                count += 1
 
-            with open(output.adapt_3, "w") as handle:
-                count = 1
-                for i in params.adapt_3:
-                    handle.write(">adapter_" + str(count) + "\n")
-                    handle.write(i + "\n")
-                    count += 1
+        with open(output.adapt_3, "w") as handle:
+            count = 1
+            for i in params.adapt_3:
+                handle.write(">adapter_" + str(count) + "\n")
+                handle.write(i + "\n")
+                count += 1
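+
+# Note: although the outputs above are named *.fastq, the records written are
+# FASTA-formatted (a ">adapter_N" header line followed by the sequence), which
+# is the format cutadapt's `-a file:...`/`-A file:...` options consume.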
"{sample}_R3_{lane}.fastq.gz", - output: - fastq_r1=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R1_{lane}_trim.fastq.gz" - ), - fastq_r3=temp( - wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R3_{lane}_trim.fastq.gz" - ), - log: - logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log", - threads: 1 - resources: - mem_mb=1000, - runtime=60, - nodes=1, - message: - "Skipping adapter trimming, creating links instead of running cutadapt" - conda: - "../envs/cutadapt.yaml" - shell: - "(ln -s {input.fastq_r1} {output.fastq_r1} && ln -s {input.fastq_r3} {output.fastq_r3}) &> {log}" +rule cutadapt: + input: + adapt_1=wrkdir / "{sample}" / "cutadapt" / "adapt_1.fastq", + adapt_3=wrkdir / "{sample}" / "cutadapt" / "adapt_3.fastq", + fastq_r1=wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz", + fastq_r3=( + wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz" + if read_structure + else wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz" + ), + output: + fastq_r1=temp( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R1_{lane}_trim.fastq.gz" + ), + fastq_r3=temp( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R2_{lane}_trim.fastq.gz" + if read_structure + else wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R3_{lane}.fastq.gz" + ), + log: + logdir / "cutadapt/{run_id}_{sample}_R1_{lane}.log", + threads: 8 + resources: + mem_mb=8000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/cutadapt.yaml" + message: + "Trimming adapters using cutadapt" + shell: + "cutadapt -j {threads} -a file:{input.adapt_1} -A file:{input.adapt_3} -o {output.fastq_r1} -p {output.fastq_r3} {input.fastq_r1} {input.fastq_r3} &> {log}" diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index 86d1746..2224d6e 100755 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -6,25 +6,53 @@ rule fastqbam: """ input: genome=genome, - fastq_r1=wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R1_{lane}_trim.fastq.gz", - fastq_r3=wrkdir - / "fastq" - / "{run_id}" - / "cutadapt" - / "{sample}_R3_{lane}_trim.fastq.gz", + fastq_r1=( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R1_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / "{sample}_R1_{lane}.fastq.gz") + ), + fastq_r3=( + ( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R2_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz") + ) + if read_structure + else ( + ( + wrkdir + / "fastq" + / "{run_id}" + / "cutadapt" + / "{sample}_R3_{lane}_trim.fastq.gz" + ) + if config["trim_adapters"] + else (wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz") + ) + ), output: temp(wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"), params: library=library_prep_kit, - threads: 8 + read_structure="--read-structures " + read_structure if read_structure else "", + threads: 1 resources: mem_mb=8000, - runtime=24 * 60, + runtime=72 * 60, nodes=1, + tmpdir=scratch_dir, conda: "../envs/fgbio.yaml" log: @@ -33,14 +61,89 @@ rule fastqbam: "Converting fastq to bam to assign read group and library information." 
     shell:
         "("
-        "fgbio -Xmx{resources.mem_mb}m --compression 1 FastqToBam "
+        "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 FastqToBam "
         "--input {input.fastq_r1} {input.fastq_r3} "
         "--sample {wildcards.sample} "
         "--library {params.library} "
-        "--output {output} "
+        "--output {output} {params.read_structure} "
         ") &> {log}"
 
 
+if correct_umi:
+
+    rule CorrectUMI:
+        input:
+            bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam",
+            umi_file=umi_file,
+        output:
+            bam=temp(
+                wrkdir
+                / "fastq"
+                / "{run_id}"
+                / "{sample}_{lane}_unmapped_umi_corrected.bam"
+            ),
+            metrics=wrkdir
+            / "metrics"
+            / "correct_umi"
+            / "{run_id}"
+            / "{sample}_{lane}_umi_metrics.txt",
+        params:
+            max_mismatches=correct_umi_max_mismatches,
+            min_distance=correct_umi_min_distance,
+        threads: 1
+        resources:
+            mem_mb=2000,
+            runtime=72 * 60,
+            nodes=1,
+            tmpdir=scratch_dir,
+        conda:
+            "../envs/fgbio.yaml"
+        log:
+            logdir / "fgbio" / "correct_umi_{run_id}_{sample}_{lane}.log",
+        message:
+            "Correcting UMIs."
+        shell:
+            "("
+            "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io CorrectUmis "
+            "--input {input.bam} "
+            "--output {output.bam} "
+            "--max-mismatches {params.max_mismatches} "
+            "--min-distance {params.min_distance} "
+            "--umi-files {input.umi_file} "
+            "--metrics {output.metrics} "
+            "--dont-store-original-umis ) &> {log}"
+
+
+if not read_structure:
+
+    rule AnnotateUMI:
+        input:
+            alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
+            fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz",
+        output:
+            temp(
+                wrkdir
+                / "alignments"
+                / "{run_id}"
+                / "{sample}_aln_{lane}_umi_annot.bam"
+            ),
+        threads: 1
+        resources:
+            mem_mb=8000,
+            runtime=72 * 60,
+            nodes=1,
+            tmpdir=scratch_dir,
+        conda:
+            "../envs/fgbio.yaml"
+        log:
+            logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log",
+        message:
+            "Annotating BAM with UMIs from fastq file."
+        shell:
+            "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}"
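+
+# The two UMI routes meet here: with `read_structure` set, FastqToBam has
+# already placed the UMI in the unmapped BAM (and CorrectUmis above can fix it
+# against the expected list), so no annotation step is needed. Without a read
+# structure, the UMI travels as a separate R2 fastq and AnnotateBamWithUmis
+# copies it into the RX tag (qualities into UQ) of each aligned read before
+# merging.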
+
+
 rule bwa_map:
     """
     First pass alignment
@@ -48,14 +151,22 @@
     """
     input:
         genome=genome,
-        bam=wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam",
+        bam=(
+            wrkdir
+            / "fastq"
+            / "{run_id}"
+            / "{sample}_{lane}_unmapped_umi_corrected.bam"
+            if correct_umi
+            else wrkdir / "fastq" / "{run_id}" / "{sample}_{lane}_unmapped.bam"
+        ),
     output:
         temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam"),
-    threads: 8
+    threads: 24
     resources:
-        mem_mb=12000,
-        runtime=24 * 60,
+        mem_mb=24000,
+        runtime=72 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     conda:
         "../envs/fgbio.yaml"
     message:
@@ -66,7 +177,7 @@
         "("
         "samtools fastq {input.bam} "
         "| bwa mem -Y -K 150000000 -t {threads} -p {input.genome} - "
-        "| fgbio -Xmx4G --compression 1 --async-io ZipperBams "
+        "| fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx4G --compression 1 --async-io ZipperBams "
         "--unmapped {input.bam} "
         "--ref {input.genome} "
        "--output {output} "
@@ -79,6 +190,14 @@
     """
     input:
         expand(
+            wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
+            filtered_product,
+            run_id=RUN_ID,
+            sample=config["sample"],
+            lane=LANE,
+        )
+        if read_structure
+        else expand(
             wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam",
             filtered_product,
             run_id=RUN_ID,
             sample=config["sample"],
             lane=LANE,
         ),
     output:
-        temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"),
-    threads: 8
+        bam=temp(wrkdir / "alignments" / "{sample}_merged_umi_annot.bam"),
+    threads: 24
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     conda:
         "../envs/samtools.yaml"
     message:
@@ -99,7 +219,7 @@
     log:
         logdir / "samtools/{sample}_merge.log",
     shell:
-        "samtools merge -f {output} {input} &> {log}"
+        "(samtools merge --threads {threads} -f {output.bam} {input}) &> {log} "
 
 
 rule realign:
@@ -112,12 +232,17 @@
     output:
         bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam"),
         bai=temp(wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam.bai"),
-    threads: 8
+    threads: 28
     resources:
-        mem_mb=16000,
-        runtime=24 * 60,
+        mem_mb=80000,  # 8GB for BWA, 4GB for fgbio, 64GB for samtools sort and an overhead memory of 2GB
+        runtime=72 * 60,
         nodes=1,
-        mem_fgbio=8000,
+        mem_fgbio=4000,
+        mem_samtools=8000,
+        tmpdir=scratch_dir,
+    params:
+        samtools_threads=8,
+        bwa_threads=24,
     conda:
         "../envs/fgbio.yaml"
     log:
@@ -127,11 +252,11 @@
         "("
         "samtools fastq {input.bam} "
-        "| bwa mem -K 150000000 -Y -t {threads} -p {input.ref} - "
-        "| fgbio -Xmx{resources.mem_fgbio}m --compression 0 --async-io ZipperBams "
+        "| bwa mem -K 150000000 -Y -t {params.bwa_threads} -p {input.ref} - "
+        "| fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_fgbio}m --compression 0 --async-io ZipperBams "
         "--unmapped {input.bam} "
         "--ref {input.ref} "
         "--tags-to-reverse Consensus "
         "--tags-to-revcomp Consensus "
-        "| samtools sort --threads {threads} -o {output.bam}##idx##{output.bai} --write-index "
+        "| samtools sort --threads {params.samtools_threads} -m{resources.mem_samtools}m -T {resources.tmpdir} -o {output.bam}##idx##{output.bai} --write-index "
         ") &> {log} "
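For orientation, the alignment rules above all follow fgbio's unmapped-BAM pattern: read group, library and UMI tags live in an unmapped BAM, the reads are streamed through bwa, and ZipperBams merges the alignment records back together with the tag-carrying unmapped BAM. A condensed, illustrative sketch of that pattern (not the literal pipeline command; file names invented):

```sh
# metadata-carrying unmapped BAM -> fastq -> aligner -> zip metadata back in
samtools fastq unmapped.bam \
  | bwa mem -p -Y -t 8 ref.fa - \
  | fgbio ZipperBams --unmapped unmapped.bam --ref ref.fa --output mapped.bam
```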
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
old mode 100644
new mode 100755
index ef7d139..6cf7050
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,54 +1,50 @@
-if "library_prep_kit" in config:
-    library_prep_kit = config["library_prep_kit"]
-    if library_prep_kit == "":
-        library_prep_kit = "Unknown"
-else:
-    library_prep_kit = "Unknown"
+print(
+    """
+    \tAlignment pipeline for UMI based sequencing reads
+    \tAuthor: Shashwat Sahay
+    \tEmail: shashwat.sahay@charite.de
+    \tVersion: 0.2.0
 
-if "sample" not in config:
-    raise ValueError("Sample name not provided")
+    """
+)
 
-if "pid" not in config:
-    raise ValueError("Patient ID not provided")
 
-if "genome" not in config:
-    raise ValueError("Genome not provided")
-else:
-    genome = config["genome"]
+def get_date_time():
+    now = datetime.now()
 
-if "work_dir" not in config:
-    raise ValueError("Work directory not provided")
-else:
-    wrkdir = Path(config["work_dir"])
+    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
+    return "========= " + dt_string + " ========= "
 
-if "log_dir" not in config:
-    logdir = wrkdir / "logs"
-else:
-    logdir = Path(config["log_dir"])
 
-if "trim_adapters" not in config:
-    config["trim_adapters"] = False
+print(get_date_time(), "Starting run")
 
-if config["trim_adapters"]:
-    if "Adapter_R1" not in config:
-        raise ValueError("Adapter sequence for R1 not provided")
-    else:
-        adapter_seq_r1 = config["Adapter_R1"]
-    if "Adapter_R3" not in config:
-        raise ValueError("Adapter sequence for R3 not provided")
-    else:
-        adapter_seq_r3 = config["Adapter_R3"]
+#######################################################################
+################# Sanity check on the input data ######################
+#######################################################################
+
+if "sample" not in config:
+    raise ValueError("Sample name not provided")
+
+if "pid" not in config:
+    raise ValueError("Patient ID not provided")
 
 if "metadata" not in config:
     raise ValueError("Metadata file not provided")
 else:
     metadata = pd.read_csv(config["metadata"])
     metadata = metadata[
-        (metadata["SAMPLE_NAME"] == config["sample"])
+        (metadata["SAMPLE_TYPE"] == config["sample"])
         & (metadata["PATIENT_ID"] == config["pid"])
     ]
     if metadata.shape[0] == 0:
-        raise ValueError("No metadata found for the given sample and patient ID")
+        raise ValueError(
+            "No metadata found for the given sample: %s and patient ID: %s"
+            % (config["sample"], config["pid"])
+        )
+
+##########################################################
+###### Setting Sequencing type and related parameters ####
+##########################################################
 
 if "SeqType" not in config:
     raise ValueError("Sequencing type not provided")
@@ -72,79 +68,255 @@
 else:
     dict_genome = config["dict_genome"]
 
 if "bait_regions" not in config:
-    print("Bait regions not provided will be calculated from target regions")
+    print(
+        get_date_time(),
+        "Bait regions not provided; will be calculated from target regions",
+    )
 else:
     bait_regions = config["bait_regions"]
 
+
+##########################################################
+################ Setting Library kit #####################
+##########################################################
+
+if "library_prep_kit" in config:
+    library_prep_kit = config["library_prep_kit"]
+    if library_prep_kit == "":
+        library_prep_kit = "Unknown"
+else:
+    library_prep_kit = "Unknown"
+
+
+##########################################################
+################## Setting Genome ########################
+##########################################################
+
+if "genome" not in config:
+    raise ValueError("Genome not provided")
+else:
+    genome = config["genome"]
+
 if "dbsnp" not in config:
     raise ValueError("dbSNP file not provided")
 else:
     dbsnp = config["dbsnp"]
 
-if "SORTING_COLLECTION_SIZE_RATIO" not in config:
-    print("Setting default value for SORTING_COLLECTION_SIZE_RATIO to 0.01")
-    SORTING_COLLECTION_SIZE_RATIO = 0.01
+
+##########################################################
+############# Setting working directory ##################
+##########################################################
+
+if "work_dir" not in config:
+    raise ValueError("Work directory not provided")
 else:
-    SORTING_COLLECTION_SIZE_RATIO = config["SORTING_COLLECTION_SIZE_RATIO"]
+    wrkdir = Path(config["work_dir"])
 
-if "allowed_edits" not in config:
-    print("Setting default value for allowed_edits to 1")
-    allowed_edits = 1
+if "log_dir" not in config:
+    logdir = wrkdir / "logs"
 else:
-    allowed_edits = config["allowed_edits"]
+    logdir = Path(config["log_dir"])
 
-if "strategy" not in config:
-    print("Setting default value for strategy to Adjacency")
-    strategy = "Adjacency"
+if "scratch_dir" not in config:
+    scratch_dir = tempfile.gettempdir()
 else:
-    strategy = config["strategy"]
+    scratch_dir = config[
+        "scratch_dir"
+    ]  # snakemake doesn't accept a Path in resources; it has to be a string
+    if not os.path.exists(scratch_dir):
+        print(get_date_time(), "Scratch directory does not exist")
+        os.makedirs(scratch_dir)
 
+print(get_date_time(), "Setting working directory to %s" % wrkdir)
+print(get_date_time(), "Setting log directory to %s" % logdir)
+print(get_date_time(), "Setting temp directory to %s" % scratch_dir)
+
+
+#########################################################################
+############# Check if UMIs have been demultiplexed #####################
+#########################################################################
+
+if "read_structure" in config:
+    print(
+        get_date_time(),
+        "Read structure has been provided; expecting only R1 and R2 files for each sample and lane",
+    )
+    read_structure = config["read_structure"]
+    print(get_date_time(), "Read structure: %s" % read_structure)
+    if read_structure == "":
+        raise ValueError("Read structure defined but not provided")
+else:
+    read_structure = False
+
+
+#########################################################################
+############# Setting variables related to trimming #####################
+#########################################################################
+
+if "trim_adapters" not in config or (not config["trim_adapters"]):
+    print(get_date_time(), "Adapter trimming has been turned off")
+    config["trim_adapters"] = False
+    adapter_seq_r1 = None
+    adapter_seq_r3 = None
+else:
+    if config["trim_adapters"]:
+        if "Adapter_R1" not in config:
+            raise ValueError("Adapter sequence for R1 not provided")
+        else:
+            adapter_seq_r1 = config["Adapter_R1"]
+        if "Adapter_R3" not in config:
+            raise ValueError("Adapter sequence for R3 not provided")
+        else:
+            adapter_seq_r3 = config["Adapter_R3"]
+
+##########################################################################
+############# Setting variables related to UMI correction ################
+##########################################################################
+
+if "correct_umi" in config:
+    correct_umi = config["correct_umi"]
+else:
+    correct_umi = False
+
+if correct_umi:
+    print(get_date_time(), "UMI Correction is enabled")
+    if "umi_file" not in config:
+        raise ValueError("UMI file not provided")
+    else:
+        umi_file = config["umi_file"]
+    if "correct_umi_max_mismatches" in config:
+        correct_umi_max_mismatches = config["correct_umi_max_mismatches"]
+    else:
+        print(
+            get_date_time(), "Setting default value for correct_umi_max_mismatches to 3"
+        )
+        correct_umi_max_mismatches = 3
+    if "correct_umi_min_distance" in config:
+        correct_umi_min_distance = config["correct_umi_min_distance"]
+    else:
+        print(
+            get_date_time(), "Setting default value for correct_umi_min_distance to 1"
+        )
+        correct_umi_min_distance = 1
+else:
+    print(get_date_time(), "UMI Correction is disabled")
+
+
+##########################################################################
+########### Initialising parameters for grouping by fgbio ################
+##########################################################################
+
+if "group_allowed_edits" not in config:
+    print(get_date_time(), "Setting default value for group_allowed_edits to 1")
+    group_allowed_edits = 1
+else:
+    group_allowed_edits = config["group_allowed_edits"]
+
+if "group_strategy" not in config:
+    print(get_date_time(), "Setting default value for group_strategy to Adjacency")
+    group_strategy = "Adjacency"
+else:
+    group_strategy = config["group_strategy"]
+
+if "group_min_mapq" not in config:
+    print(get_date_time(), "Setting default value for group_min_mapq to 20")
+    group_min_mapq = 20
+else:
+    group_min_mapq = config["group_min_mapq"]
+
+
+##########################################################################
+############# Initialising parameters for consensus calling ##############
+##########################################################################
+
 if "consensus_min_reads" not in config:
-    if seq_type == "WGS":
-        print("Setting default value for consensus_min_reads to 3 as SeqType is WGS")
-        consensus_min_reads = 3
-    else:
-        print(
-            "Setting default value for consensus_min_reads to 1 as SeqType is not WGS"
-        )
-        consensus_min_reads = 1
+    print(get_date_time(), "Setting default value for consensus_min_reads to 1")
+    consensus_min_reads = 1
 else:
     consensus_min_reads = config["consensus_min_reads"]
 
 if "consensus_min_base_qual" not in config:
-    print("Setting default value for consensus_min_base_qual to 20")
-    consensus_min_base_qual = 20
+    print(get_date_time(), "Setting default value for consensus_min_base_qual to 2")
+    consensus_min_base_qual = 2
 else:
     consensus_min_base_qual = config["consensus_min_base_qual"]
 
+
+if "consensus_min_input_base_mapq" not in config:
+    print(
+        get_date_time(), "Setting default value for consensus_min_input_base_mapq to 10"
+    )
+    consensus_min_input_base_mapq = 10
+else:
+    consensus_min_input_base_mapq = config["consensus_min_input_base_mapq"]
+
+if "consensus_error_rate_pre_umi" not in config:
+    print(get_date_time(), "Setting default value for consensus_error_rate_pre_umi to 45")
+    consensus_error_rate_pre_umi = 45
+else:
+    consensus_error_rate_pre_umi = config["consensus_error_rate_pre_umi"]
+
+if "consensus_error_rate_post_umi" not in config:
+    print(get_date_time(), "Setting default value for consensus_error_rate_post_umi to 30")
+    consensus_error_rate_post_umi = 30
+else:
+    consensus_error_rate_post_umi = config["consensus_error_rate_post_umi"]
+
+############################################################################
+######## Initialising parameters for consensus filtering ###################
+############################################################################
+
 if "filter_min_reads" not in config:
-    if seq_type == "WGS":
-        print("Setting default value for filter_min_reads to 3 as SeqType is WGS")
-        filter_min_reads = 3
-    else:
-        print("Setting default value for filter_min_reads to 1 as SeqType is not WGS")
-        filter_min_reads = 1
+    print(get_date_time(), "Setting default value for filter_min_reads to 1")
+    filter_min_reads = 1
 else:
     filter_min_reads = config["filter_min_reads"]
 
 if "filter_min_base_qual" not in config:
-    print("Setting default value for filter_min_base_qual to 40")
-    filter_min_base_qual = 40
+    filter_min_base_qual = 10
 else:
     filter_min_base_qual = config["filter_min_base_qual"]
 
+print(get_date_time(), "Setting filter_min_base_qual to %s" % filter_min_base_qual)
+
-if "filter_max_error_rate" not in config:
-    print("Setting default value for filter_max_error_rate to 0.2")
-    filter_max_error_rate = 0.2
+if "filter_max_base_error_rate" not in config:
+    print(
+        get_date_time(), "Setting default value for filter_max_base_error_rate to 0.2"
+    )
+    filter_max_base_error_rate = 0.2
 else:
-    filter_max_error_rate = config["filter_max_error_rate"]
+    filter_max_base_error_rate = config["filter_max_base_error_rate"]
 
+if "filter_max_read_error_rate" not in config:
+    filter_max_read_error_rate = 0.1
+    print(
+        get_date_time(),
+        "Setting default value for filter_max_read_error_rate to %s"
+        % filter_max_read_error_rate,
+    )
+else:
+    filter_max_read_error_rate = config["filter_max_read_error_rate"]
 
-LANE = metadata["LANE_NO"].unique().tolist()
-RUN_ID = metadata["RUN_ID"].unique().tolist()
+if "filter_max_no_call_fraction" not in config:
+    print(
+        get_date_time(), "Setting default value for filter_max_no_call_fraction to 0.1"
+    )
+    filter_max_no_call_fraction = 0.1
+else:
+    filter_max_no_call_fraction = config["filter_max_no_call_fraction"]
 
-# Create a function which checks if the input configuration is valid
+if "max_coverage" not in config:
+    print(get_date_time(), "Setting default value for max_coverage to 10000")
+    max_coverage = 10000
+else:
+    max_coverage = config["max_coverage"]
+
+
+# Create the set of valid combinations of run, lane and read IDs
+
+LANE = metadata["LANE_NO"].unique().tolist()
+RUN_ID = metadata["RUN_ID"].unique().tolist()
 
 
 def filter_combinator(combinator, allow_list):
diff --git a/workflow/rules/create_links.smk b/workflow/rules/create_links.smk
old mode 100644
new mode 100755
index 73773f2..277a8a2
--- a/workflow/rules/create_links.smk
+++ b/workflow/rules/create_links.smk
@@ -2,6 +2,7 @@ rule create_links_files:
     params:
         metadata=config["metadata"],
         fastq_dir=wrkdir / "fastq",
+        read_structure=read_structure,
     resources:
         mem_mb=1000,
         runtime=20,
@@ -21,21 +22,33 @@
             sample=config["sample"],
             lane=LANE,
         ),
-        fastq_r3=expand(
-            wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
-            filtered_product,
-            run_id=RUN_ID,
-            sample=config["sample"],
-            lane=LANE,
+        fastq_r3=(
+            expand(
+                wrkdir / "fastq" / "{run_id}" / "{sample}_R3_{lane}.fastq.gz",
+                filtered_product,
+                run_id=RUN_ID,
+                sample=config["sample"],
+                lane=LANE,
+            )
+            if not read_structure
+            else []
         ),
     message:
         "Creating links to fastq files"
     run:
         metadata = pd.read_csv(params.metadata)
         metadata = metadata[
-            (metadata["SAMPLE_NAME"] == config["sample"])
+            (metadata["SAMPLE_TYPE"] == config["sample"])
             & (metadata["PATIENT_ID"] == config["pid"])
         ]
+        if params.read_structure:
+            if metadata["READ"].nunique() != 2:
+                raise ValueError(
+                    "Read structure is provided, but the metadata does not contain exactly two reads (R1 and R2)"
+                )
+        else:
+            if metadata["READ"].nunique() != 3:
+                raise ValueError(
+                    "Read structure is not provided, but the metadata does not contain exactly three reads (R1, R2 and R3)"
+                )
 
         for index, row in metadata.iterrows():
             fastq_file = Path(row["FASTQ_FILE"])
@@ -46,7 +59,7 @@
                 params.fastq_dir
                 / row["RUN_ID"]
                 / (
-                    row["SAMPLE_NAME"]
+                    row["SAMPLE_TYPE"]
                     + "_"
                     + row["READ"]
                     + "_"
diff --git a/workflow/rules/duplicate_marking.smk b/workflow/rules/duplicate_marking.smk
index ff10227..3fe1bd5 100755
--- a/workflow/rules/duplicate_marking.smk
+++ b/workflow/rules/duplicate_marking.smk
@@ -1,10 +1,12 @@
+# Deprecated
+
 rule duplicates:
     input:
-        wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam",
+        wrkdir / "alignments" / "{sample}_temp.bam",
     output:
-        bam=wrkdir / "alignments" / "{sample}_dedup.bam",
-        bai=wrkdir / "alignments" / "{sample}_dedup.bam.bai",
+        bam=temp(wrkdir / "alignments" / "{sample}_dedup.bam"),
+        # bai=temp(wrkdir / "alignments" / "{sample}_temp.bam.bai"),
         metric=wrkdir / "metrics" / "{sample}_marked_dup_metrics.txt",
     params:
         SORTING_COLLECTION_SIZE_RATIO=SORTING_COLLECTION_SIZE_RATIO,
@@ -15,14 +17,13 @@
         logdir / "gatk/{sample}_dedup.log",
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=72 * 60,
         nodes=1,
     message:
-        "Marking duplicates on consensus reads."
+        "Marking duplicates on pre-consensus reads. Deprecated; slated for removal."
     shell:
         " ( "
         " gatk --java-options '-Xmx{resources.mem_mb}m' MarkDuplicates -I {input} "
-        " -O {output.bam} -M {output.metric} "
-        " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} && "
-        " samtools index -b {output.bam} {output.bai} "
+        " -O {output.bam} -M {output.metric} --BARCODE_TAG RX "
+        " --SORTING_COLLECTION_SIZE_RATIO {params.SORTING_COLLECTION_SIZE_RATIO} "
         " ) &> {log} "
diff --git a/workflow/rules/flagstatt.smk b/workflow/rules/flagstatt.smk
index da699de..d783037 100755
--- a/workflow/rules/flagstatt.smk
+++ b/workflow/rules/flagstatt.smk
@@ -1,23 +1,3 @@
-# rule flagstatt:
-#     input:
-#         bam=wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam",
-#     output:
-#         wrkdir / "metrics" / "{sample}.flagstat",
-#     conda:
-#         "../envs/sambamba.yaml"
-#     threads: 1
-#     resources:
-#         mem_mb=8000,
-#         runtime=24 * 60,
-#         nodes=1,
-#     log:
-#         logdir / "sambamba/{sample}.log",
-#     message:
-#         "Running Flagstat"
-#     shell:
-#         "(sambamba flagstat {input.bam} > {output}) &> {log}"
-
-
 rule flagstatt:
     input:
         bam=wrkdir / "alignments" / "{sample}_{ext}.bam",
@@ -30,6 +10,7 @@
         mem_mb=8000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     log:
         logdir / "sambamba/{sample}_{ext}.log",
     message:
diff --git a/workflow/rules/metric_hsmetrics.smk b/workflow/rules/metric_hsmetrics.smk
index 687458a..cfc8c9c 100755
--- a/workflow/rules/metric_hsmetrics.smk
+++ b/workflow/rules/metric_hsmetrics.smk
@@ -41,8 +41,9 @@ if seq_type in ["Panel", "WES"]:
     threads: 1
     resources:
         mem_mb=8000,
-        runtime=24 * 60,
+        runtime=4 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     log:
         logdir / "bedtools" / "{sample}_slop.log",
     message:
@@ -61,6 +62,8 @@
             genome=genome,
         output:
             target=wrkdir / "metrics" / "{sample}.hs_metrics.txt",
+        params:
+            max_coverage=max_coverage,
         conda:
             "../envs/gatk.yaml"
         threads: 1
@@ -68,9 +71,10 @@
             mem_mb=8000,
             runtime=24 * 60,
             nodes=1,
+            tmpdir=scratch_dir,
         message:
             "Calculating HS metrics for Panel and WES data"
         log:
             logdir / "picard/{sample}.hs_metrics.log",
         shell:
-            "gatk CollectHsMetrics -I {input.bam} -O {output.target} -R {input.genome} -TI {input.target_intervals} -BI {input.bait_intervals} &> {log}"
+            "gatk CollectHsMetrics -I {input.bam} -O {output.target} -R {input.genome} -TI {input.target_intervals} -BI {input.bait_intervals} --COVERAGE_CAP {params.max_coverage} &> {log}"
diff --git a/workflow/rules/metric_insert_size.smk b/workflow/rules/metric_insert_size.smk
index ecb892d..ab0a1bc 100755
--- a/workflow/rules/metric_insert_size.smk
+++ b/workflow/rules/metric_insert_size.smk
@@ -12,6 +12,7 @@ rule InsertSize:
         mem_mb=8000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     log:
         logdir / "picard/{sample}_insert_size.log",
     message:
diff --git a/workflow/rules/metric_mosdepth.smk b/workflow/rules/metric_mosdepth.smk
index 4939aeb..89fef0c 100755
--- a/workflow/rules/metric_mosdepth.smk
+++ b/workflow/rules/metric_mosdepth.smk
@@ -16,6 +16,7 @@ if seq_type in ["Panel", "WES"]:
         mem_mb=8000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     conda:
         "../envs/mosdepth.yaml"
     log:
@@ -40,6 +41,7 @@ else:
         mem_mb=8000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     conda:
         "../envs/mosdepth.yaml"
     message:
diff --git a/workflow/rules/metric_wgs_coverage.smk b/workflow/rules/metric_wgs_coverage.smk
index ab52c8f..8666453 100755
--- a/workflow/rules/metric_wgs_coverage.smk
+++ b/workflow/rules/metric_wgs_coverage.smk
@@ -7,9 +7,10 @@ rule coveragePlot:
         plot=wrkdir / "metrics" / "{sample}_coverage.png",
     threads: 20
     resources:
-        mem_mb=10000,
+        mem_mb=60000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     conda:
         "../envs/coveragePlot.yaml"
     log:
diff --git a/workflow/rules/recalibration.smk b/workflow/rules/recalibration.smk
index f15ef7a..fd33af2 100755
--- a/workflow/rules/recalibration.smk
+++ b/workflow/rules/recalibration.smk
@@ -1,19 +1,21 @@
 rule baseRecalibrator:
     input:
-        bam=wrkdir / "alignments" / "{sample}_dedup.bam",
+        bam=wrkdir / "alignments" / "{sample}.cons.filtered.realigned.bam",
         dbsnp=dbsnp,
         genome=genome,
     output:
         table=wrkdir / "metrics" / "{sample}_recal_data.table",
         bam=temp(wrkdir / "alignments" / "{sample}_dedup.recall.bam"),
+        bai=temp(wrkdir / "alignments" / "{sample}_dedup.recall.bai"),
         analyse_covariates=wrkdir / "metrics" / "{sample}_covariates.pdf",
     conda:
         "../envs/gatk.yaml"
     threads: 8
     resources:
-        mem_mb=8000,
-        runtime=24 * 60,
+        mem_mb=50000,
+        runtime=72 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     log:
         logdir / "gatk/{sample}_recal.log",
     message:
@@ -39,15 +41,18 @@
     conda:
         "../envs/samtools.yaml"
     threads: 8
+    params:
+        mem_thread=8000,
     resources:
-        mem_mb=8000,
+        mem_mb=8 * 8000,
         runtime=24 * 60,
         nodes=1,
+        tmpdir=scratch_dir,
     log:
         logdir / "samtools/{sample}_sort.log",
     message:
         "Sorting and indexing recalibrated bam file"
     shell:
         " ( "
-        " samtools sort --threads 8 -o {output.bam}##idx##{output.bai} {input.bam} --write-index"
+        " samtools sort --threads 8 -m{params.mem_thread}m -o {output.bam}##idx##{output.bai} {input.bam} -T {resources.tmpdir} --write-index"
         " ) &> {log} "
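The new workflow/rules/umi_based_rules.smk below replaces the deleted umi_consensus.smk (further down in this diff). In rough order, the merged UMI-annotated BAM now flows through SetMateInformation → GroupReadsByUmi → CallMolecularConsensusReads piped into FilterConsensusReads, after which the realign and baseRecalibrator rules above pick up the consensus reads.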
+ """ + input: + bam=( + wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + if not read_structure + else wrkdir / "alignments" / "{sample}_merged_umi_annot.bam" + ), + output: + bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"), + threads: 1 + resources: + mem_mb=50000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio/fixmate_{sample}.log", + message: + "Fixing mate information if required" + shell: + "(fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation " + "--input {input.bam} " + "--output {output.bam} " + "--allow-missing-mates true )" + " &> {log} " + + +rule group_reads: + input: + bam=wrkdir / "alignments" / "{sample}_mate_fix.bam", + output: + bam=temp( + wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam" + ), + stats=wrkdir / "metrics" / "{sample}.grouped-family-sizes.txt", + params: + strategy=group_strategy, + allowed_edits=group_allowed_edits, + min_mapq=group_min_mapq, + threads: 2 + resources: + mem_mb=50000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio/group_{sample}.log", + message: + "Grouping reads by UMI and position for consensus calling." + shell: + "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 --async-io GroupReadsByUmi " + "--input {input.bam} " + "--strategy {params.strategy} " + "--edits {params.allowed_edits} " + "--min-map-q {params.min_mapq} " + "--output {output.bam} " + "--family-size-histogram {output.stats} " + "&> {log} " + + +rule call_filter_consensus_reads: + input: + bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam", + ref=genome, + output: + bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.bam"), + metrics=logdir / "fgbio/call_consensus_reads.{sample}.log", + params: + min_reads=consensus_min_reads, + min_input_base_mapq=consensus_min_input_base_mapq, + error_rate_post_umi=consensus_error_rate_post_umi, + error_rate_pre_umi=consensus_error_rate_pre_umi, + filter_min_reads=filter_min_reads, + min_base_qual=filter_min_base_qual, + max_base_error_rate=filter_max_base_error_rate, + max_read_error_rate=filter_max_read_error_rate, + max_no_call_fraction=filter_max_no_call_fraction, + memory_consensus=25000, + memory_filter=25000, + threads: 24 + resources: + mem_mb=50000, + runtime=72 * 60, + nodes=1, + tmpdir=scratch_dir, + conda: + "../envs/fgbio.yaml" + log: + logdir / "fgbio/call_consensus_reads.{sample}.log", + message: + "Calling consensus reads from grouped reads." 
+
+
+rule sort_name_index:
+    input:
+        bam=wrkdir / "alignments" / "{sample}.cons.unmapped.bam",
+    output:
+        bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.sorted.bam"),
+    conda:
+        "../envs/samtools.yaml"
+    threads: 8
+    params:
+        mem_thread=8000,
+    resources:
+        mem_mb=8 * 8000,
+        runtime=24 * 60,
+        nodes=1,
+        tmpdir=scratch_dir,
+    log:
+        logdir / "samtools/{sample}_sort_name.log",
+    message:
+        "Name-sorting consensus bam file"
+    shell:
+        " ( "
+        " samtools sort --threads 8 -m{params.mem_thread}m -n -u -T {resources.tmpdir} -o {output.bam} {input.bam} "
+        " ) &> {log} "
+
+
+rule filter_consensus_reads:
+    input:
+        bam=wrkdir / "alignments" / "{sample}.cons.unmapped#.bam",
+        ref=genome,
+    output:
+        bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered#.bam"),
+    params:
+        min_reads=filter_min_reads,
+        min_base_qual=filter_min_base_qual,
+        max_base_error_rate=filter_max_base_error_rate,
+        max_read_error_rate=filter_max_read_error_rate,
+        max_no_call_fraction=filter_max_no_call_fraction,
+    threads: 8
+    resources:
+        mem_mb=50000,
+        runtime=72 * 60,
+        nodes=1,
+        tmpdir=scratch_dir,
+    conda:
+        "../envs/fgbio.yaml"
+    log:
+        logdir / "fgbio" / "filter_consensus_reads.{sample}.log",
+    message:
+        "Filtering consensus reads."
+    shell:
+        "("
+        "fgbio -Djava.io.tmpdir={resources.tmpdir} -Xmx{resources.mem_mb}m --compression 1 FilterConsensusReads "
+        "--input {input.bam} "
+        "--output {output.bam} "
+        "--ref {input.ref} "
+        "--min-reads {params.min_reads} "
+        "--max-read-error-rate {params.max_read_error_rate} "
+        "--max-base-error-rate {params.max_base_error_rate} "
+        "--min-base-quality {params.min_base_qual} "
+        "--max-no-call-fraction {params.max_no_call_fraction} "
+        ") &> {log} "
+
+
+rule gatherConsensusMetrics:
+    input:
+        logdir / "fgbio/call_consensus_reads.{sample}.log",
+    output:
+        wrkdir / "metrics" / "{sample}_consensus_metrics.tsv",
+    params:
+        sample=str(config["sample"]),
+    threads: 1
+    resources:
+        mem_mb=1000,
+        runtime=60,
+        nodes=1,
+        tmpdir=scratch_dir,
+    log:
+        logdir / "metrics" / "{sample}_consensus_metrics.log",
+    conda:
+        "../envs/consensus.yaml"
+    script:
+        "../scripts/getConsensusMetrics.py"
diff --git a/workflow/rules/umi_consensus.smk b/workflow/rules/umi_consensus.smk
deleted file mode 100755
index f190eb2..0000000
--- a/workflow/rules/umi_consensus.smk
+++ /dev/null
@@ -1,143 +0,0 @@
-rule fgbio:
-    input:
-        alignment=wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}.bam",
-        fastq_r2=wrkdir / "fastq" / "{run_id}" / "{sample}_R2_{lane}.fastq.gz",
-    output:
-        temp(wrkdir / "alignments" / "{run_id}" / "{sample}_aln_{lane}_umi_annot.bam"),
-    threads: 1
-    resources:
-        mem_mb=8000,
-        runtime=4 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio" / "annotate_umi_{run_id}_{sample}_{lane}.log",
-    message:
-        "Annotating BAM with UMIs from fastq file."
-    shell:
-        "fgbio AnnotateBamWithUmis -i {input.alignment} -f {input.fastq_r2} -o {output} -t RX -q UQ -s true &> {log}"
-
-
-rule fix_mate:
-    """
-    Fixing mate information if required
-    For some reason for some reads the mate information is not properly set.
-    This can cause problems in downstream analysis.
-    """
-    input:
-        bam=wrkdir / "alignments" / "{sample}_merged_umi_annot.bam",
-    output:
-        bam=temp(wrkdir / "alignments" / "{sample}_mate_fix.bam"),
-    threads: 2
-    resources:
-        mem_mb=8000,
-        runtime=24 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio/fixmate_{sample}.log",
-    message:
-        "Fixing mate information if required"
-    shell:
-        "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io SetMateInformation "
-        "--input {input.bam} "
-        "--output {output.bam} "
-        "--allow-missing-mates true"
-        "&> {log} "
-
-
-rule group_reads:
-    input:
-        bam=wrkdir / "alignments" / "{sample}_mate_fix.bam",
-    output:
-        bam=temp(
-            wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam"
-        ),
-        stats=wrkdir / "metrics" / "{sample}.grouped-family-sizes.txt",
-    params:
-        strategy=strategy,
-        allowed_edits=allowed_edits,
-    threads: 2
-    resources:
-        mem_mb=8000,
-        runtime=24 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio/group_{sample}.log",
-    message:
-        "Grouping reads by UMI and position for consensus calling."
-    shell:
-        "fgbio -Xmx{resources.mem_mb}m --compression 1 --async-io GroupReadsByUmi "
-        "--input {input.bam} "
-        "--strategy {params.strategy} "
-        "--edits {params.allowed_edits} "
-        "--output {output.bam} "
-        "--family-size-histogram {output.stats} "
-        "&> {log} "
-
-
-rule call_consensus_reads:
-    input:
-        bam=wrkdir / "alignments" / "{sample}_merged_aln_umi_annot_sorted_grouped.bam",
-    output:
-        bam=temp(wrkdir / "alignments" / "{sample}.cons.unmapped.bam"),
-    params:
-        min_reads=consensus_min_reads,
-        min_base_qual=consensus_min_base_qual,
-    threads: 4
-    resources:
-        mem_mb=4000,
-        runtime=24 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio/call_consensus_reads.{sample}.log",
-    message:
-        "Calling consensus reads from grouped reads."
-    shell:
-        "fgbio -Xmx{resources.mem_mb}m --compression 0 CallMolecularConsensusReads "
-        "--input {input.bam} "
-        "--output {output.bam} "
-        "--min-reads {params.min_reads} "
-        "--min-input-base-quality {params.min_base_qual} "
-        "--threads {threads} "
-        "&> {log}"
-
-
-rule filter_consensus_reads:
-    input:
-        bam=wrkdir / "alignments" / "{sample}.cons.unmapped.bam",
-        ref=genome,
-    output:
-        bam=temp(wrkdir / "alignments" / "{sample}.cons.filtered.bam"),
-    params:
-        min_reads=filter_min_reads,
-        min_base_qual=filter_min_base_qual,
-        max_error_rate=filter_max_error_rate,
-    threads: 8
-    resources:
-        mem_mb=8000,
-        runtime=24 * 60,
-        nodes=1,
-    conda:
-        "../envs/fgbio.yaml"
-    log:
-        logdir / "fgbio" / "filter_consensus_reads.{sample}.log",
-    message:
-        "Filtering consensus reads and sorting into coordinate order."
-    shell:
-        "("
-        "samtools sort -n -u {input.bam} | "
-        "fgbio -Xmx{resources.mem_mb}m --compression 1 FilterConsensusReads "
-        "--input /dev/stdin "
-        "--output {output.bam} "
-        "--ref {input.ref} "
-        "--min-reads {params.min_reads} "
-        "--min-base-quality {params.min_base_qual} "
-        "--max-base-error-rate {params.max_error_rate} "
-        ") &> {log} "
diff --git a/workflow/scripts/getConsensusMetrics.py b/workflow/scripts/getConsensusMetrics.py
new file mode 100755
index 0000000..81c2128
--- /dev/null
+++ b/workflow/scripts/getConsensusMetrics.py
@@ -0,0 +1,24 @@
+import pandas as pd
+
+# Metric names that fgbio logs as "<name>: <value> (...)"
+column_set1 = [
+    "Raw Reads Filtered Due to Insufficient Support",
+    "Raw Reads Filtered Due to Mismatching Cigars",
+    "Raw Reads Filtered Due to Low Base Quality",
+    "Raw Reads Filtered Due to Orphan Consensus Created",
+    "Consensus reads emitted",
+    "Total Raw Reads Considered",
+]
+# Metric names that fgbio logs as "[timestamp] <value> <name> (...)"
+column_set2 = ["overlapping templates", "corrected templates", "overlapping bases", "corrected bases"]
+
+consensus_df = pd.DataFrame(columns=column_set1 + column_set2, index=[snakemake.params["sample"]])
+
+with open(snakemake.input[0]) as handle:
+    for line in handle:
+        for i in consensus_df.columns:
+            if i in line:
+                # Cut away the metric name and any parenthesised percentage,
+                # then strip separators so only a bare integer remains
+                if i in column_set1:
+                    value = line.split(":")[-1].split("(")[0].replace(".", "").replace(",", "").replace(" ", "").strip()
+                else:
+                    value = line.split("]")[-1].split("(")[0].replace(".", "").replace(i, "").replace(",", "").replace(" ", "").strip()
+                consensus_df.loc[snakemake.params["sample"], i] = int(value)
+                break
+
+consensus_df.to_csv(snakemake.output[0], sep="\t", index=True)
diff --git a/workflow/scripts/getCoveragePlot_snakemake.R b/workflow/scripts/getCoveragePlot_snakemake.R
old mode 100644
new mode 100755
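As a possible downstream sanity check (not part of the pipeline itself), the family-size histogram written by GroupReadsByUmi (`{sample}.grouped-family-sizes.txt`) can be used to gauge a reasonable `filter_min_reads` threshold. A minimal sketch, assuming the file is a TSV whose first two columns are named `family_size` and `count`:

```python
import pandas as pd

# Load fgbio's family-size histogram (column names assumed from fgbio output)
hist = pd.read_csv("SAMPLE.grouped-family-sizes.txt", sep="\t")

# Fraction of raw reads that would survive a given min-reads cutoff
reads_per_size = hist["family_size"] * hist["count"]
total = reads_per_size.sum()
for cutoff in (1, 2, 3, 5):
    kept = reads_per_size[hist["family_size"] >= cutoff].sum()
    print(f"min-reads={cutoff}: {kept / total:.1%} of raw reads retained")
```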