Merge pull request #30 from HiDiHlabs/dev

Preparing release
HiDiHlabs · Jul 23, 2024 · 88bb7a8 · 88bb7a8
2 parents 96310b8 + 65dbd6b
commit 88bb7a8
Show file tree

Hide file tree

Showing 31 changed files with 904 additions and 396 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@ config/S*.yaml
 .snakemake/
 Singularity/
 conda_envs/
+conda_envs.bkp/
+
 conda_envs_ohne_singularity/
 profile/config.yaml.bkp
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,3 +13,5 @@ repos:
     rev: v0.10.0 # Replace by any tag/version ≥0.2.4 : https://github.com/snakemake/snakefmt/releases
     hooks:
       - id: snakefmt
+ci:
+    autoupdate_schedule: quarterly
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@ The pipeline was tested and supported by Daniel Steiert.
 
 The pipeline is made for aligning UMI based WGS/WES and Panel Seq data and to compute the QC metrics associated with it.
 
-We require the sequencing is performed in paired end mode and must contain R1 (forward read) R2 (UMI) and R3 (reverse read) for each lane and run
+We require that the sequencing is performed in paired-end mode.
 
 Currently the ability to provide support is limited.
 
@@ -93,15 +93,40 @@ Please modify the entry for
 
 16. `dict_genome`: An absolute path to dict file for the given genomes, ignored if `SeqType` is `WGS`
 
+17. `group_allowed_edits`: Number of edits allowed when grouping reads by UMI. Defaults to 0; should be set to 0 if `correct_umi` is true
+18. `group_min_mapq`: Sets the `--min-map-q` parameter of groupReadsByUMI (e.g. `20`)
+19. `group_strategy`: Sets the `--strategy` parameter of groupReadsByUMI; defaults to `Adjacency`
 
 
+20. `consensus_min_reads`: 1
+21. `consensus_min_base_qual`: 2
+22. `consensus_min_input_base_mapq`: 10
+23. `consensus_error_rate_pre_umi`: 45
+24. `consensus_error_rate_post_umi`: 30
+
+
+25. `filter_min_reads`: 3
+26. `filter_min_base_qual`: 2
+27. `filter_max_base_error_rate`: 0.1
+28. `filter_max_read_error_rate`: 0.05
+29. `filter_max_no_call_fraction`: 0.2
+
+
+30. `read_structure`: 8M143T 8M143T
+
+
+31. `correct_umi`: True
+32. `correct_umi_max_mismatches`: 3
+33. `correct_umi_min_distance`: 1
+34. `umi_file`:
+
 ## Metadata file
 
-Please create a metadata file with columns
+Please create a metadata file with the following columns (other columns may be present but are not required):
 1. FASTQ_FILE: A column containing absolute paths to the locations of the FASTQ files,
 2. READ: A column containing the read information (R1, R2 or R3) of the sequencing file. Needs to be prefixed with `R` if not present
 3. LANE_NO: A column containing the lane information for the sequencing files. Should be prefixed with `L_` if not present
-4. SAMPLE_NAME: Containing the sample name which is inputed in the config file. Please note if the metadata file consists of multipe samples only the sample pid combination mentioned in the sample and pid directive of the config.yaml will be run
+4. SAMPLE_TYPE: Contains the sample name that is entered in the config file. Please note that if the metadata file contains multiple samples, only the sample/pid combination given in the sample and pid directives of the config.yaml will be run
 5. PATIENT_ID: Contains the ```pid``` that is entered in the config file. Please note that if the metadata file contains multiple PIDs, only the ```sample``` ```pid``` combination given in the sample and pid directives of the config.yaml will be run.
 7. RUN_ID: Please mention the run id of the sequencing run for the sample
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -42,3 +42,31 @@ bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions f
 
 chrom_sizes: /Path/to/chrom_sizes.tsv # Tab separated file containing the chromosome names and their lengths should match the reference genome
 dict_genome: /Path/to/genome.dict # Dictionary file for the reference genome
+
+#Parameters for Read_Grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+
+# Correct UMI based on list of UMI's
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /path/to/umi.txt
diff --git a/config/test_config_exliquid.yaml b/config/test_config_exliquid.yaml
@@ -0,0 +1,53 @@
+Adapter_R1: []
+Adapter_R3: []
+SeqType: Panel
+chrom_sizes: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/stats/hg19_chrTotalLength.tsv
+
+dbsnp: /applications/otp/ngs_share_complete/assemblies/hg19_GRCh37_1000genomes/databases/dbSNP/dbSNP_147/00-All.vcf.gz
+dict_genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.dict
+
+genome: /applications/otp/reference-genomes/bwa06_1KGRef_PhiX/hs37d5_PhiX.fa
+
+library_prep_kit: IDT_xGen_cfDNA_FFPE
+
+log_dir: /dh-projects/exliquid/scratch/results_alignment_test/logs/
+
+metadata: /dh-projects/exliquid/scratch/input/bare_min_meta_data.csv
+
+pid: EXLIQUID_EX59
+sample: EXLIQUID_EX59-BUFFYCOAT_control
+
+target_regions: /dh-projects/exliquid/raw_data/reference/Targets-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+bait_regions: /dh-projects/exliquid/raw_data/reference/Probes-XGEN.69EBBD23F90841409EAFA66D9BC58A17.g.bed
+# trim_adapters: false
+
+work_dir: /dh-projects/exliquid/scratch/results_alignment_test/
+
+# bait_regions: /Path/to/bait_regions.bed # Bed file containing the bait regions for the analysis only required if SeqType is Panel/WES, if not provided a bait file will be generated from the target file
+
+#Parameters for Read_Grouping
+group_allowed_edits: 0
+group_min_mapq: 20
+group_strategy: Adjacency
+
+# Configuration for consensus calling
+consensus_min_reads: 1
+consensus_min_base_qual: 2
+consensus_min_input_base_mapq: 10
+consensus_error_rate_pre_umi: 45
+consensus_error_rate_post_umi: 30
+
+# Configuration for consensus filter
+filter_min_reads: 3
+filter_min_base_qual: 2
+filter_max_base_error_rate: 0.1
+filter_max_read_error_rate: 0.05
+filter_max_no_call_fraction: 0.2
+
+# Configuration when UMI present in R1 and R2 files
+read_structure: 8M143T 8M143T
+# Correct UMI based on list of UMI's
+correct_umi: True
+correct_umi_max_mismatches: 3
+correct_umi_min_distance: 1
+umi_file: /dh-projects/exliquid/raw_data/reference/umi_list.txt
diff --git a/profile/config.yaml b/profile/config.yaml
@@ -3,14 +3,15 @@ jobs: 10
 
 latency-wait: 60
 reason: True
-
+# default-resources:
+#   slurm_partition: master-fasttrack
 keep-going: True
 printshellcmds: True
 rerun-incomplete: True
 restart-times: 2
-
 # delete-temp-output: True
-conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs
+conda-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/conda_envs
 # conda-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/conda_envs_ohne_singularity
-singularity-prefix: /dh-projects/richter_transformation/analysis/wgs_analysis/alignment_pipeline/Singularity
-singularity-args: "-B /dh-projects/richter_transformation:/dh-projects/richter_transformation,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy"
+singularity-prefix: /dh-projects/ag-ishaque/analysis/sahays/alignment_pipeline/Singularity
+singularity-args: "-B /dh-projects/ag-ishaque:/dh-projects/ag-ishaque,/applications/:/applications,/dh-projects/T-NHL-chapuy:/dh-projects/T-NHL-chapuy,/dh-projects/otp:/dh-projects/otp"
+# /dh-projects/exliquid:/dh-projects/exliquid,
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -1,18 +1,20 @@
 import os
+from datetime import datetime
 import pandas as pd
 from pathlib import Path
 from itertools import product
+import tempfile
 
 
 container: "docker://condaforge/mambaforge"
 
 
 include: "rules/common.smk"  #done
 include: "rules/create_links.smk"
+include: "rules/umi_based_rules.smk"  #done
 include: "rules/adapter_trimming.smk"  #done
 include: "rules/alignment.smk"  #done
-include: "rules/umi_consensus.smk"  #done
-include: "rules/duplicate_marking.smk"  #done
+#  include: "rules/duplicate_marking.smk"  #done
 include: "rules/flagstatt.smk"  #done
 include: "rules/metric_hsmetrics.smk"  #done
 include: "rules/metric_insert_size.smk"  #done
@@ -23,6 +25,10 @@ include: "rules/recalibration.smk"  #done
 
 rule all:
     input:
+        # expand(
+        #     wrkdir / "alignments" / "{sample}_merged_umi_annot.bam",
+        #     sample=config["sample"],
+        # ),
         expand(
             wrkdir / "alignments" / "{sample}_dedup.recall.sorted.bam",
             sample=config["sample"],
@@ -35,11 +41,10 @@ rule all:
             wrkdir / "metrics" / "{sample}.mosdepth.global.dist.txt",
             sample=config["sample"],
         ),
-        # expand(wrkdir / "metrics" / "{sample}.flagstat", sample=config["sample"]),
         expand(
             wrkdir / "metrics" / "{sample}_{ext}.flagstat",
             sample=config["sample"],
-            ext=["dedup.recall.sorted", "merged_umi_annot"],
+            ext=["dedup.recall.sorted"],
         ),
         expand(
             wrkdir / "metrics" / "{sample}_insert_size_metrics.txt",
@@ -64,3 +69,20 @@ rule all:
         expand(wrkdir / "metrics" / "{sample}.hs_metrics.txt", sample=config["sample"])
         if config["SeqType"] in ["Panel", "WES"]
         else [],
+        expand(
+            wrkdir
+            / "metrics"
+            / "correct_umi"
+            / "{run_id}"
+            / "{sample}_{lane}_umi_metrics.txt",
+            filtered_product,
+            run_id=RUN_ID,
+            sample=config["sample"],
+            lane=LANE,
+        )
+        if correct_umi
+        else [],
+        expand(
+            wrkdir / "metrics" / "{sample}_consensus_metrics.tsv",
+            sample=config["sample"],
+        ),
diff --git a/workflow/envs/consensus.yaml b/workflow/envs/consensus.yaml
@@ -0,0 +1,6 @@
+name: consensus
+channels:
+  - conda-forge
+dependencies:
+  - pandas=2.2.2
+  - python=3.11
diff --git a/workflow/envs/coveragePlot.yaml b/workflow/envs/coveragePlot.yaml
diff --git a/workflow/envs/cutadapt.yaml b/workflow/envs/cutadapt.yaml
diff --git a/workflow/envs/fgbio.yaml b/workflow/envs/fgbio.yaml
diff --git a/workflow/envs/gatk.yaml b/workflow/envs/gatk.yaml
diff --git a/workflow/envs/mosdepth.yaml b/workflow/envs/mosdepth.yaml
diff --git a/workflow/envs/sambamba.yaml b/workflow/envs/sambamba.yaml
diff --git a/workflow/envs/samtools.yaml b/workflow/envs/samtools.yaml