From 3dda02c9f292792d06fa3cd4c7ab2daea89154c9 Mon Sep 17 00:00:00 2001
From: Fraser Combe <frascom@hotmail.com>
Date: Mon, 4 Nov 2024 12:36:35 -0600
Subject: [PATCH] [TheiaCov] wfs add percentage_mapped_reads (#641)

* Added percentage_mapped_reads output to ivar_consensus.wdl and updated stats_n_coverage task

* update mapped reads trying read_float

* get read numbers from stats file

* get read numbers from stats filev2

* change from bc to awk for calculation

* update awk

* metric output txt instead of csv

* reswitchack to read_string output t

* percentage mapped reads based on trimmed bam file theiacov_ont

* update theiacov-ont for mapped reads

* pass output ivar cons mapped reads to wf for terra output

* perc mapped reads output flu track PE, ONT and clearlabs and doc update

* updated namings outputs cov_ONT and removed extra call assembly metrics

* change naming output stat n coverage task

* update flu mapped reads perc variable name

* make theiacov_ont conditional output flu mapped reads

* wdl does not support if cond in output change to select first

* wdl does not support if cond in output change to select first

* combine flu and non flu into same mapped reads output

* correct assembled reads call

* update mdsums

* update clearlabs for statncov call

* float?

* mdsums and pe wf update flu irma defined

* mdsum pe

* move to flue track

* tidy output pe and ont

* update strings and provide default values

* clean tab/spaces echo

* my fav commit mdsums!

* remove extra space because it was bothering me

* adding a space because i am crazy

* update flu output

* update outputs

* remove string in name

* correctly name percentage mapped reads

* correct output in flu track wdl

* update mdsums

* primtrim output

* another mdsum

---------

Co-authored-by: Sage Wright <sage.wright@theiagen.com>
---
 .../genomic_characterization/theiacov.md      |  1 +
 .../task_assembly_metrics.wdl                 | 35 +++++++++++++++++--
 .../theiacov/test_wf_theiacov_clearlabs.yml   |  4 +--
 .../theiacov/test_wf_theiacov_illumina_pe.yml |  4 +--
 .../theiacov/test_wf_theiacov_illumina_se.yml |  4 +--
 .../theiacov/test_wf_theiacov_ont.yml         |  4 +--
 workflows/theiacov/wf_theiacov_clearlabs.wdl  |  2 ++
 .../theiacov/wf_theiacov_illumina_pe.wdl      |  5 +--
 .../theiacov/wf_theiacov_illumina_se.wdl      |  2 ++
 workflows/theiacov/wf_theiacov_ont.wdl        |  4 ++-
 workflows/utilities/wf_flu_track.wdl          |  7 +++-
 workflows/utilities/wf_ivar_consensus.wdl     |  4 ++-
 12 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md
index 2e72b86d4..ba7eac3c4 100644
--- a/docs/workflows/genomic_characterization/theiacov.md
+++ b/docs/workflows/genomic_characterization/theiacov.md
@@ -1157,6 +1157,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch)
 | pangolin_notes | String | Lineage notes as determined by Pangolin | CL, FASTA, ONT, PE, SE |
 | pangolin_versions | String | All Pangolin software and database versions | CL, FASTA, ONT, PE, SE |
 | percent_reference_coverage | Float | Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of the reference genome (SC2: 29903) x 100 | CL, FASTA, ONT, PE, SE |
+| percentage_mapped_reads | String | Percentage of reads that successfully aligned to the reference genome. This value is calculated by number of mapped reads / total number of reads x 100. | ONT, PE, SE |
 | primer_bed_name | String | Name of the primer bed files used for primer trimming | CL, ONT, PE, SE |
 | primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | PE, SE |
 | qc_check | String | The results of the QC Check task | CL, FASTA, ONT, PE, SE |
diff --git a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl
index 9fe5f843b..762db23cf 100644
--- a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl
+++ b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl
@@ -14,11 +14,11 @@ task stats_n_coverage {
     samtools --version | head -n1 | tee VERSION
 
     samtools stats ~{bamfile} > ~{samplename}.stats.txt
-
     samtools coverage ~{bamfile} -m -o ~{samplename}.cov.hist
     samtools coverage ~{bamfile} -o ~{samplename}.cov.txt
     samtools flagstat ~{bamfile} > ~{samplename}.flagstat.txt
 
+     # Extracting coverage, depth, meanbaseq, and meanmapq
     coverage=$(cut -f 6 ~{samplename}.cov.txt | tail -n 1)
     depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1)
     meanbaseq=$(cut -f 8 ~{samplename}.cov.txt | tail -n 1)
@@ -33,6 +33,34 @@ task stats_n_coverage {
     echo $depth | tee DEPTH
     echo $meanbaseq | tee MEANBASEQ
     echo $meanmapq | tee MEANMAPQ
+
+    # Parsing stats.txt for total and mapped reads
+    total_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "raw total sequences:" | cut -f 3)
+    mapped_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "reads mapped:" | cut -f 3)
+
+    # Check for empty values and set defaults to avoid errors
+    if [ -z "$total_reads" ]; then total_reads="1"; fi  # Avoid division by zero
+    if [ -z "$mapped_reads" ]; then mapped_reads="0"; fi
+
+    # Calculate the percentage of mapped reads
+    percentage_mapped_reads=$(awk "BEGIN {printf \"%.2f\", ($mapped_reads / $total_reads) * 100}")
+
+    # If the percentage calculation fails, default to 0.0
+    if [ -z "$percentage_mapped_reads" ]; then percentage_mapped_reads="0.0"; fi
+
+    # Output the result
+    echo $percentage_mapped_reads | tee PERCENTAGE_MAPPED_READS
+
+    #output all metrics in one txt file
+    # Output header row (for CSV)
+    echo -e "Statistic\tValue" > ~{samplename}_metrics.txt
+
+    # Output each statistic as a row
+    echo -e "Coverage\t$coverage" >> ~{samplename}_metrics.txt
+    echo -e "Depth\t$depth" >> ~{samplename}_metrics.txt
+    echo -e "Mean Base Quality\t$meanbaseq" >> ~{samplename}_metrics.txt
+    echo -e "Mean Mapping Quality\t$meanmapq" >> ~{samplename}_metrics.txt
+    echo -e "Percentage Mapped Reads\t$percentage_mapped_reads" >> ~{samplename}_metrics.txt
   >>>
   output {
     String date = read_string("DATE")
@@ -45,6 +73,9 @@ task stats_n_coverage {
     Float depth = read_string("DEPTH")
     Float meanbaseq = read_string("MEANBASEQ")
     Float meanmapq = read_string("MEANMAPQ")
+    Float percentage_mapped_reads = read_string("PERCENTAGE_MAPPED_READS")
+    File metrics_txt = "~{samplename}_metrics.txt"
+
   }
   runtime {
     docker: docker
@@ -55,4 +86,4 @@ task stats_n_coverage {
     preemptible: 0
     maxRetries: 3
   }
-}
\ No newline at end of file
+}
diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
index 599fec45e..fe20263a2 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
@@ -318,7 +318,7 @@
     - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv
       md5sum: 151390c419b00ca44eb83e2bbfb96996
     - path: miniwdl_run/call-stats_n_coverage/command
-      md5sum: 51da320ddc7de2ffeb263f0ddd85ced6
+      md5sum: ac020678f99ac145b11d3dbc7b9fe9ba
     - path: miniwdl_run/call-stats_n_coverage/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-stats_n_coverage/outputs.json
@@ -350,7 +350,7 @@
     - path: miniwdl_run/call-stats_n_coverage/work/clearlabs.stats.txt
       md5sum: bfed5344c91ce6f4db1f688cac0a3ab9
     - path: miniwdl_run/call-stats_n_coverage_primtrim/command
-      md5sum: a84f90b8877babe54bf8c068d244fbe8
+      md5sum: 2974f886e1959cd5eaae5e495c91f7cc
     - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
index 4c7542334..dfef9c994 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
@@ -205,7 +205,7 @@
       md5sum: 511e696afe25f8b96a84d68ec5a8af8a
     # stats n coverage primer trim
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command
-      md5sum: 260c3887be6d99b18caf6d3914c5737f
+      md5sum: 1c61b89c2a94e87518a6679a04885341
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json
@@ -288,7 +288,7 @@
       md5sum: 6c63395a125f8618334b8af2de4e2d88
     # stats n coverage
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command
-      md5sum: 1ffac4cc3e9bdd84a0f9228e8e5ca5d9
+      md5sum: e49a297b1c0eb195a2acd80f00672668
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
index 0742c19a9..22453bdd5 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
@@ -157,7 +157,7 @@
       md5sum: 603c3cbc771ca910b96d3c032aafe7c9
     # stats n coverage primer trim
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command
-      md5sum: 67cac223adcf059a9dfaa9f28ed34f68
+      md5sum: affacdcfda48ad5e371a4510f19520bd
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json
@@ -240,7 +240,7 @@
       md5sum: 03c5ecf22fdfdb6b240ac3880281a056
     # stats n coverage
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command
-      md5sum: 3dacccb252429a0ff46c079a75a09377
+      md5sum: cb4de0e459b3fada21bcf08a8dbea89f
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
index 6077323b9..333f39b90 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
@@ -225,7 +225,7 @@
       md5sum: 32c0be4fb7f3030bf9c74c0a836d4f2e
     - path: miniwdl_run/call-raw_check_reads/work/_miniwdl_inputs/0/ont.fastq.gz
     - path: miniwdl_run/call-stats_n_coverage/command
-      md5sum: 93414eacbbb9d7c4813bb54a8a507078
+      md5sum: fbd85e82af1bbfaa734a13a9c1394300
     - path: miniwdl_run/call-stats_n_coverage/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-stats_n_coverage/outputs.json
@@ -250,7 +250,7 @@
     - path: miniwdl_run/call-stats_n_coverage/work/ont.flagstat.txt
     - path: miniwdl_run/call-stats_n_coverage/work/ont.stats.txt
     - path: miniwdl_run/call-stats_n_coverage_primtrim/command
-      md5sum: c6e7de70dfdbb1858229e44777b84110
+      md5sum: 3689a902aa96e8c132e6ef4946699e61
     - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json
       contains: ["bamfile", "samplename"]
     - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json
diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl
index 8932c983f..0368bef95 100644
--- a/workflows/theiacov/wf_theiacov_clearlabs.wdl
+++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl
@@ -207,6 +207,8 @@ workflow theiacov_clearlabs {
     Int number_Degenerate = consensus_qc.number_Degenerate
     Int number_Total = consensus_qc.number_Total
     Float percent_reference_coverage = consensus_qc.percent_reference_coverage
+    # Percentage mapped reads
+    Float percentage_mapped_reads = stats_n_coverage.percentage_mapped_reads
     # SC2 specific coverage outputs
     Float? sc2_s_gene_mean_coverage = gene_coverage.sc2_s_gene_depth
     Float? sc2_s_gene_percent_coverage = gene_coverage.sc2_s_gene_percent_coverage
diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl
index ece3c201a..5f5e5b651 100644
--- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl
+++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl
@@ -144,7 +144,7 @@ workflow theiacov_illumina_pe {
             trim_primers = trim_primers
         }
       }
-      # perform flu-specific tasks
+      # for flu organisms call flu_track
       if (organism_parameters.standardized_organism == "flu") {
         call run_flu_track.flu_track {
           input:
@@ -439,6 +439,7 @@ workflow theiacov_illumina_pe {
     # QC_Check Results
     String? qc_check = qc_check_task.qc_check
     File? qc_standard = qc_check_task.qc_standard
- 
+    # Capture percentage_mapped_reads from ivar_consensus task or flu_track task
+    String percentage_mapped_reads = select_first([ivar_consensus.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""])
   }
 }
diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl
index fa1044c24..4ae59f5dc 100644
--- a/workflows/theiacov/wf_theiacov_illumina_se.wdl
+++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl
@@ -317,5 +317,7 @@ workflow theiacov_illumina_se {
     # QC_Check Results
     String? qc_check = qc_check_task.qc_check
     File? qc_standard = qc_check_task.qc_standard
+    # Capture percentage_mapped_reads from ivar_consensus task
+    String? percentage_mapped_reads = ivar_consensus.percentage_mapped_reads
   }
 }
\ No newline at end of file
diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl
index 984934062..7d8d29ad4 100644
--- a/workflows/theiacov/wf_theiacov_ont.wdl
+++ b/workflows/theiacov/wf_theiacov_ont.wdl
@@ -149,7 +149,7 @@ workflow theiacov_ont {
             standardized_organism = organism_parameters.standardized_organism,
             seq_method = seq_method
         }
-      }              
+      }
       # nanoplot for basic QC metrics
       call nanoplot_task.nanoplot as nanoplot_raw {
         input:
@@ -427,5 +427,7 @@ workflow theiacov_ont {
     # QC_Check Results
     String? qc_check = qc_check_task.qc_check
     File? qc_standard = qc_check_task.qc_standard
+    # Non-flu specific outputs
+    String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""])
   }
 }
\ No newline at end of file
diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl
index 71e8e952d..6bb2f8c85 100644
--- a/workflows/utilities/wf_flu_track.wdl
+++ b/workflows/utilities/wf_flu_track.wdl
@@ -114,6 +114,10 @@ workflow flu_track {
   }
   # combine HA & NA assembly coverages
   String ha_na_assembly_coverage_string = "HA: " + select_first([ha_assembly_coverage.depth, ""]) + ", NA: " + select_first([na_assembly_coverage.depth, ""])
+  
+  # combine HA & NA mapped reads percentages
+  String ha_na_percentage_mapped_reads = "HA: " + select_first([ha_assembly_coverage.percentage_mapped_reads, ""]) + ", NA: " + select_first([na_assembly_coverage.percentage_mapped_reads, ""])
+
   # ABRICATE will run if assembly is provided, or was generated with IRMA
   if (defined(irma.irma_assemblies) && defined(irma.irma_assembly_fasta)){
     call abricate.abricate_flu {
@@ -249,13 +253,14 @@ workflow flu_track {
     File? irma_mp_segment_fasta = irma.seg_mp_assembly
     File? irma_np_segment_fasta = irma.seg_np_assembly
     File? irma_ns_segment_fasta = irma.seg_ns_assembly
-
     Array[File] irma_assemblies = irma.irma_assemblies
     Array[File] irma_vcfs = irma.irma_vcfs
     Array[File] irma_bams = irma.irma_bams
     File? irma_ha_bam = irma.seg_ha_bam
     File? irma_na_bam = irma.seg_na_bam
     String ha_na_assembly_coverage = ha_na_assembly_coverage_string
+     # calulate mapped reads percentage for flu samples
+    String percentage_mapped_reads = ha_na_percentage_mapped_reads
     # GenoFLU outputs
     String? genoflu_version = genoflu.genoflu_version
     String? genoflu_genotype = genoflu.genoflu_genotype
diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl
index 8dadbd2b9..4e7112943 100644
--- a/workflows/utilities/wf_ivar_consensus.wdl
+++ b/workflows/utilities/wf_ivar_consensus.wdl
@@ -153,6 +153,8 @@ workflow ivar_consensus {
     String meanmapq_trim = select_first([stats_n_coverage_primtrim.meanmapq, stats_n_coverage.meanmapq,""])
     String assembly_mean_coverage = select_first([stats_n_coverage_primtrim.depth, stats_n_coverage.depth,""])
     String samtools_version_stats = stats_n_coverage.samtools_version
-  
+
+    # Assembly metrics
+    String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads,""])
   }
 }