From 3dda02c9f292792d06fa3cd4c7ab2daea89154c9 Mon Sep 17 00:00:00 2001 From: Fraser Combe Date: Mon, 4 Nov 2024 12:36:35 -0600 Subject: [PATCH] [TheiaCov] wfs add percentage_mapped_reads (#641) * Added percentage_mapped_reads output to ivar_consensus.wdl and updated stats_n_coverage task * update mapped reads trying read_float * get read numbers from stats file * get read numbers from stats filev2 * change from bc to awk for calculation * update awk * metric output txt instead of csv * reswitchack to read_string output t * percentage mapped reads based on trimmed bam file theiacov_ont * update theiacov-ont for mapped reads * pass output ivar cons mapped reads to wf for terra output * perc mapped reads output flu track PE, ONT and clearlabs and doc update * updated namings outputs cov_ONT and removed extra call assembly metrics * change naming output stat n coverage task * update flu mapped reads perc variable name * make theiacov_ont conditional output flu mapped reads * wdl does not support if cond in output change to select first * wdl does not support if cond in output change to select first * combine flu and non flu into same mapped reads output * correct assembled reads call * update mdsums * update clearlabs for statncov call * float? * mdsums and pe wf update flu irma defined * mdsum pe * move to flue track * tidy output pe and ont * update strings and provide default values * clean tab/spaces echo * my fav commit mdsums! * remove extra space because it was bothering me * adding a space because i am crazy * update flu output * update outputs * remove string in name * correctly name percentage mapped reads * correct output in flu track wdl * update mdsums * primtrim output * another mdsum --------- Co-authored-by: Sage Wright --- .../genomic_characterization/theiacov.md | 1 + .../task_assembly_metrics.wdl | 35 +++++++++++++++++-- .../theiacov/test_wf_theiacov_clearlabs.yml | 4 +-- .../theiacov/test_wf_theiacov_illumina_pe.yml | 4 +-- .../theiacov/test_wf_theiacov_illumina_se.yml | 4 +-- .../theiacov/test_wf_theiacov_ont.yml | 4 +-- workflows/theiacov/wf_theiacov_clearlabs.wdl | 2 ++ .../theiacov/wf_theiacov_illumina_pe.wdl | 5 +-- .../theiacov/wf_theiacov_illumina_se.wdl | 2 ++ workflows/theiacov/wf_theiacov_ont.wdl | 4 ++- workflows/utilities/wf_flu_track.wdl | 7 +++- workflows/utilities/wf_ivar_consensus.wdl | 4 ++- 12 files changed, 61 insertions(+), 15 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 2e72b86d4..ba7eac3c4 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -1157,6 +1157,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | pangolin_notes | String | Lineage notes as determined by Pangolin | CL, FASTA, ONT, PE, SE | | pangolin_versions | String | All Pangolin software and database versions | CL, FASTA, ONT, PE, SE | | percent_reference_coverage | Float | Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of the reference genome (SC2: 29903) x 100 | CL, FASTA, ONT, PE, SE | +| percentage_mapped_reads | String | Percentage of reads that successfully aligned to the reference genome. This value is calculated by number of mapped reads / total number of reads x 100. | ONT, PE, SE | | primer_bed_name | String | Name of the primer bed files used for primer trimming | CL, ONT, PE, SE | | primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | PE, SE | | qc_check | String | The results of the QC Check task | CL, FASTA, ONT, PE, SE | diff --git a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl index 9fe5f843b..762db23cf 100644 --- a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl +++ b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl @@ -14,11 +14,11 @@ task stats_n_coverage { samtools --version | head -n1 | tee VERSION samtools stats ~{bamfile} > ~{samplename}.stats.txt - samtools coverage ~{bamfile} -m -o ~{samplename}.cov.hist samtools coverage ~{bamfile} -o ~{samplename}.cov.txt samtools flagstat ~{bamfile} > ~{samplename}.flagstat.txt + # Extracting coverage, depth, meanbaseq, and meanmapq coverage=$(cut -f 6 ~{samplename}.cov.txt | tail -n 1) depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1) meanbaseq=$(cut -f 8 ~{samplename}.cov.txt | tail -n 1) @@ -33,6 +33,34 @@ task stats_n_coverage { echo $depth | tee DEPTH echo $meanbaseq | tee MEANBASEQ echo $meanmapq | tee MEANMAPQ + + # Parsing stats.txt for total and mapped reads + total_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "raw total sequences:" | cut -f 3) + mapped_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "reads mapped:" | cut -f 3) + + # Check for empty values and set defaults to avoid errors + if [ -z "$total_reads" ]; then total_reads="1"; fi # Avoid division by zero + if [ -z "$mapped_reads" ]; then mapped_reads="0"; fi + + # Calculate the percentage of mapped reads + percentage_mapped_reads=$(awk "BEGIN {printf \"%.2f\", ($mapped_reads / $total_reads) * 100}") + + # If the percentage calculation fails, default to 0.0 + if [ -z "$percentage_mapped_reads" ]; then percentage_mapped_reads="0.0"; fi + + # Output the result + echo $percentage_mapped_reads | tee PERCENTAGE_MAPPED_READS + + #output all metrics in one txt file + # Output header row (for CSV) + echo -e "Statistic\tValue" > ~{samplename}_metrics.txt + + # Output each statistic as a row + echo -e "Coverage\t$coverage" >> ~{samplename}_metrics.txt + echo -e "Depth\t$depth" >> ~{samplename}_metrics.txt + echo -e "Mean Base Quality\t$meanbaseq" >> ~{samplename}_metrics.txt + echo -e "Mean Mapping Quality\t$meanmapq" >> ~{samplename}_metrics.txt + echo -e "Percentage Mapped Reads\t$percentage_mapped_reads" >> ~{samplename}_metrics.txt >>> output { String date = read_string("DATE") @@ -45,6 +73,9 @@ task stats_n_coverage { Float depth = read_string("DEPTH") Float meanbaseq = read_string("MEANBASEQ") Float meanmapq = read_string("MEANMAPQ") + Float percentage_mapped_reads = read_string("PERCENTAGE_MAPPED_READS") + File metrics_txt = "~{samplename}_metrics.txt" + } runtime { docker: docker @@ -55,4 +86,4 @@ task stats_n_coverage { preemptible: 0 maxRetries: 3 } -} \ No newline at end of file +} diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index 599fec45e..fe20263a2 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -318,7 +318,7 @@ - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv md5sum: 151390c419b00ca44eb83e2bbfb96996 - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 51da320ddc7de2ffeb263f0ddd85ced6 + md5sum: ac020678f99ac145b11d3dbc7b9fe9ba - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -350,7 +350,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/clearlabs.stats.txt md5sum: bfed5344c91ce6f4db1f688cac0a3ab9 - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: a84f90b8877babe54bf8c068d244fbe8 + md5sum: 2974f886e1959cd5eaae5e495c91f7cc - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 4c7542334..dfef9c994 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -205,7 +205,7 @@ md5sum: 511e696afe25f8b96a84d68ec5a8af8a # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 260c3887be6d99b18caf6d3914c5737f + md5sum: 1c61b89c2a94e87518a6679a04885341 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -288,7 +288,7 @@ md5sum: 6c63395a125f8618334b8af2de4e2d88 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 1ffac4cc3e9bdd84a0f9228e8e5ca5d9 + md5sum: e49a297b1c0eb195a2acd80f00672668 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 0742c19a9..22453bdd5 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -157,7 +157,7 @@ md5sum: 603c3cbc771ca910b96d3c032aafe7c9 # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 67cac223adcf059a9dfaa9f28ed34f68 + md5sum: affacdcfda48ad5e371a4510f19520bd - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -240,7 +240,7 @@ md5sum: 03c5ecf22fdfdb6b240ac3880281a056 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 3dacccb252429a0ff46c079a75a09377 + md5sum: cb4de0e459b3fada21bcf08a8dbea89f - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml index 6077323b9..333f39b90 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml @@ -225,7 +225,7 @@ md5sum: 32c0be4fb7f3030bf9c74c0a836d4f2e - path: miniwdl_run/call-raw_check_reads/work/_miniwdl_inputs/0/ont.fastq.gz - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 93414eacbbb9d7c4813bb54a8a507078 + md5sum: fbd85e82af1bbfaa734a13a9c1394300 - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -250,7 +250,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/ont.flagstat.txt - path: miniwdl_run/call-stats_n_coverage/work/ont.stats.txt - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: c6e7de70dfdbb1858229e44777b84110 + md5sum: 3689a902aa96e8c132e6ef4946699e61 - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 8932c983f..0368bef95 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -207,6 +207,8 @@ workflow theiacov_clearlabs { Int number_Degenerate = consensus_qc.number_Degenerate Int number_Total = consensus_qc.number_Total Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Percentage mapped reads + Float percentage_mapped_reads = stats_n_coverage.percentage_mapped_reads # SC2 specific coverage outputs Float? sc2_s_gene_mean_coverage = gene_coverage.sc2_s_gene_depth Float? sc2_s_gene_percent_coverage = gene_coverage.sc2_s_gene_percent_coverage diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index ece3c201a..5f5e5b651 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -144,7 +144,7 @@ workflow theiacov_illumina_pe { trim_primers = trim_primers } } - # perform flu-specific tasks + # for flu organisms call flu_track if (organism_parameters.standardized_organism == "flu") { call run_flu_track.flu_track { input: @@ -439,6 +439,7 @@ workflow theiacov_illumina_pe { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard - + # Capture percentage_mapped_reads from ivar_consensus task or flu_track task + String percentage_mapped_reads = select_first([ivar_consensus.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""]) } } diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index fa1044c24..4ae59f5dc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -317,5 +317,7 @@ workflow theiacov_illumina_se { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Capture percentage_mapped_reads from ivar_consensus task + String? percentage_mapped_reads = ivar_consensus.percentage_mapped_reads } } \ No newline at end of file diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 984934062..7d8d29ad4 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -149,7 +149,7 @@ workflow theiacov_ont { standardized_organism = organism_parameters.standardized_organism, seq_method = seq_method } - } + } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { input: @@ -427,5 +427,7 @@ workflow theiacov_ont { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Non-flu specific outputs + String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads, flu_track.percentage_mapped_reads, ""]) } } \ No newline at end of file diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 71e8e952d..6bb2f8c85 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -114,6 +114,10 @@ workflow flu_track { } # combine HA & NA assembly coverages String ha_na_assembly_coverage_string = "HA: " + select_first([ha_assembly_coverage.depth, ""]) + ", NA: " + select_first([na_assembly_coverage.depth, ""]) + + # combine HA & NA mapped reads percentages + String ha_na_percentage_mapped_reads = "HA: " + select_first([ha_assembly_coverage.percentage_mapped_reads, ""]) + ", NA: " + select_first([na_assembly_coverage.percentage_mapped_reads, ""]) + # ABRICATE will run if assembly is provided, or was generated with IRMA if (defined(irma.irma_assemblies) && defined(irma.irma_assembly_fasta)){ call abricate.abricate_flu { @@ -249,13 +253,14 @@ workflow flu_track { File? irma_mp_segment_fasta = irma.seg_mp_assembly File? irma_np_segment_fasta = irma.seg_np_assembly File? irma_ns_segment_fasta = irma.seg_ns_assembly - Array[File] irma_assemblies = irma.irma_assemblies Array[File] irma_vcfs = irma.irma_vcfs Array[File] irma_bams = irma.irma_bams File? irma_ha_bam = irma.seg_ha_bam File? irma_na_bam = irma.seg_na_bam String ha_na_assembly_coverage = ha_na_assembly_coverage_string + # calulate mapped reads percentage for flu samples + String percentage_mapped_reads = ha_na_percentage_mapped_reads # GenoFLU outputs String? genoflu_version = genoflu.genoflu_version String? genoflu_genotype = genoflu.genoflu_genotype diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl index 8dadbd2b9..4e7112943 100644 --- a/workflows/utilities/wf_ivar_consensus.wdl +++ b/workflows/utilities/wf_ivar_consensus.wdl @@ -153,6 +153,8 @@ workflow ivar_consensus { String meanmapq_trim = select_first([stats_n_coverage_primtrim.meanmapq, stats_n_coverage.meanmapq,""]) String assembly_mean_coverage = select_first([stats_n_coverage_primtrim.depth, stats_n_coverage.depth,""]) String samtools_version_stats = stats_n_coverage.samtools_version - + + # Assembly metrics + String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads,""]) } }