From 3e57b3dc2c2d3ff366867803b0dc6f8f04f988b8 Mon Sep 17 00:00:00 2001 From: jrotieno Date: Thu, 22 Aug 2024 20:18:22 +0000 Subject: [PATCH 01/15] capture qc metrics from snippy variants in snippy_tree and snippy_streamline --- .../task_snippy_variants.wdl | 30 +++++++++++++++++++ .../phylogenetics/wf_snippy_streamline.wdl | 3 +- workflows/phylogenetics/wf_snippy_tree.wdl | 12 ++++++++ .../standalone_modules/wf_snippy_variants.wdl | 1 + 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl index c04da5116..cc8d7a491 100644 --- a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl +++ b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl @@ -84,6 +84,35 @@ task snippy_variants { echo $reference_length_passed_depth $reference_length | awk '{ print ($1/$2)*100 }' > PERCENT_REF_COVERAGE fi + # create qc metrics file + line_count=$(wc -l < "~{samplename}/~{samplename}_coverage.tsv") + # Check the number of lines in the file, to consider scenarios e.g. for V. 
cholerae that has two chromosomes and therefore coverage metrics per chromosome + if [ "$line_count" -eq 2 ]; then + head -n 1 "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_HEADER + sed -n '2p' "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_VALUES + else + header=$(head -n 1 "~{samplename}/~{samplename}_coverage.tsv") + output_header="$header" + output_values="" + + for (( i=2; i<=$line_count; i++ )) + do + #output_header="$output_header" + values=$(sed -n "${i}p" "~{samplename}/~{samplename}_coverage.tsv") + if [ -z "$output_values" ]; then + output_values="$values" + else + output_header="$output_header\t$header" + output_values="$output_values\t$values" + fi + done + echo "$output_header" | tr ' ' '\t' > COVERAGE_HEADER + echo "$output_values" | tr ' ' '\t' > COVERAGE_VALUES + fi + + echo -e "samplename\treads_aligned_to_reference\tvariants_total\tpercent_ref_coverage\t$(cat COVERAGE_HEADER)" > "~{samplename}/~{samplename}_qc_metrics.tsv" + echo -e "~{samplename}\t$(cat READS_ALIGNED_TO_REFERENCE)\t$(cat VARIANTS_TOTAL)\t$(cat PERCENT_REF_COVERAGE)\t$(cat COVERAGE_VALUES)" >> "~{samplename}/~{samplename}_qc_metrics.tsv" + >>> output { String snippy_variants_version = read_string("VERSION") @@ -102,6 +131,7 @@ task snippy_variants { String snippy_variants_ref_length = read_string("REFERENCE_LENGTH") String snippy_variants_ref_length_passed_depth = read_string("REFERENCE_LENGTH_PASSED_DEPTH") String snippy_variants_percent_ref_coverage = read_string("PERCENT_REF_COVERAGE") + File snippy_variants_qc_metrics = "~{samplename}/~{samplename}_qc_metrics.tsv" } runtime { docker: "~{docker}" diff --git a/workflows/phylogenetics/wf_snippy_streamline.wdl b/workflows/phylogenetics/wf_snippy_streamline.wdl index 08e6bf531..56353edde 100644 --- a/workflows/phylogenetics/wf_snippy_streamline.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline.wdl @@ -50,7 +50,8 @@ workflow snippy_streamline { tree_name = tree_name_updated, 
snippy_variants_outdir_tarball = snippy_variants_wf.snippy_variants_outdir_tarball, samplenames = samplenames, - reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]) + reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]), + snippy_variants_qc_metrics = snippy_variants_wf.snippy_variants_qc_metrics } call versioning.version_capture { input: diff --git a/workflows/phylogenetics/wf_snippy_tree.wdl b/workflows/phylogenetics/wf_snippy_tree.wdl index dc19b1bff..a0a3e278b 100644 --- a/workflows/phylogenetics/wf_snippy_tree.wdl +++ b/workflows/phylogenetics/wf_snippy_tree.wdl @@ -24,6 +24,7 @@ workflow snippy_tree_wf { Boolean use_gubbins = true Boolean core_genome = true Boolean call_shared_variants = true + Array[File]? snippy_variants_qc_metrics String? data_summary_terra_project String? data_summary_terra_workspace @@ -186,6 +187,14 @@ workflow snippy_tree_wf { concatenated_file_name = tree_name_updated } } + if (defined(snippy_variants_qc_metrics)) { + call file_handling.cat_files as concatenate_qc_metrics { + input: + files_to_cat = select_first([snippy_variants_qc_metrics]), + concatenated_file_name = tree_name_updated, + skip_extra_headers = true + } + } call versioning.version_capture { input: } @@ -233,5 +242,8 @@ workflow snippy_tree_wf { # shared snps outputs File? snippy_concatenated_variants = concatenate_variants.concatenated_variants File? snippy_shared_variants_table = shared_variants.shared_variants_table + + # combined qc metrics + File? 
snippy_combined_qc_metrics = concatenate_qc_metrics.concatenated_files } } diff --git a/workflows/standalone_modules/wf_snippy_variants.wdl b/workflows/standalone_modules/wf_snippy_variants.wdl index 9f981df51..47dfd7cc1 100644 --- a/workflows/standalone_modules/wf_snippy_variants.wdl +++ b/workflows/standalone_modules/wf_snippy_variants.wdl @@ -67,6 +67,7 @@ workflow snippy_variants_wf { File snippy_variants_coverage_tsv = snippy_variants.snippy_variants_coverage_tsv Int snippy_variants_num_variants = snippy_variants.snippy_variants_num_variants Float snippy_variants_percent_ref_coverage = snippy_variants.snippy_variants_percent_ref_coverage + File snippy_variants_qc_metrics = snippy_variants.snippy_variants_qc_metrics # snippy gene query outputs String? snippy_variants_query = snippy_gene_query.snippy_variants_query String? snippy_variants_query_check = snippy_gene_query.snippy_variants_query_check From 247f9464bf54b149ec3b7e4374412298150e1bdd Mon Sep 17 00:00:00 2001 From: jrotieno Date: Thu, 22 Aug 2024 20:42:30 +0000 Subject: [PATCH 02/15] adding combined qc output to snippy streamline --- workflows/phylogenetics/wf_snippy_streamline.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/phylogenetics/wf_snippy_streamline.wdl b/workflows/phylogenetics/wf_snippy_streamline.wdl index 56353edde..7d9310956 100644 --- a/workflows/phylogenetics/wf_snippy_streamline.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline.wdl @@ -109,5 +109,6 @@ workflow snippy_streamline { File? snippy_filtered_metadata = snippy_tree_wf.snippy_filtered_metadata File? snippy_concatenated_variants = snippy_tree_wf.snippy_concatenated_variants File? snippy_shared_variants_table = snippy_tree_wf.snippy_shared_variants_table + File? 
snippy_snippy_variants_qc_metrics = snippy_tree_wf.snippy_snippy_variants_qc_metrics } } \ No newline at end of file From 8faa84f0cc460cdc5bf5ae7a13e80eb3915c9ddf Mon Sep 17 00:00:00 2001 From: jrotieno Date: Thu, 22 Aug 2024 20:47:02 +0000 Subject: [PATCH 03/15] correct output --- workflows/phylogenetics/wf_snippy_streamline.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/phylogenetics/wf_snippy_streamline.wdl b/workflows/phylogenetics/wf_snippy_streamline.wdl index 7d9310956..e25c15a45 100644 --- a/workflows/phylogenetics/wf_snippy_streamline.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline.wdl @@ -109,6 +109,6 @@ workflow snippy_streamline { File? snippy_filtered_metadata = snippy_tree_wf.snippy_filtered_metadata File? snippy_concatenated_variants = snippy_tree_wf.snippy_concatenated_variants File? snippy_shared_variants_table = snippy_tree_wf.snippy_shared_variants_table - File? snippy_snippy_variants_qc_metrics = snippy_tree_wf.snippy_snippy_variants_qc_metrics + File? 
snippy_combined_qc_metrics = snippy_tree_wf.snippy_combined_qc_metrics } } \ No newline at end of file From 747f5aa09c45daa25ee708e74fb94c172f407d34 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 7 Oct 2024 15:46:04 -0500 Subject: [PATCH 04/15] merge conflicts outputs snippy tidy up --- tasks/gene_typing/variant_detection/task_snippy_variants.wdl | 2 ++ workflows/standalone_modules/wf_snippy_variants.wdl | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl index b2f1f0af3..66fa27e7d 100644 --- a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl +++ b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl @@ -149,6 +149,8 @@ task snippy_variants { String snippy_variants_ref_length = read_string("REFERENCE_LENGTH") String snippy_variants_ref_length_passed_depth = read_string("REFERENCE_LENGTH_PASSED_DEPTH") String snippy_variants_percent_ref_coverage = read_string("PERCENT_REF_COVERAGE") + File snippy_variants_qc_metrics = "~{samplename}/~{samplename}_qc_metrics.tsv" + String snippy_variants_percent_reads_aligned = read_string("PERCENT_READS_ALIGNED") } runtime { docker: "~{docker}" diff --git a/workflows/standalone_modules/wf_snippy_variants.wdl b/workflows/standalone_modules/wf_snippy_variants.wdl index c9d83cb25..e53e5f770 100644 --- a/workflows/standalone_modules/wf_snippy_variants.wdl +++ b/workflows/standalone_modules/wf_snippy_variants.wdl @@ -70,11 +70,8 @@ workflow snippy_variants_wf { File snippy_variants_coverage_tsv = snippy_variants.snippy_variants_coverage_tsv Int snippy_variants_num_variants = snippy_variants.snippy_variants_num_variants Float snippy_variants_percent_ref_coverage = snippy_variants.snippy_variants_percent_ref_coverage -<<<<<<< HEAD File snippy_variants_qc_metrics = snippy_variants.snippy_variants_qc_metrics -======= Float snippy_variants_percent_reads_aligned = 
snippy_variants.snippy_variants_percent_reads_aligned ->>>>>>> main # snippy gene query outputs String? snippy_variants_query = snippy_gene_query.snippy_variants_query String? snippy_variants_query_check = snippy_gene_query.snippy_variants_query_check From 74e733f55970deb8f49bd833817ffc0cf2fdbb1f Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 16 Oct 2024 11:35:43 -0500 Subject: [PATCH 05/15] updates qc metrics logic and elif --- .../task_snippy_variants.wdl | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl index 66fa27e7d..458095964 100644 --- a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl +++ b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl @@ -85,51 +85,54 @@ task snippy_variants { reference_length_passed_depth=$(cat "~{samplename}/~{samplename}_depth_~{min_coverage}.tsv" | wc -l) echo $reference_length_passed_depth | tee REFERENCE_LENGTH_PASSED_DEPTH - # check if reference_length is equal to 0, if so, output a warning if [ "$reference_length" -eq 0 ]; then - echo "Could not compute percent reference coverage: reference length is 0" > PERCENT_REF_COVERAGE + echo "0" > PERCENT_REF_COVERAGE else - # compute percent reference coverage - echo $reference_length_passed_depth $reference_length | awk '{ print ($1/$2)*100 }' > PERCENT_REF_COVERAGE + echo $reference_length_passed_depth $reference_length | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_REF_COVERAGE fi # Compute percentage of reads aligned reads_aligned=$(cat READS_ALIGNED_TO_REFERENCE) total_reads=$(samtools view -c "~{samplename}/~{samplename}.bam") + echo $total_reads > TOTAL_READS if [ "$total_reads" -eq 0 ]; then - echo "Could not compute percent reads aligned: total reads is 0" > PERCENT_READS_ALIGNED + echo "0" > PERCENT_READS_ALIGNED else - echo $reads_aligned $total_reads | awk '{ print ($1/$2)*100 }' 
> PERCENT_READS_ALIGNED + echo $reads_aligned $total_reads | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_READS_ALIGNED fi - # create qc metrics file + # Create QC metrics file line_count=$(wc -l < "~{samplename}/~{samplename}_coverage.tsv") - # Check the number of lines in the file, to consider scenarios e.g. for V. cholerae that has two chromosomes and therefore coverage metrics per chromosome + # Check the number of lines in the coverage file, to consider scenarios e.g. for V. cholerae that has two chromosomes and therefore coverage metrics per chromosome if [ "$line_count" -eq 2 ]; then head -n 1 "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_HEADER sed -n '2p' "~{samplename}/~{samplename}_coverage.tsv" | tr ' ' '\t' > COVERAGE_VALUES - else + elif [ "$line_count" -gt 2 ]; then + # Multiple chromosomes (header + multiple data lines) header=$(head -n 1 "~{samplename}/~{samplename}_coverage.tsv") - output_header="$header" + output_header="" output_values="" - - for (( i=2; i<=$line_count; i++ )) - do - #output_header="$output_header" - values=$(sed -n "${i}p" "~{samplename}/~{samplename}_coverage.tsv") - if [ -z "$output_values" ]; then - output_values="$values" + # while loop to iterate over each line in the coverage file + while read -r line; do + if [ -z "$output_header" ]; then + output_header="$header" + output_values="$line" else output_header="$output_header\t$header" - output_values="$output_values\t$values" + output_values="$output_values\t$line" fi - done + done < <(tail -n +2 "~{samplename}/~{samplename}_coverage.tsv") echo "$output_header" | tr ' ' '\t' > COVERAGE_HEADER echo "$output_values" | tr ' ' '\t' > COVERAGE_VALUES + else + # Coverage file has insufficient data + echo "Coverage file has insufficient data." 
> COVERAGE_HEADER + echo "" > COVERAGE_VALUES fi - echo -e "samplename\treads_aligned_to_reference\tvariants_total\tpercent_ref_coverage\t$(cat COVERAGE_HEADER)" > "~{samplename}/~{samplename}_qc_metrics.tsv" - echo -e "~{samplename}\t$(cat READS_ALIGNED_TO_REFERENCE)\t$(cat VARIANTS_TOTAL)\t$(cat PERCENT_REF_COVERAGE)\t$(cat COVERAGE_VALUES)" >> "~{samplename}/~{samplename}_qc_metrics.tsv" + # Build the QC metrics file + echo -e "samplename\treads_aligned_to_reference\ttotal_reads\tpercent_reads_aligned\tvariants_total\tpercent_ref_coverage\t$(cat COVERAGE_HEADER)" > "~{samplename}/~{samplename}_qc_metrics.tsv" + echo -e "~{samplename}\t$reads_aligned\t$total_reads\t$(cat PERCENT_READS_ALIGNED)\t$(cat VARIANTS_TOTAL)\t$(cat PERCENT_REF_COVERAGE)\t$(cat COVERAGE_VALUES)" >> "~{samplename}/~{samplename}_qc_metrics.tsv" >>> output { From d523f83159204006f2db1b67fd3071efbd3b69a5 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 17 Oct 2024 09:45:40 -0500 Subject: [PATCH 06/15] update qc metrics combined so output is a tsv file anme in snippytree wf --- workflows/phylogenetics/wf_snippy_tree.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/phylogenetics/wf_snippy_tree.wdl b/workflows/phylogenetics/wf_snippy_tree.wdl index 6c6bc5cfb..b64ef5eba 100644 --- a/workflows/phylogenetics/wf_snippy_tree.wdl +++ b/workflows/phylogenetics/wf_snippy_tree.wdl @@ -184,7 +184,7 @@ workflow snippy_tree_wf { call shared_variants_task.shared_variants { input: concatenated_variants = concatenate_variants.concatenated_variants, - concatenated_file_name = tree_name_updated + concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv", } } if (defined(snippy_variants_qc_metrics)) { From dc33edc1b6491cbfd6054d8646231259619078ba Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 17 Oct 2024 11:14:14 -0500 Subject: [PATCH 07/15] update docs for snippy streamline and snippy tree.md --- 
docs/workflows/phylogenetic_construction/snippy_streamline.md | 1 + docs/workflows/phylogenetic_construction/snippy_tree.md | 1 + workflows/phylogenetics/wf_snippy_tree.wdl | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index 744b59482..a777641b5 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -188,6 +188,7 @@ For all cases: | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | The concatenated variants file | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Final phylogenetic tree produced by Snippy_Streamline | diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index 86a19304c..574fffabf 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -312,6 +312,7 @@ Sequencing data used in the Snippy_Tree workflow must: |---|---|---| | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | Concatenated snippy_results file across all samples in the set | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Newick tree produced from the final alignment. Depending on user input for core_genome, the tree could be a core genome tree (default when core_genome is true) or whole genome tree (if core_genome is false) | diff --git a/workflows/phylogenetics/wf_snippy_tree.wdl b/workflows/phylogenetics/wf_snippy_tree.wdl index b64ef5eba..473e046bc 100644 --- a/workflows/phylogenetics/wf_snippy_tree.wdl +++ b/workflows/phylogenetics/wf_snippy_tree.wdl @@ -184,14 +184,14 @@ workflow snippy_tree_wf { call shared_variants_task.shared_variants { input: concatenated_variants = concatenate_variants.concatenated_variants, - concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv", + concatenated_file_name = tree_name_updated } } if (defined(snippy_variants_qc_metrics)) { call file_handling.cat_files as concatenate_qc_metrics { input: files_to_cat = select_first([snippy_variants_qc_metrics]), - concatenated_file_name = tree_name_updated, + concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv" skip_extra_headers = true } } From ee2014a727eaf8aa0382b5e8015021c61adf8288 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Thu, 17 Oct 2024 11:21:37 -0500 Subject: [PATCH 08/15] forgot the comma --- workflows/phylogenetics/wf_snippy_tree.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/phylogenetics/wf_snippy_tree.wdl b/workflows/phylogenetics/wf_snippy_tree.wdl index 473e046bc..acd4ec53f 100644 --- a/workflows/phylogenetics/wf_snippy_tree.wdl +++ 
b/workflows/phylogenetics/wf_snippy_tree.wdl @@ -191,7 +191,7 @@ workflow snippy_tree_wf { call file_handling.cat_files as concatenate_qc_metrics { input: files_to_cat = select_first([snippy_variants_qc_metrics]), - concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv" + concatenated_file_name = tree_name_updated + "_combined_qc_metrics.tsv", skip_extra_headers = true } } From deebfd4f76b3b352a88b2b0547917dcb90561bbc Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Mon, 21 Oct 2024 08:37:26 -0500 Subject: [PATCH 09/15] update error messages snippy variants --- tasks/gene_typing/variant_detection/task_snippy_variants.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl index 458095964..eea38ac1d 100644 --- a/tasks/gene_typing/variant_detection/task_snippy_variants.wdl +++ b/tasks/gene_typing/variant_detection/task_snippy_variants.wdl @@ -85,8 +85,9 @@ task snippy_variants { reference_length_passed_depth=$(cat "~{samplename}/~{samplename}_depth_~{min_coverage}.tsv" | wc -l) echo $reference_length_passed_depth | tee REFERENCE_LENGTH_PASSED_DEPTH + # check if reference_length is equal to 0, if so, output a warning if [ "$reference_length" -eq 0 ]; then - echo "0" > PERCENT_REF_COVERAGE + echo "Could not compute percent reference coverage: reference length is 0" > PERCENT_REF_COVERAGE else echo $reference_length_passed_depth $reference_length | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_REF_COVERAGE fi @@ -96,7 +97,7 @@ task snippy_variants { total_reads=$(samtools view -c "~{samplename}/~{samplename}.bam") echo $total_reads > TOTAL_READS if [ "$total_reads" -eq 0 ]; then - echo "0" > PERCENT_READS_ALIGNED + echo "Could not compute percent reads aligned: total reads is 0" > PERCENT_READS_ALIGNED else echo $reads_aligned $total_reads | awk '{ printf("%.2f", ($1/$2)*100) }' > PERCENT_READS_ALIGNED fi From 
c52745ecfa2e5e4d7808479279feba31b667b970 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 22 Oct 2024 17:59:44 -0500 Subject: [PATCH 10/15] updates streamline_fasta and docs --- .../phylogenetic_construction/snippy_streamline_fasta.md | 1 + docs/workflows/phylogenetic_construction/snippy_variants.md | 1 + workflows/phylogenetics/wf_snippy_streamline_fasta.wdl | 2 ++ 3 files changed, 4 insertions(+) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index 11f482891..6b33499ff 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -117,6 +117,7 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a | snippy_centroid_samplename | String | Name of the centroid sample | | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | | snippy_concatenated_variants | File | The concatenated variants file | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md index b1fc18885..8b041ed1a 100644 --- a/docs/workflows/phylogenetic_construction/snippy_variants.md +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -68,6 +68,7 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f | **Variable** | **Type** | **Description** | |---|---|---| +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | | snippy_variants_bai | File | Indexed bam file of the reads aligned to the reference | | snippy_variants_bam | File | Bam file of reads aligned to the reference | | snippy_variants_coverage_tsv | File | Coverage statistics TSV file output by the `samtools coverage` command, providing genome-wide metrics such as the proportion of bases covered (depth ≥ 1), mean depth, and other related statistics. | diff --git a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl index 1cc3568ab..a8eef4f97 100644 --- a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl @@ -49,6 +49,7 @@ workflow snippy_streamline_fasta { snippy_variants_outdir_tarball = snippy_variants_wf.snippy_variants_outdir_tarball, samplenames = samplenames, reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]) + snippy_variants_qc_metrics = snippy_variants_wf.snippy_variants_qc_metrics } call versioning.version_capture { input: @@ -106,5 +107,6 @@ workflow snippy_streamline_fasta { File? snippy_filtered_metadata = snippy_tree_wf.snippy_filtered_metadata File? snippy_concatenated_variants = snippy_tree_wf.snippy_concatenated_variants File? snippy_shared_variants_table = snippy_tree_wf.snippy_shared_variants_table + File? 
snippy_combined_qc_metrics = snippy_tree_wf.snippy_combined_qc_metrics } } \ No newline at end of file From b7ecc00cd4249f732e244b5e807e926ed46aeff8 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 22 Oct 2024 19:57:54 -0500 Subject: [PATCH 11/15] forgot the comma --- workflows/phylogenetics/wf_snippy_streamline_fasta.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl index a8eef4f97..d4b580691 100644 --- a/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl +++ b/workflows/phylogenetics/wf_snippy_streamline_fasta.wdl @@ -48,7 +48,7 @@ workflow snippy_streamline_fasta { tree_name = tree_name_updated, snippy_variants_outdir_tarball = snippy_variants_wf.snippy_variants_outdir_tarball, samplenames = samplenames, - reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]) + reference_genome_file = select_first([reference_genome_file, ncbi_datasets_download_genome_accession.ncbi_datasets_assembly_fasta]), snippy_variants_qc_metrics = snippy_variants_wf.snippy_variants_qc_metrics } call versioning.version_capture { From b45dde689999d0b2e0f183858a399ebb035717b1 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 25 Oct 2024 10:39:43 -0500 Subject: [PATCH 12/15] update documentation snippy qc description --- .../phylogenetic_construction/snippy_streamline.md | 6 ++++++ .../phylogenetic_construction/snippy_streamline_fasta.md | 8 ++++++++ .../phylogenetic_construction/snippy_variants.md | 8 +++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index a777641b5..65d955422 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -169,6 
+169,12 @@ For all cases: `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. +??? task "snippy_variants" (qc_metrics output) + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + ??? task "Snippy_Tree workflow" ##### Snippy_Tree {#snippy_tree} diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index 6b33499ff..c7667806c 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -107,6 +107,14 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a | version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | | version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | +### Workflow Tasks + +??? task "snippy_variants" (qc_metrics output) + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). 
These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + ### Outputs | **Variable** | **Type** | **Description** | diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md index 8b041ed1a..1137f4c9f 100644 --- a/docs/workflows/phylogenetic_construction/snippy_variants.md +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -58,6 +58,13 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f `Snippy_Variants` uses the snippy tool to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. +Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: + +- **Percentage of reads aligned to the reference genome** (`snippy_variants_percent_reads_aligned`). +- **Percentage of the reference genome covered at or above the specified depth threshold** (`snippy_variants_percent_ref_coverage`). + +These per-sample QC metrics can be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree_wf`, providing an overview of QC metrics across all samples. + ### Outputs !!! 
tip "Visualize your outputs in IGV" @@ -68,7 +75,6 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f | **Variable** | **Type** | **Description** | |---|---|---| -| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | | snippy_variants_bai | File | Indexed bam file of the reads aligned to the reference | | snippy_variants_bam | File | Bam file of reads aligned to the reference | | snippy_variants_coverage_tsv | File | Coverage statistics TSV file output by the `samtools coverage` command, providing genome-wide metrics such as the proportion of bases covered (depth ≥ 1), mean depth, and other related statistics. | From 4be3a5b363dd760a43d75f461b545dea158f33ce Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Fri, 25 Oct 2024 11:05:31 -0500 Subject: [PATCH 13/15] moved task snippy variants up a few lines --- .../snippy_streamline_fasta.md | 16 ++++++++-------- .../phylogenetic_construction/snippy_tree.md | 6 ++++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index c7667806c..f03695d44 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -37,6 +37,14 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a **If reference genomes have multiple contigs, they will not be compatible with using Gubbins** to mask recombination in the phylogenetic tree. The automatic selection of a reference genome by the workflow may result in a reference with multiple contigs. In this case, an alternative reference genome should be sought. +### Workflow Tasks + +??? task "snippy_variants" (qc_metrics output) + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). 
These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + ### Inputs | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | @@ -107,14 +115,6 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a | version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | | version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | -### Workflow Tasks - -??? task "snippy_variants" (qc_metrics output) - - ##### snippy_variants {#snippy_variants} - - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. - ### Outputs | **Variable** | **Type** | **Description** | diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index 574fffabf..400376292 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -306,6 +306,12 @@ Sequencing data used in the Snippy_Tree workflow must: | Task | task_shared_variants.wdl | | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | +??? task "snippy_variants" (qc_metrics output) + + ##### snippy_variants {#snippy_variants} + + This task runs Snippy to perform SNP analysis on individual samples. 
It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + ### Outputs | **Variable** | **Type** | **Description** | From b4398bd9b75a0a2be9b76324686482866e76979c Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Tue, 5 Nov 2024 14:48:00 -0600 Subject: [PATCH 14/15] update docs for qc metrics output table and task dropdown --- .../snippy_streamline.md | 26 ++++++++++++++++--- .../phylogenetic_construction/snippy_tree.md | 26 ++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index 65d955422..977745905 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -169,11 +169,31 @@ For all cases: `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. -??? task "snippy_variants" (qc_metrics output) +??? task "snippy_variants (qc_metrics output)" ##### snippy_variants {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + This task runs Snippy to perform SNP analysis on individual samples. 
It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads aligned to the reference sequence (after filtering). + - **covbases**: Number of bases in the reference sequence covered at a depth of at least 1. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality in the covered region of the reference sequence. + - **meanmapq**: Mean mapping quality of the reads aligned to the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. 
Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. ??? task "Snippy_Tree workflow" @@ -194,7 +214,7 @@ For all cases: | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | The concatenated variants file | -| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Final phylogenetic tree produced by Snippy_Streamline | diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index 400376292..b6ed9bb66 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -306,11 +306,31 @@ Sequencing data used in the Snippy_Tree workflow must: | Task | task_shared_variants.wdl | | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | -??? task "snippy_variants" (qc_metrics output) +??? task "snippy_variants (qc_metrics output)" ##### snippy_variants {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). 
These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads aligned to the reference sequence (after filtering). + - **covbases**: Number of bases in the reference sequence covered at a depth of at least 1. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality in the covered region of the reference sequence. + - **meanmapq**: Mean mapping quality of the reads aligned to the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. 
### Outputs @@ -318,7 +338,7 @@ Sequencing data used in the Snippy_Tree workflow must: |---|---|---| | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | | snippy_concatenated_variants | File | Concatenated snippy_results file across all samples in the set | -| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) | | snippy_final_tree | File | Newick tree produced from the final alignment. Depending on user input for core_genome, the tree could be a core genome tree (default when core_genome is true) or whole genome tree (if core_genome is false) | From 4862f9475d1f4edbe7accc829841c2da5a573905 Mon Sep 17 00:00:00 2001 From: fraser-combe Date: Wed, 6 Nov 2024 10:49:21 -0600 Subject: [PATCH 15/15] update streamline fasta docs --- .../snippy_streamline_fasta.md | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index f03695d44..ca544c398 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -39,11 +39,31 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a ### Workflow Tasks -??? task "snippy_variants" (qc_metrics output) +??? task "snippy_variants (qc_metrics output)" ##### snippy_variants {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). 
These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. + This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. + - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads aligned to the reference sequence (after filtering). + - **covbases**: Number of bases in the reference sequence covered at a depth of at least 1. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality in the covered region of the reference sequence. + - **meanmapq**: Mean mapping quality of the reads aligned to the reference sequence. + + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. 
+ + **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. ### Inputs @@ -125,7 +145,7 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a | snippy_centroid_samplename | String | Name of the centroid sample | | snippy_centroid_version | String | Centroid version used | | snippy_cg_snp_matrix | File | CSV file of core genome pairwise SNP distances between samples, calculated from the final alignment | -| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. The file is a tab-separated values (TSV) file with the following columns:
- samplename
- reads_aligned_to_reference
- total_reads
- percent_reads_aligned
- variants_total
- percent_ref_coverage
- #rname
- startpos
- endpos
- numreads
- covbases
- coverage
- meandepth
- meanbaseq
- meanmapq

The last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. | +| snippy_combined_qc_metrics | File | Combined QC metrics file containing concatenated QC metrics from all samples. | | snippy_concatenated_variants | File | The concatenated variants file | | snippy_filtered_metadata | File | TSV recording the columns of the Terra data table that were used in the summarize_data task | | snippy_final_alignment | File | Final alignment (FASTA file) used to generate the tree (either after snippy alignment, gubbins recombination removal, and/or core site selection with SNP-sites) |