From fd1be2e91cc8944da0239284762be9faba91c984 Mon Sep 17 00:00:00 2001 From: Michal Babinski Date: Wed, 4 Dec 2024 11:56:34 -0500 Subject: [PATCH] [Documentation] update dataset tags (#681) * Update nextclade dataset tags and pangolin docker version for TheiaCoV workflows * Add searchable table for organism-specific parameters in TheiaCoV documentation * Enhance TheiaCoV documentation with searchable tables for organism-specific parameters --- .../genomic_characterization/theiacov.md | 50 +++++++++++++++---- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 319a32ad8..74f5de668 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -481,34 +481,44 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo The following tables include the relevant organism-specific parameters; **all of these default values can be overwritten by providing a value for the "Overwrite Variable Name" field**. ??? toggle "SARS-CoV-2 Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed"` | | genome_length_input | sars-cov-2 | `29903` | | nextclade_dataset_name_input | sars-cov-2 | `"nextstrain/sars-cov-2/wuhan-hu-1/orfs"` | - | nextclade_dataset_tag_input | sars-cov-2 | `"2024-07-17--12-57-03Z"` | - | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 "`| + | nextclade_dataset_tag_input | sars-cov-2 | `"2024-11-19--14-18-53Z"` | + | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31"`| | reference_genome | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta"` | | vadr_max_length | sars-cov-2 | `30000` | | vadr_mem | sars-cov-2 | `8` | | vadr_options | sars-cov-2 | `"--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta"` | +
+ ??? toggle "Mpox Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed"` | | genome_length_input | MPXV | `197200` | | kraken_target_organism_input | MPXV | `"Monkeypox virus"` | | nextclade_dataset_name_input | MPXV | `"nextstrain/mpox/lineage-b.1"` | - | nextclade_dataset_tag_input | MPXV | `"2024-04-19--07-50-39Z"` | + | nextclade_dataset_tag_input | MPXV | `"2024-11-19--14-18-53Z"` | | primer_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed"` | | reference_genome | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta"` | | reference_gff_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3"` | | vadr_max_length | MPXV | `210000` | | vadr_mem | MPXV | `8` | | vadr_options | MPXV | `"--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150"` | + +
??? toggle "WNV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | genome_length_input | WNV | `11000` | | @@ -521,7 +531,11 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | WNV | `8` | | | vadr_options | WNV | `"--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta"` | | +
+ ??? toggle "Flu Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Flu Segment** | **Flu Subtype** | **Default Value** | **Notes** | |---|---|---|---|---|---| | flu_segment | flu | all | all | N/A | TheiaCoV will attempt to automatically assign a flu segment | @@ -531,13 +545,13 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | flu | all | all | `8` | | | vadr_options | flu | all | all | `"--atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu"` | | | nextclade_dataset_name_input | flu | ha | h1n1 | `"nextstrain/flu/h1n1pdm/ha/MW626062"` | | - | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | h3n2 | `"nextstrain/flu/h3n2/ha/EPI1857216"` | | - | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-08-08--05-08-21Z"` | | + | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | victoria | `"nextstrain/flu/vic/ha/KX058884"` | | - | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | ha | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | yamagata | `"nextstrain/flu/yam/ha/JN993010"` | | | nextclade_dataset_tag_input | flu | ha | yamagata | `"2024-01-30--16-34-55Z"` | | @@ -546,43 +560,55 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | nextclade_dataset_tag_input | flu | ha | h5n1 | 
`"2024-05-08--11-39-52Z"` | | | reference_genome | flu | ha | h5n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h5n1_ha.fasta"` | | | nextclade_dataset_name_input | flu | na | h1n1 | `"nextstrain/flu/h1n1pdm/na/MW626056"` | | - | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta"` | | | nextclade_dataset_name_input | flu | na | h3n2 | `"nextstrain/flu/h3n2/na/EPI1857215"` | | - | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta"` | | | nextclade_dataset_name_input | flu | na | victoria | `"nextstrain/flu/vic/na/CY073894"` | | - | nextclade_dataset_tag_input | flu | na | victoria | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta"` | | | nextclade_dataset_name_input | flu | na | yamagata | `"NA"` | | | nextclade_dataset_tag_input | flu | na | yamagata | `"NA"` | | | reference_genome | flu | na | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.fasta"` | | +
+ ??? toggle "RSV-A Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_a | 16000 | | kraken_target_organism | rsv_a | Respiratory syncytial virus | | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | - | nextclade_dataset_tag_input | rsv_a | 2024-08-01--22-31-31Z | + | nextclade_dataset_tag_input | rsv_a | "2024-11-27--02-51-00Z" | | reference_genome | rsv_a | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | | vadr_max_length | rsv_a | 15500 | | vadr_mem | rsv_a | 32 | | vadr_options | rsv_a | -r --mkey rsv --xnocomp | +
+ ??? toggle "RSV-B Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_b | 16000 | | kraken_target_organism | rsv_b | "Human orthopneumovirus" | | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | - | nextclade_dataset_tag_input | rsv_b | "2024-08-01--22-31-31Z" | + | nextclade_dataset_tag_input | rsv_b | "2024-11-27--02-51-00Z" | | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | | vadr_max_length | rsv_b | 15500 | | vadr_mem | rsv_b | 32 | | vadr_options | rsv_b | -r --mkey rsv --xnocomp | +
+ ??? toggle "HIV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | kraken_target_organism_input | HIV | Human immunodeficiency virus 1 | | @@ -595,6 +621,8 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | reference_genome | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.headerchanged.fasta | This version of HIV originates from Southern Africa | | reference_gff_file | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3 | This version of HIV originates from Southern Africa | +
+ ### Workflow Tasks All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT, and ClearLabs workflows. These undertake read trimming and assembly appropriate to the input data type. TheiaCoV workflows subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" as additions or alternatives to tasks run in the workflow by default.