diff --git a/.dockstore.yml b/.dockstore.yml index 5306d30ed..146638eb7 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -195,6 +195,11 @@ workflows: primaryDescriptorPath: /workflows/utilities/data_import/wf_terra_2_bq.wdl testParameterFiles: - /tests/inputs/empty.json + - name: Fetch_SRR_Accession_PHB + subclass: WDL + primaryDescriptorPath: /workflows/utilities/data_import/wf_fetch_srr_accession.wdl + testParameterFiles: + - /tests/inputs/empty.json - name: Concatenate_Column_Content_PHB subclass: WDL primaryDescriptorPath: /workflows/utilities/file_handling/wf_concatenate_column.wdl diff --git a/README.md b/README.md index 84bccba76..138a1868c 100644 --- a/README.md +++ b/README.md @@ -42,17 +42,17 @@ You can expect a careful review of every PR and feedback as needed before mergin ### Authorship -(Ordered by contribution [# of lines changed] as of 2024-08-01) +(Ordered by contribution [# of lines changed] as of 2024-12-04) * **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision * **Inês Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation * **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation * **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation * **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation * **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision -* **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +* **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation * **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, 
Software, Supervision +* **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation * **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision * **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation * **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation @@ -62,7 +62,9 @@ You can expect a careful review of every PR and feedback as needed before mergin We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB repository: +* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) * **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +* **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) * **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) * **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) * **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) diff --git a/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png b/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png new file mode 100644 index 000000000..241b7bb8b Binary files /dev/null and b/docs/assets/figures/TheiaEuk_Illumina_PHB_20241106.png differ diff --git a/docs/assets/new_workflow_template.md b/docs/assets/new_workflow_template.md index 9e7ef6799..41c2b1895 100644 --- a/docs/assets/new_workflow_template.md +++ b/docs/assets/new_workflow_template.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Workflow Type](../../workflows_overview/workflows_type.md/#link-to-workflow-type) | [Applicable Kingdom](../../workflows_overview/workflows_kingdom.md/#link-to-applicable-kingdom) | PHB | | | +| [Link to Workflow Type](../../workflows_overview/workflows_type.md/#link-to-workflow-type) | [Link to Applicable 
Kingdom](../../workflows_overview/workflows_kingdom.md/#link-to-applicable-kingdom) | PHB | | | ## Workflow_Name_On_Terra @@ -12,6 +12,8 @@ Description of the workflow. ### Inputs +Input should be ordered as they appear on Terra + | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| | task_name | **variable_name** | Type | Description | Default Value | Required/Optional | @@ -24,12 +26,12 @@ Description of the workflow tasks Description of the task !!! techdetails "Tool Name Technical Details" - | | Links | - | --- | --- | + | | Links | + | --- | --- | | Task | [link to task on GitHub] | | Software Source Code | [link to tool's source code] | | Software Documentation | [link to tool's documentation] | - | Original Publication | [link to tool's publication] | + | Original Publication(s) | [link to tool's publication] | ### Outputs diff --git a/docs/contributing/code_contribution.md b/docs/contributing/code_contribution.md index cb7ba5727..d5819b9d3 100644 --- a/docs/contributing/code_contribution.md +++ b/docs/contributing/code_contribution.md @@ -8,8 +8,10 @@ Style guide inspired by Scott Frazer’s [WDL Best Practices Style Guide](http ## General Guidelines -- Put tasks and workflows in separate files in the appropriate folders. -- Always add a description as metadata +***Modularity and Metadata*** + +- **Best Practice:** Place tasks and workflows in separate files to maintain modularity and clarity. +- **Add a `meta` block** to every task and workflow to provide a brief description of its purpose. ```bash meta { @@ -17,163 +19,262 @@ Style guide inspired by Scott Frazer’s [WDL Best Practices Style Guide](http } ``` -- Ensure that the docker container is locked to a given version, not `latest` +***Docker Containers*** + +- Use a specific Docker container version instead of 'latest' to ensure reproducibility and prevent unexpected changes in container behavior. 
```bash - String docker = "quay.io/docker_image:version" + String docker = "us-docker.pkg.dev/docker_image:version" ``` - Preferentially use containers [`Google's Artifact Registry`](https://console.cloud.google.com/artifacts/docker/general-theiagen/us) rather than those from [`quay.io`](http://quay.io) or [`dockerhub`](https://hub.docker.com/) -- Use 2-space indents (no tabs) + +***Indentation and Whitespace*** + +- Use 2-space indentation for all blocks. Avoid using tabs to ensure uniform formatting across editors: ```bash # perform action - if [ this ]; then - action1(variable) + if [ condition ]; then + perform_action(variable) fi ``` -- Do not use line break for opening braces +- Use a single space when defining variables (`this = that` *not* `this= that` (unless a bash variable where `this=that` is required)) + +***Bracket and Spacing Conventions*** + +- Avoid line breaks for opening braces. Keep them on the same line as the declaration. i.e `input {` instead of `input\n{` + + ```bash + # Correct + input { + String input_variable + } + + # Incorrect + input + { + String input_variable + } + ``` + - Use single space when defining input/output variables & runtime attributes (`output {` instead of `output{`) -- Use single-line breaks between non-intended constructs -- Enclose task commands with triple angle brackets (`<<< ... >>>`) -- Consistently use white space with variables (`this = that` *not* `this= that` (unless a bash variable where `this=that` is required)) +- Separate non-indented constructs (like input and output sections) with a single-line break for readability. + +***Command Block Syntax*** + +- Enclose command blocks in triple angle brackets (<<< ... >>>) for consistency and easier handling of multi-line scripts. It also avoids issues with unescaped special characters in the command block: + + ```bash + command <<< + tool --input ~{input} --output ~{output} + >>> + ``` ## Task Blocks -The task should contain the following sections. 
Include _single_ spaces between input, command, output, and runtime closing and opening curly brackets. +A WDL task block defines a discrete, reusable step in a workflow. To ensure readability and consistency, follow these conventions when writing task blocks. Include single spaces between the input, command, output, and runtime sections and their enclosing curly brackets. ```bash -input { +task example_task { + input { -} -command <<< + } + command <<< + + >>> + output { ->>> -output { - -} -runtime { + } + runtime { + } } ``` -??? toggle "`input` block" - - The following conventions are used to expose docker, CPU, memory, and disk size +### The `input` block - ```bash - input { - String docker = "..." - Int cpu = x - Int memory = y - Int disk_size = z - } - ``` - - - If additional arguments should be allowed to be passed to the task, this input should follow the convention below: - - ```bash - input { - String args = "" - } - ``` - - - Input and output lists should not be formatted to have the equal sign aligned, but instead use a single space before and after the `=` - - ```bash - output1_x = string1 - output2_that_does_y = string2 - ``` - - - Ensure the docker container is exposed as an input and as an output string - - ```bash - input { - String docker = "" - } - ... - output { - String XX_docker = docker - } - runtime { - docker: docker - } - ``` +- The following conventions are used to expose docker, CPU, memory, and disk size: -??? toggle "`command` block" - - Ensure use of line breaks between different sections of code to improve readability - - ```bash - # if this, perform action 1 - if [ this ]; then - action1(variable) - fi - - # if that, perform action 2 - if [ that ]; then - action2(variable) - fi - ``` - - - Split command calls into multiple lines if they have user input variables and/or if the length of the command is very long to avoid text wrapping and/or side-scrolling, e.g. 
- - Use indentation as appropriate - - ```bash - tool \ - --option1 ~{option1} \ - --option2 ~{option2} \ - ... - --option999 ~{option999} - ``` - - - Add comments that - - Explain what the optional parameters are - - Provide links to the tool documentation so future readers of the code know where to find that information - - Explain what non-intuitive bash/python text wrangling actions do, e.g. - - ```bash - ## awk for gene column ($6) to grab subtype ($15) - cat ~{file} | awk -F '\t' '{if ($6=="M1") print $15}' > FLU_TYPE - ``` - -??? toggle "`output` block" - - File types should be clearly stated in the output name variables + ```bash + input { + Int cpu = 4 # Number of CPUs + Int disk_size = 100 # Disk space in GB + String docker = "us-docker.pkg.dev/example:1.0.0" # Docker container for the task + Int memory = 16 # Memory in GB + } + ``` + +- Include optional tool parameters as inputs to the task + + ```bash + input { + Int? optional_tool_parameter1 + String optional_tool_parameter2_with_default = "default_value" + } + ``` + +- Input and output lists should **not** be formatted to have the equal sign aligned, but instead **use a single space** before and after the `=` + + ```bash + correct_output = "output_file" + long_variable_name = "long_file_name" + ``` + +- Expose Docker as an input, an output (if versioning information not available), and runtime variable: + + ```bash + input { + String docker = "us-docker.pkg.dev/example:1.0.0" + } + ... 
+ output { + String used_docker = docker + } + runtime { + docker: docker + } + ``` + +### The `command` block + +- Ensure use of line breaks between different sections of code to improve readability + + ```bash + # Perform task step 1 + if [ condition ]; then + action1(variable) + fi + + # Perform task step 2 + if [ another_condition ]; then + action2(variable) + fi + ``` + +- Split command calls into multiple lines if they have user input variables and/or if the length of the command is very long to avoid text wrapping and/or side-scrolling, e.g. + - Use backslashes for continuation and indentation to clarify structure: + + ```bash + tool \ + --input ~{input_file} \ + --output ~{output_file} \ + --option1 ~{option1} \ + ... + --optionN ~{optionN} + ``` + +- Add comments that + - Explain what the optional parameters are + - Provide links to the tool documentation so future readers of the code know where to find that information + - Explain what non-intuitive bash/python text wrangling actions do, e.g. ```bash - output1_csv = file1.csv - output2_tsv = file2.tsv - ``` - - - Ensure the docker container is exposed as an output string, e.g. - - ```bash - input { - String docker - } - ... - output { - String XX_docker = docker - } - runtime { - docker: docker - } + ## awk for gene column ($6) to grab subtype ($15) + cat ~{file} | awk -F '\t' '{if ($6=="M1") print $15}' > FLU_TYPE ``` -??? toggle "`runtime` block" - - Always use a docker container +### The `output` block + +- The output block specifies the files or variables produced by the task. Follow these conventions: + + ```bash + output { + File result_csv = "output.csv" # CSV file generated + File result_log = "log.txt" # Log file + } + ``` + +- Ensure the docker container is exposed as an output string, e.g. + + ```bash + input { + String docker = "us-docker.pkg.dev/general-theiagen/tool:version" + } + ... 
+ output { + String XX_docker = docker + } + runtime { + docker: docker + } + ``` + +### The `runtime` block + +- The runtime block defines the compute resources and environment for the task. +- Always specify a Docker: + + ```bash + runtime { + docker: docker + cpu: cpu + memory: memory + disk: disk_size + } + ``` ## Workflow Blocks -The workflow/sub-workflow file should contain: +A WDL workflow block orchestrates the execution of tasks and subworkflows. It defines the inputs, calls tasks or subworkflows, and specifies the final outputs. To ensure readability and consistency, follow these conventions when writing workflow blocks: + +### The `import` section + +- Include a block of `import` statements (sorted in alphabetical order). + - When a workflow imports a task, ensure it is imported under a unique name to avoid conflicts. -- a block of `import` statements (alphabetical order), - - When a workflow imports a task, make sure that it is imported under a different name than the task it is calling -- a `workflow` block with - - an `input` section - - `call` sections for specified tasks - - an `output` section + ```bash + import "../tasks/task_task1.wdl" as task1_task + import "../tasks/task_task2.wdl" as task2_task + ``` -Example formatting is shown below. +- Order import statements alphabetically by the path of the imported file. + +### The `input` block + +- Optional inputs that should be able to be edited by the user, such as docker containers should be exposed on the workflow level as in the example +- In the case of subworkflows, all optional inputs should be exposed on the workflow level so that they can be modified by users on Terra + +```bash +input { + String input + String task1_docker = "us-docker.pkg.dev/general-theiagen/tool:version" + String? 
task1_optional_argument +} +``` + +### The `call` sections + +- Import task files as something other than the included task nam in order to avoid namespace conflicts + +```bash +call task1_task.task1 { + input: + input = input, + docker = task1_docker +} +``` + +### The `output` block + +- Define all workflow outputs in this section. +- Use descriptive names for each output variable. +- Order outputs alphabetically by the name of the output variable + +```bash +output { + # Task 1 outputs + File task1_out_csv = task1.output_csv + String task1_version = task1.version + + # Subworkflow outputs + File subworkflow_out_tsv = subworkflow.task3_out_tsv + String subworkflow_version = subworkflow.task3_version +} +``` + +## Example Workflow formats ??? toggle "wf_example_wf.wdl" @@ -190,7 +291,6 @@ Example formatting is shown below. String task2_docker = "us-docker.pkg.dev/general-theiagen//task_2:version" String? hidden_task3_argument String? hidden_task3_docker - String? hidden_task4_argument String? hidden_task4_docker } call task1_task.task1 { @@ -205,7 +305,10 @@ Example formatting is shown below. } call subworkflow.subworkflow { input: - input = input + input = input, + task3_argument = hidden_task3_argument, + task3_docker = hidden_task3_docker + task4_docker = hidden_task4_docker } output { # Task 1 outputs @@ -216,16 +319,19 @@ Example formatting is shown below. File task2_out_tsv = task2.output_tsv String task2_version = task2.version String task2_docker = task2.docker - # Subworkflow outputs + # Subworkflow outputs for task 3 File task3_out_tsv = subworkflow.task3_out_tsv String task3_version = subworkflow.task3_version String task3_docker = subworkflow.task3_docker + # Subworkflow outputs for task 4 + String task4_output = subworkflow.task4_output + String task4_version = subworkflow.task4_version } } ``` - ??? 
toggle "wf_subworkflow.wdl" + ```bash import "../tasks/task_task3.wdl" as task3_task import "../tasks/task_task4.wdl" as task4_task @@ -239,6 +345,7 @@ Example formatting is shown below. # level so they can be modified by a Terra user String? task3_argument String? task3_docker + String? task4_docker } call task3_task.task3 { input: @@ -246,38 +353,17 @@ Example formatting is shown below. args = task3_argument, docker = task3_docker } + call task4_task.task4 { + input: + input = task3.output_tsv, + docker = task4_docker + } output { File task3_out_tsv = task3.output_tsv String task3_version = task3.version String task3_docker = task3.docker + String task4_output = task4.output + String task4_version = task4.version } } ``` - ---- - -??? toggle "`input` section" - - Optional inputs that should be able to be edited by the user, such as docker containers should be exposed on the workflow level as in the example - - In the case of subworkflows, all optional inputs should be exposed on the workflow level so that they can be modified by users on Terra - -??? toggle "`call` task sections" - - There should be no blank lines between tasks in workflows - - ```bash - task A { - } - task B { - } - ``` - - - Label a group of outputs by the source/species for organizational purposes when a workflow has many different outputs - - ```ebnf - output { - ... - # task99 outputs - String task99_ouput - String task99_file - ... 
- } - ``` diff --git a/docs/contributing/doc_contribution.md b/docs/contributing/doc_contribution.md index 7f20e5491..4ddb7e0de 100644 --- a/docs/contributing/doc_contribution.md +++ b/docs/contributing/doc_contribution.md @@ -14,7 +14,7 @@ To test your documentation changes, you will need to have the following packages pip install mkdocs-material mkdocs-material-extensions mkdocs-git-revision-date-localized-plugin mike mkdocs-glightbox ``` -The live preview server can be activated by running the following command: +Once installed, navigate to the top directory in PHB. The live preview server can be activated by running the following command: ```bash mkdocs serve @@ -34,49 +34,7 @@ Here are some VSCode Extensions can help you write and edit your markdown files - [Excel to Markdown Table](https://tableconvert.com/excel-to-markdown) - This website will convert an Excel table into markdown format, which can be copied and pasted into your markdown file. - [Material for MkDocs Reference](https://squidfunk.github.io/mkdocs-material/reference/) - This is the official reference for the Material for MkDocs theme, which will help you understand how to use the theme's features. -- [Broken Link Check](https://www.brokenlinkcheck.com/) - This website will scan your website to ensure that all links are working correctly. This will only work on the deployed version of the documentation, not the local version. - -## Documentation Structure - -A brief description of the documentation structure is as follows: - -- `docs/` - Contains the Markdown files for the documentation. - - `assets/` - Contains images and other files used in the documentation. - - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. - - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. 
For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. - - `logos/` - Contains Theiagen logos and symbols used int he documentation. - - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. - - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. - - `contributing/` - Contains the Markdown files for our contribution guides, such as this file - - `javascripts/` - Contains JavaScript files used in the documentation. - - `tablesort.js` - A JavaScript file used to enable table sorting in the documentation. - - `overrides/` - Contains HTMLs used to override theme defaults - - `main.html` - Contains the HTML used to display a warning when the latest version is not selected - - `stylesheets/` - Contains CSS files used in the documentation. - - `extra.css` - A custom CSS file used to style the documentation; contains all custom theme elements (scrollable tables, resizable columns, Theiagen colors), and custom admonitions. - - `workflows/` - Contains the Markdown files for each workflow, organized into subdirectories by workflow category - - `workflows_overview/` - Contains the Markdown files for the overview tables for each display type: alphabetically, by applicable kingdom, and by workflow type. - - `index.md` - The home/landing page for our documentation. - -### Adding a Page for a New Workflow {#new-page} - -If you are adding a new workflow, there are a number of things to do in order to include the page in the documentation: - -1. Add a page with the title of the workflow to appropriate subdirectory in `docs/workflows/`. Feel free to use the template found in the `assets/` folder. -2. 
Collect the following information for your new workflow: - - Workflow Name - Link the name with a relative path to the workflow page in appropriate `docs/workflows/` subdirectory - - Workflow Description - Brief description of the workflow - - Applicable Kingdom - Options: "Any taxa", "Bacteria", "Mycotics", "Viral" - - Workflow Level (_on Terra_) - Options: "Sample-level", "Set-level", or neither - - Command-line compatibility - Options: "Yes", "No", and/or "Some optional features incompatible" - - The version where the last known changes occurred (likely the upcoming version if it is a new workflow) - - Link to the workflow on Dockstore (if applicable) - Workflow name linked to the information tab on Dockstore. -3. Format this information in a table. -4. Copy the previously gathered information to ==**ALL THREE**== overview tables in `docs/workflows_overview/`: - - `workflows_alphabetically.md` - Add the workflow in the appropriate spot based on the workflow name. - - `workflows_kingdom.md` - Add the workflow in the appropriate spot(s) based on the kingdom(s) the workflow is applicable to. Make sure it is added alphabetically within the appropriate subsection(s). - - `workflows_type.md` - Add the workflow in the appropriate spot based on the workflow type. Make sure it is added alphabetically within the appropriate subsection. -5. Copy the path to the workflow to ==**ALL**== of the appropriate locations in the `mkdocs.yml` file (under the `nav:` section) in the main directory of this repository. These should be the exact same spots as in the overview tables but without additional information. This ensures the workflow can be accessed from the navigation sidebar. +- [Dead Link Check](https://www.deadlinkchecker.com/) - This website will scan your website to ensure that all links are working correctly. This will only work on the deployed version of the documentation, not the local version. 
## Standard Language & Formatting Conventions @@ -98,10 +56,11 @@ The following language conventions should be followed when writing documentation - **Bold Text** - Use `**bold text**` to indicate text that should be bolded. - _Italicized Text_ - Use `_italicized text_` to indicate text that should be italicized. - ==Highlighted Text== - Use `==highlighted text==` to indicate text that should be highlighted. -- `Code` - Use \`code\` to indicate text that should be formatted as code. +- `Code` - Use ````code` ``` (backticks) to indicate text that should be formatted as code. - ^^Underlined Text^^ - Use `^^underlined text^^` to indicate text that should be underlined (works with our theme; not all Markdown renderers support this). - > Citations - Use a `>` to activate quote formatting for a citation. Make sure to separate multiple citations with a comment line (``) to prevent the citations from running together. + - Use a reputable citation style (e.g., Vancouver, Nature, etc.) for all citations. - Callouts/Admonitions - These features are called "call-outs" in Notion, but are "Admonitions" in MkDocs. [I highly recommend referring to the Material for MkDocs documentation page on Admonitions to learn how best to use this feature](https://squidfunk.github.io/mkdocs-material/reference/admonitions/). Use the following syntax to create a callout: ```markdown @@ -116,18 +75,37 @@ The following language conventions should be followed when writing documentation !!! dna This is a DNA admonition. Admire the cute green DNA emoji. You can create this with the `!!! dna` syntax. + Use this admonition when wanting to convey general information or highlight specific facts. + ???+ toggle This is a toggle-able section. The emoji is an arrow pointing to the right downward. You can create this with the `??? toggle` syntax. I have added a `+` at the end of the question marks to make it open by default. 
+ Use this admonition when wanting to provide additional _optional_ information or details that are not strictly necessary, or take up a lot of space. + ???+ task This is a toggle-able section **for a workflow task**. The emoji is a gear. Use the `??? task` syntax to create this admonition. Use `!!! task` if you want to have it be permanently expanded. I have add a `+` at the end of the question marks to make this admonition open by default and still enable its collapse. + Use this admonition when providing details on a workflow, task, or tool. + !!! caption - This is a caption. The emoji is a painting. You can create this with the `!!! caption` syntax. This is used to enclose an image in a box and looks nice. A caption can be added beneath the picture and will also look nice. + This is a caption. The emoji is a painting. You can create this with the `!!! caption` syntax. A caption can be added beneath the picture and will also look nice. + + Use this admonition when including images or diagrams in the documentation. !!! techdetails This is where you will put technical details for a workflow task. You can create this by `!!! techdetails` syntax. + Use this admonition when providing technical details for a workflow task or tool. These admonitions should include the following table: + + | | Links | + | --- | --- | + | Task | [link to the task file in the PHB repository on GitHub] | + | Software Source Code | [link to tool's source code] | + | Software Documentation | [link to tool's documentation] | + | Original Publication(s) | [link to tool's publication] | + + If any of these items are unfillable, delete the row. + - Images - Use the following syntax to insert an image: ```markdown @@ -135,7 +113,7 @@ The following language conventions should be followed when writing documentation ![Alt Text](/path/to/image.png) ``` -- Indentation - **_FOUR_** spaces are required instead of the typical two. This is a side effect of using this theme. 
If you use two spaces, the list and/or indentations will not render correctly. This will make your linter sad :( +- Indentation - **_FOUR_** spaces are required instead of the typical two. This is a side effect of using this theme. If you use two spaces, the list and/or indentations will not render correctly. This will make your linter sad :( ```markdown - first item @@ -160,3 +138,45 @@ The following language conventions should be followed when writing documentation ``` - End all pages with an empty line + +## Documentation Structure + +A brief description of the documentation structure is as follows: + +- `docs/` - Contains the Markdown files for the documentation. + - `assets/` - Contains images and other files used in the documentation. + - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. + - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. + - `logos/` - Contains Theiagen logos and symbols used in the documentation. + - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. + - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. You can see this template [here](../assets/new_workflow_template.md) + - `contributing/` - Contains the Markdown files for our contribution guides, such as this file + - `javascripts/` - Contains JavaScript files used in the documentation. + - `tablesort.js` - A JavaScript file used to enable table sorting in the documentation. 
+ - `overrides/` - Contains HTMLs used to override theme defaults + - `main.html` - Contains the HTML used to display a warning when the latest version is not selected + - `stylesheets/` - Contains CSS files used in the documentation. + - `extra.css` - A custom CSS file used to style the documentation; contains all custom theme elements (scrollable tables, resizable columns, Theiagen colors), and custom admonitions. + - `workflows/` - Contains the Markdown files for each workflow, organized into subdirectories by workflow category + - `workflows_overview/` - Contains the Markdown files for the overview tables for each display type: alphabetically, by applicable kingdom, and by workflow type. + - `index.md` - The home/landing page for our documentation. + +### Adding a Page for a New Workflow {#new-page} + +If you are adding a new workflow, there are a number of things to do in order to include the page in the documentation: + +1. Add a page with the title of the workflow to appropriate subdirectory in `docs/workflows/`. Feel free to use the template found in the `assets/` folder. +2. Collect the following information for your new workflow: + - Workflow Name - Link the name with a relative path to the workflow page in appropriate `docs/workflows/` subdirectory + - Workflow Description - Brief description of the workflow + - Applicable Kingdom - Options: "Any taxa", "Bacteria", "Mycotics", "Viral" + - Workflow Level (_on Terra_) - Options: "Sample-level", "Set-level", or neither + - Command-line compatibility - Options: "Yes", "No", and/or "Some optional features incompatible" + - The version where the last known changes occurred (likely the upcoming version if it is a new workflow) + - Link to the workflow on Dockstore (if applicable) - Workflow name linked to the information tab on Dockstore. +3. Format this information in a table. +4. 
Copy the previously gathered information to ==**ALL THREE**== overview tables in `docs/workflows_overview/`: + - `workflows_alphabetically.md` - Add the workflow in the appropriate spot based on the workflow name. + - `workflows_kingdom.md` - Add the workflow in the appropriate spot(s) based on the kingdom(s) the workflow is applicable to. Make sure it is added alphabetically within the appropriate subsection(s). + - `workflows_type.md` - Add the workflow in the appropriate spot based on the workflow type. Make sure it is added alphabetically within the appropriate subsection. +5. Copy the path to the workflow to ==**ALL**== of the appropriate locations in the `mkdocs.yml` file (under the `nav:` section) in the main directory of this repository. These should be the exact same spots as in the overview tables but without additional information. This ensures the workflow can be accessed from the navigation sidebar. diff --git a/docs/index.md b/docs/index.md index 058b2149d..ad825cfa3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,17 +60,17 @@ You can expect a careful review of every PR and feedback as needed before mergin ### Authorship -(Ordered by contribution [# of lines changed] as of 2024-08-01) +(Ordered by contribution [# of lines changed] as of 2024-12-04) - **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision - **Inês Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation - **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation - **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation - **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation - **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, 
Project Administration, Software, Validation, Supervision -- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +- **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation - **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision +- **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation - **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision - **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation - **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation @@ -80,7 +80,9 @@ You can expect a careful review of every PR and feedback as needed before mergin We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB repository: +- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) - **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) - **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 54a833dfd..0df0d3be2 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -6,8 +6,3 @@ Click here to go to the latest version release. {% endblock %} - - -{% block announce %} -
🏗️ I'm under construction! Pardon the dust while we remodel! 👷
-{% endblock %} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index e510ecedc..72b16bc01 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -200,7 +200,6 @@ div.searchable-table input.table-search-input { color: #000; border: 1px solid #E0E1E1; } - [data-md-color-scheme="light"] div.searchable-table input.table-search-input::placeholder { color: #888; font-style: italic; @@ -212,7 +211,6 @@ div.searchable-table input.table-search-input { color: #fff; border: 1px solid #373B40; } - [data-md-color-scheme="slate"] div.searchable-table input.table-search-input::placeholder { color: #bbb; font-style: italic; diff --git a/docs/workflows/genomic_characterization/freyja.md b/docs/workflows/genomic_characterization/freyja.md index b174a2712..c5b15c54d 100644 --- a/docs/workflows/genomic_characterization/freyja.md +++ b/docs/workflows/genomic_characterization/freyja.md @@ -328,12 +328,16 @@ The main output file used in subsequent Freyja workflows is found under the `fre | bwa_version | String | Version of BWA used to map read data to the reference genome | PE, SE | | fastp_html_report | File | The HTML report made with fastp | PE, SE | | fastp_version | String | Version of fastp software used | PE, SE | +| fastq_scan_clean1_json | File | JSON file output from `fastq-scan` containing summary stats about clean forward read quality and length | PE, SE | +| fastq_scan_clean2_json | File | JSON file output from `fastq-scan` containing summary stats about clean reverse read quality and length | PE | | fastq_scan_num_reads_clean_pairs | String | Number of clean read pairs | PE | | fastq_scan_num_reads_clean1 | Int | Number of clean forward reads | PE, SE | | fastq_scan_num_reads_clean2 | Int | Number of clean reverse reads | PE | | fastq_scan_num_reads_raw_pairs | String | Number of raw read pairs | PE | | fastq_scan_num_reads_raw1 | Int | Number of raw forward reads | PE, SE | | fastq_scan_num_reads_raw2 | Int | Number of raw reverse 
reads | PE | +| fastq_scan_raw1_json | File | JSON file output from `fastq-scan` containing summary stats about raw forward read quality and length | PE, SE | +| fastq_scan_raw2_json | File | JSON file output from `fastq-scan` containing summary stats about raw reverse read quality and length | PE | | fastq_scan_version | String | Version of fastq_scan used for read QC analysis | PE, SE | | fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | | fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | diff --git a/docs/workflows/genomic_characterization/pangolin_update.md b/docs/workflows/genomic_characterization/pangolin_update.md index 988db4404..a05756888 100644 --- a/docs/workflows/genomic_characterization/pangolin_update.md +++ b/docs/workflows/genomic_characterization/pangolin_update.md @@ -65,4 +65,8 @@ This workflow runs on the sample level. | **pangolin_updates** | String | Result of Pangolin Update (lineage changed versus unchanged) with lineage assignment and date of analysis | | **pangolin_versions** | String | All Pangolin software and database versions | - \ No newline at end of file + + +## References + +> **Pangolin**: Rambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. 
diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 468894ab1..812ccc53d 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -481,35 +481,45 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo The following tables include the relevant organism-specific parameters; **all of these default values can be overwritten by providing a value for the "Overwrite Variable Name" field**. ??? toggle "SARS-CoV-2 Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed"` | | genome_length_input | sars-cov-2 | `29903` | | kraken_target_organism_input | sars-cov-2 | `"Severe acute respiratory syndrome coronavirus 2"` | | nextclade_dataset_name_input | sars-cov-2 | `"nextstrain/sars-cov-2/wuhan-hu-1/orfs"` | - | nextclade_dataset_tag_input | sars-cov-2 | `"2024-07-17--12-57-03Z"` | - | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 "`| + | nextclade_dataset_tag_input | sars-cov-2 | `"2024-11-19--14-18-53Z"` | + | pangolin_docker_image | sars-cov-2 | `"us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31 "`| | reference_genome | sars-cov-2 | `"gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta"` | | vadr_max_length | sars-cov-2 | `30000` | | vadr_mem | sars-cov-2 | `8` | | vadr_options | sars-cov-2 | `"--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta"` | +
+ ??? toggle "Mpox Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | gene_locations_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed"` | | genome_length_input | MPXV | `197200` | | kraken_target_organism_input | MPXV | `"Monkeypox virus"` | | nextclade_dataset_name_input | MPXV | `"nextstrain/mpox/lineage-b.1"` | - | nextclade_dataset_tag_input | MPXV | `"2024-04-19--07-50-39Z"` | + | nextclade_dataset_tag_input | MPXV | `"2024-11-19--14-18-53Z"` | | primer_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed"` | | reference_genome | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta"` | | reference_gff_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3"` | | vadr_max_length | MPXV | `210000` | | vadr_mem | MPXV | `8` | | vadr_options | MPXV | `"--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150"` | + +
??? toggle "WNV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | genome_length_input | WNV | `11000` | | @@ -522,7 +532,11 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | WNV | `8` | | | vadr_options | WNV | `"--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta"` | | +
+ ??? toggle "Flu Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Flu Segment** | **Flu Subtype** | **Default Value** | **Notes** | |---|---|---|---|---|---| | flu_segment | flu | all | all | N/A | TheiaCoV will attempt to automatically assign a flu segment | @@ -532,13 +546,13 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | vadr_mem | flu | all | all | `8` | | | vadr_options | flu | all | all | `"--atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu"` | | | nextclade_dataset_name_input | flu | ha | h1n1 | `"nextstrain/flu/h1n1pdm/ha/MW626062"` | | - | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | h1n1 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | h3n2 | `"nextstrain/flu/h3n2/ha/EPI1857216"` | | - | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-08-08--05-08-21Z"` | | + | nextclade_dataset_tag_input | flu | ha | h3n2 | `"2024-11-27--02-51-00Z"` | | | reference_genome | flu | ha | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | victoria | `"nextstrain/flu/vic/ha/KX058884"` | | - | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | ha | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | ha | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta"` | | | nextclade_dataset_name_input | flu | ha | yamagata | `"nextstrain/flu/yam/ha/JN993010"` | | | nextclade_dataset_tag_input | flu | ha | yamagata | `"2024-01-30--16-34-55Z"` | | @@ -547,43 +561,55 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | nextclade_dataset_tag_input | flu | ha | h5n1 | 
`"2024-05-08--11-39-52Z"` | | | reference_genome | flu | ha | h5n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h5n1_ha.fasta"` | | | nextclade_dataset_name_input | flu | na | h1n1 | `"nextstrain/flu/h1n1pdm/na/MW626056"` | | - | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-07-03--08-29-55Z"` | | + | nextclade_dataset_tag_input | flu | na | h1n1 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h1n1 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta"` | | | nextclade_dataset_name_input | flu | na | h3n2 | `"nextstrain/flu/h3n2/na/EPI1857215"` | | - | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | h3n2 | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | h3n2 | `"gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta"` | | | nextclade_dataset_name_input | flu | na | victoria | `"nextstrain/flu/vic/na/CY073894"` | | - | nextclade_dataset_tag_input | flu | na | victoria | `"2024-04-19--07-50-39Z"` | | + | nextclade_dataset_tag_input | flu | na | victoria | `"2024-11-05--09-19-52Z"` | | | reference_genome | flu | na | victoria | `"gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta"` | | | nextclade_dataset_name_input | flu | na | yamagata | `"NA"` | | | nextclade_dataset_tag_input | flu | na | yamagata | `"NA"` | | | reference_genome | flu | na | yamagata | `"gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.fasta"` | | +
+ ??? toggle "RSV-A Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_a | 16000 | | kraken_target_organism | rsv_a | "Human respiratory syncytial virus A" | | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | - | nextclade_dataset_tag_input | rsv_a | 2024-08-01--22-31-31Z | + | nextclade_dataset_tag_input | rsv_a | "2024-11-27--02-51-00Z" | | reference_genome | rsv_a | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | | vadr_max_length | rsv_a | 15500 | | vadr_mem | rsv_a | 32 | | vadr_options | rsv_a | -r --mkey rsv --xnocomp | +
+ ??? toggle "RSV-B Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_b | 16000 | | kraken_target_organism | rsv_b | "human respiratory syncytial virus" | | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | - | nextclade_dataset_tag_input | rsv_b | "2024-08-01--22-31-31Z" | + | nextclade_dataset_tag_input | rsv_b | "2024-11-27--02-51-00Z" | | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | | vadr_max_length | rsv_b | 15500 | | vadr_mem | rsv_b | 32 | | vadr_options | rsv_b | -r --mkey rsv --xnocomp | +
+ ??? toggle "HIV Defaults" +
+ | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | kraken_target_organism_input | HIV | Human immunodeficiency virus 1 | | @@ -596,6 +622,8 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | reference_genome | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.headerchanged.fasta | This version of HIV originates from Southern Africa | | reference_gff_file | HIV-v2 | gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3 | This version of HIV originates from Southern Africa | +
+ ### Workflow Tasks All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT, and ClearLabs workflows. These undertake read trimming and assembly appropriate to the input data type. TheiaCoV workflows subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" as additions or alternatives to tasks run in the workflow by default. @@ -631,8 +659,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | Variable | Rationale | | --- | --- | - | `skip_screen` | Prevent the read screen from running | - | `skip_screen` | Saving waste of compute resources on insufficient data | + | `skip_screen` | Set to true to skip the read screen from running | | `min_reads` | Minimum number of base pairs for 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus divided by 300 (longest Illumina read length) | | `min_basepairs` | Greater than 10x coverage of the Hepatitis delta (of the *Deltavirus* genus) virus | | `min_genome_size` | Based on the Hepatitis delta (of the *Deltavirus* genus) genome- the smallest viral genome as of 2024-04-11 (1,700 bp) | @@ -715,7 +742,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilties/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -735,7 +762,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | | Links | | --- | --- | - | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ncbi_scrub.wdl) | | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | | Software Documentation | | @@ -756,7 +783,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 
2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | @@ -901,6 +928,8 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + | Original Publication(s) | [A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology](https://doi.org/10.1038/s41564-020-0770-5) | + ??? task "`nextclade`" @@ -1027,6 +1056,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | est_percent_gene_coverage_tsv | File | Percent coverage for each gene in the organism being analyzed (depending on the organism input) | CL, ONT, PE, SE | | fastp_html_report | File | HTML report for fastp | PE, SE | | fastp_version | String | Fastp version used | PE, SE | +| fastq_scan_clean1_json | File | JSON file output from `fastq-scan` containing summary stats about clean forward read quality and length | PE, SE, CL | +| fastq_scan_clean2_json | File | JSON file output from `fastq-scan` containing summary stats about clean reverse read quality and length | PE | | fastq_scan_num_reads_clean_pairs | String | Number of paired reads after filtering as determined by fastq_scan | PE | | fastq_scan_num_reads_clean1 | Int | Number of forward reads after filtering as determined by fastq_scan | CL, PE, SE | | fastq_scan_num_reads_clean2 | Int | Number of reverse reads after filtering as determined by fastq_scan | PE | @@ -1037,6 +1068,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | fastq_scan_r1_mean_q_raw | Float | Forward read mean quality value before quality trimming and adapter removal | | | fastq_scan_r1_mean_readlength_clean | Float | Forward read mean read length value after 
quality trimming and adapter removal | | | fastq_scan_r1_mean_readlength_raw | Float | Forward read mean read length value before quality trimming and adapter removal | | +| fastq_scan_raw1_json | File | JSON file output from `fastq-scan` containing summary stats about raw forward read quality and length | PE, SE, CL | +| fastq_scan_raw2_json | File | JSON file output from `fastq-scan` containing summary stats about raw reverse read quality and length | PE | | fastq_scan_version | String | Version of fastq_scan used for read QC analysis | CL, PE, SE | | fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | | fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | @@ -1139,10 +1172,10 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | nextclade_json_flu_ha | File | Nextclade output in JSON file format, specific to Flu HA segment | ONT, PE | | nextclade_json_flu_na | File | Nextclade output in JSON file format, specific to Flu NA segment | ONT, PE | | nextclade_lineage | String | Nextclade lineage designation | CL, FASTA, ONT, PE, SE | -| nextclade_qc | String | QC metric as determined by Nextclade. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_qc | String | QC metric as determined by Nextclade. Will be blank for Flu | CL, FASTA, ONT, PE, SE | | nextclade_qc_flu_ha | String | QC metric as determined by Nextclade, specific to Flu HA segment | ONT, PE | | nextclade_qc_flu_na | String | QC metric as determined by Nextclade, specific to Flu NA segment | ONT, PE | -| nextclade_tsv | File | Nextclade output in TSV file format. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_tsv | File | Nextclade output in TSV file format. 
Will be blank for Flu | CL, FASTA, ONT, PE, SE | | nextclade_tsv_flu_ha | File | Nextclade output in TSV file format, specific to Flu HA segment | ONT, PE | | nextclade_tsv_flu_na | File | Nextclade output in TSV file format, specific to Flu NA segment | ONT, PE | | nextclade_version | String | The version of Nextclade software used | CL, FASTA, ONT, PE, SE | diff --git a/docs/workflows/genomic_characterization/theiaeuk.md b/docs/workflows/genomic_characterization/theiaeuk.md index 19141cd05..cc9cba9c1 100644 --- a/docs/workflows/genomic_characterization/theiaeuk.md +++ b/docs/workflows/genomic_characterization/theiaeuk.md @@ -8,22 +8,22 @@ ## TheiaEuk Workflows -**The TheiaEuk_PE workflow is for the assembly, quality assessment, and characterization of fungal genomes.** It is designed to accept Illumina paired-end sequencing data as the primary input. **It is currently intended only for haploid fungal genomes like _Candida auris_.** Analyzing diploid genomes using TheiaEuk should be attempted only with expert attention to the resulting genome quality. +**The TheiaEuk_Illumina_PE workflow is for the assembly, quality assessment, and characterization of fungal genomes.** It is designed to accept Illumina paired-end sequencing data as the primary input. **It is currently intended only for ==haploid== fungal genomes like _Candida auris_.** Analyzing diploid genomes using TheiaEuk should be attempted only with expert attention to the resulting genome quality. -All input reads are processed through "core tasks" in each workflow. The core tasks include raw-read quality assessment, read cleaning (quality trimming and adapter removal), de novo assembly, assembly quality assessment, and species taxon identification. For some taxa identified, "taxa-specific sub-workflows" will be automatically activated, undertaking additional taxa-specific characterization steps, including clade-typing and/or antifungal resistance detection. 
+All input reads are processed through "core tasks" in each workflow. The core tasks include raw read quality assessment, read cleaning (quality trimming and adapter removal), de novo assembly, assembly quality assessment, and species taxon identification. For some taxa identified, taxa-specific sub-workflows will be automatically activated, undertaking additional taxa-specific characterization steps, including clade-typing and/or antifungal resistance detection. !!! caption "TheiaEuk Workflow Diagram" - ![TheiaEuk Workflow Diagram](../../assets/figures/TheiaEuk_Illumina_PE.png){width=75%} + ![TheiaEuk Workflow Diagram](../../assets/figures/TheiaEuk_Illumina_PHB_20241106.png){width=75%} ### Inputs !!! info "Input read data" - The TheiaEuk_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) prior to Terra upload to minimize data upload time. + The TheiaEuk_Illumina_PE workflow takes in Illumina paired-end read data. Read file names should end with `.fastq` or `.fq`, with the optional addition of `.gz`. When possible, Theiagen recommends zipping files with [gzip](https://www.gnu.org/software/gzip/) prior to Terra upload to minimize data upload time. By default, the workflow anticipates 2 x 150bp reads (i.e. the input reads were generated using a 300-cycle sequencing kit). Modifications to the optional parameter for `trim_minlen` may be required to accommodate shorter read data, such as the 2 x 75bp reads generated using a 150-cycle sequencing kit. -
+
| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| @@ -148,7 +148,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | read_QC_trim | **workflow_series** | String | Internal component, do not modify | | Do Not Modify, Optional | | shovill_pe | **assembler** | String | Assembler to use (spades, skesa, velvet or megahit), see | "skesa" | Optional | | shovill_pe | **assembler_options** | String | Assembler-specific options that you might choose, see | | Optional | -| shovill_pe | **depth** | Int | User specified depth of coverage for downsampling (see ) | 150 | Optional | +| shovill_pe | **depth** | Int | User specified depth of coverage for downsampling (see and ) | 150 | Optional | | shovill_pe | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | shovill_pe | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/shovill:1.1.0 | Optional | | shovill_pe | **genome_length** | String | Internal component, do not modify | | Do Not Modify, Optional | @@ -177,7 +177,14 @@ All input reads are processed through "core tasks" in each workflow. The core ta
-### Workflow tasks (performed for all taxa) +### Workflow Tasks + +All input reads are processed through "core tasks" in the TheiaEuk workflows. These undertake read trimming and assembly appropriate to the input data type, currently only Illumina paired-end data. TheiaEuk workflow subsequently launch default genome characterization modules for quality assessment, and additional taxa-specific characterization steps. When setting up the workflow, users may choose to use "optional tasks" or alternatives to tasks run in the workflow by default. + +#### Core tasks + +!!! tip "" + These tasks are performed regardless of organism. They perform read trimming and various quality control steps. ??? task "`versioning`: Version capture for TheiaEuk" @@ -189,7 +196,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | --- | --- | | Task | [task_versioning.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/task_versioning.wdl) | -??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation" +??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation (optional, on by default)" The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow: @@ -219,19 +226,22 @@ All input reads are processed through "core tasks" in each workflow. 
The core ta | | Links | | --- | --- | - | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_screen.wdl) | + | Task | [task_screen.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) | -??? task "`rasusa`: Read subsampling" +??? task "`Rasusa`: Read subsampling (optional, on by default)" - The RASUSA task performs subsampling of the raw reads. By default, this task will subsample reads to a depth of 150X using the estimated genome length produced during the preceding raw read screen. The user can prevent the task from being launched by setting the `call_rasusa`variable to false. + The Rasusa task performs subsampling of the raw reads. By default, this task will subsample reads to a depth of 150X using the estimated genome length produced during the preceding raw read screen. The user can prevent the task from being launched by setting the `call_rasusa` variable to false. The user can also provide an estimated genome length for the task to use for subsampling using the `genome_size` variable. In addition, the read depth can be modified using the `subsample_coverage` variable. - !!! techdetails "RASUSA Technical Details" + !!! techdetails "Rasusa Technical Details" - | | TheiaEuk_Illumina_PE_PHB | + | | Links | | --- | --- | | Task | [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) | + | Software Source Code | [Rasusa on GitHub](https://github.com/mbhall88/rasusa) | + | Software Documentation | [Rasusa on GitHub](https://github.com/mbhall88/rasusa) | + | Original Publication(s) | [Rasusa: Randomly subsample sequencing reads to a specified coverage](https://doi.org/10.21105/joss.03941) | ??? 
task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" @@ -297,12 +307,17 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | +#### Assembly tasks + +!!! tip "" + These tasks assemble the reads into a _de novo_ assembly and assess the quality of the assembly. + ??? task "`shovill`: _De novo_ Assembly" De Novo assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. @@ -316,7 +331,8 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | | TheiaEuk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | - | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Source Code | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | ??? task "`QUAST`: Assembly Quality Assessment" @@ -326,7 +342,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | | Software Documentation | https://quast.sourceforge.net/docs/manual.html | | Orginal publication | [QUAST: quality assessment tool for genome assemblies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/) | @@ -340,11 +356,16 @@ All input reads are processed through "core tasks" in each workflow. 
The core ta | | Links | | --- | --- | - | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl) | | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | +#### Organism-agnostic characterization + +!!! tip "" + These tasks are performed regardless of the organism and provide quality control and taxonomic assignment. + ??? task "`GAMBIT`: **Taxon Assignment**" [`GAMBIT`](https://github.com/jlumpe/gambit) determines the taxon of the genome assembly using a k-mer based approach to match the assembly sequence to the closest complete genome in a database, thereby predicting its identity. Sometimes, GAMBIT can confidently designate the organism to the species level. Other times, it is more conservative and assigns it to a higher taxonomic rank. @@ -360,7 +381,33 @@ All input reads are processed through "core tasks" in each workflow. The core ta | Software Documentation | [GAMBIT ReadTheDocs](https://gambit-genomics.readthedocs.io/en/latest/) | | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0277575) | -??? task "**`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)**" +??? 
task "`BUSCO`: Assembly Quality Assessment" + + BUSCO (**B**enchmarking **U**niversal **S**ingle-**C**opy **O**rthologue) attempts to quantify the completeness and contamination of an assembly to generate quality assessment metrics. It uses taxa-specific databases containing genes that are all expected to occur in the given taxa, each in a single copy. BUSCO examines the presence or absence of these genes, whether they are fragmented, and whether they are duplicated (suggestive that additional copies came from contaminants). + + **BUSCO notation** + + Here is an example of BUSCO notation: `C:99.1%[S:98.9%,D:0.2%],F:0.0%,M:0.9%,n:440`. There are several abbreviations used in this output: + + - Complete (C) - genes are considered "complete" when their lengths are within two standard deviations of the BUSCO group mean length. + - Single-copy (S) - genes that are complete and have only one copy. + - Duplicated (D) - genes that are complete and have more than one copy. + - Fragmented (F) - genes that are only partially recovered. + - Missing (M) - genes that were not recovered at all. + - Number of genes examined (n) - the total number of genes in the database that were searched for. + + A high quality assembly will use the appropriate database for the taxa, have high complete (C) and single-copy (S) percentages, and low duplicated (D), fragmented (F) and missing (M) percentages. + + !!! techdetails "BUSCO Technical Details" + + | | Links | + | --- | --- | + | Task | [task_busco.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/advanced_metrics/task_busco.wdl) | + | Software Source Code | [BUSCO on GitLab](https://gitlab.com/ezlab/busco) | + | Software Documentation | https://busco.ezlab.org/ | + | Original publication | [BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs](https://academic.oup.com/bioinformatics/article/31/19/3210/211866) | + +??? 
task "`QC_check`: Check QC Metrics Against User-Defined Thresholds (optional)" The `qc_check` task compares generated QC metrics against user-defined thresholds for each metric. This task will run if the user provides a `qc_check_table` .tsv file. If all QC metrics meet the threshold, the `qc_check` output variable will read `QC_PASS`. Otherwise, the output will read `QC_NA` if the task could not proceed or `QC_ALERT` followed by a string indicating what metric failed. @@ -383,96 +430,167 @@ All input reads are processed through "core tasks" in each workflow. The core ta | | Links | | --- | --- | - | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check.wdl) | + | Task | [task_qc_check_phb.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check_phb.wdl) | -### Organism-specific Characterization +#### Organism-specific characterization -The TheiaEuk workflow automatically activates taxa-specific tasks after identification of relevant taxa using `GAMBIT`. Many of these taxa-specific tasks do not require any additional workflow tasks from the user. +!!! tip "" + The TheiaEuk workflow automatically activates taxa-specific tasks after identification of the relevant taxa using `GAMBIT`. Many of these taxa-specific tasks do not require any additional inputs from the user. ??? toggle "_Candida auris_" + Two tools are deployed when _Candida auris_ is identified. + + ??? task "Cladetyping: clade determination" + GAMBIT is used to determine the clade of the specimen by comparing the sequence to five clade-specific reference files. The output of the clade typing task will be used to specify the reference genome for the antifungal resistance detection tool. + + ??? toggle "Default reference genomes used for clade typing and antimicrobial resistance gene detection of _C. 
auris_" + | Clade | Genome Accession | Assembly Name | Strain | NCBI Submitter | Included mutations in AMR genes (not comprehensive) | + | --- | --- | --- | --- | --- | --- | + | _Candida auris_ Clade I | GCA_002759435.2 | Cand_auris_B8441_V2 | B8441 | Centers for Disease Control and Prevention | | + | _Candida auris_ Clade II | GCA_003013715.2 | ASM301371v2 | B11220 | Centers for Disease Control and Prevention | | + | _Candida auris_ Clade III | GCA_002775015.1 | Cand_auris_B11221_V1 | B11221 | Centers for Disease Control and Prevention | _ERG11_ V125A/F126L | + | _Candida auris_ Clade IV | GCA_003014415.1 | Cand_auris_B11243 | B11243 | Centers for Disease Control and Prevention | _ERG11_ Y132F | + | _Candida auris_ Clade V | GCA_016809505.1 | ASM1680950v1 | IFRC2087 | Centers for Disease Control and Prevention | | + + !!! techdetails "Cladetyping Technical Details" + | | Links | + | --- | --- | + | Task | [task_cauris_cladetyping.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/candida/task_cauris_cladetyper.wdl) | + | Software Source Code | [GAMBIT on GitHub](https://github.com/jlumpe/gambit) | + | Software Documentation | [GAMBIT Overview](https://theiagen.notion.site/GAMBIT-7c1376b861d0486abfbc316480046bdc?pvs=4) + | Original Publication(s) | [GAMBIT (Genomic Approximation Method for Bacterial Identification and Tracking): A methodology to rapidly leverage whole genome sequencing of bacterial isolates for clinical identification](https://doi.org/10.1371/journal.pone.0277575)
[TheiaEuk: a species-agnostic bioinformatics workflow for fungal genomic characterization](https://doi.org/10.3389/fpubh.2023.1198213) | + + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, then these variants are queried for product names associated with resistance. + + The genes in which there are known resistance-conferring mutations for this pathogen are: + + - FKS1 + - ERG11 (lanosterol 14-alpha demethylase) + - FUR1 (uracil phosphoribosyltransferase) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): + + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | B9J08_005340 | ERG6 | + | B9J08_000401 | FLO8 | + | B9J08_005343 | Hypothetical protein (PSK74852) | + | B9J08_003102 | MEC3 | + | B9J08_003737 | ERG3 | + | lanosterol.14-alpha.demethylase | ERG11 | + | uracil.phosphoribosyltransferase | FUR1 | + | FKS1 | FKS1 | + + For example, one sample may have the following output for the `theiaeuk_snippy_variants_hits` column: + + ```plaintext + lanosterol.14-alpha.demethylase: lanosterol 14-alpha demethylase (missense_variant c.428A>G p.Lys143Arg; C:266 T:0),B9J08_000401: hypothetical protein (stop_gained c.424C>T p.Gln142*; A:70 G:0) + ``` + + Based on this, we can tell that ERG11 has a missense variant at position 143 (Lysine to Arginine) and B9J08_000401 (which is FLO8) has a stop-gained variant at position 142 (Glutamine to Stop). + + ??? 
toggle "Known resistance-conferring mutations for _Candida auris_" + Mutations in these genes that are known to confer resistance are shown below + + | **Organism** | **Found in** | **Gene name** | **Gene locus** | **AA mutation** | **Drug** | **Reference** | + | --- | --- | --- | --- | --- | --- | --- | + | **Candida auris** | **Human** | **ERG11** | | **Y132F** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **K143R** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **ERG11** | | **F126T** | **Fluconazole** | [Simultaneous Emergence of Multidrug-Resistant _Candida auris_ on 3 Continents Confirmed by Whole-Genome Sequencing and Epidemiological Analyses](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Micafungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Caspofungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Anidulafungin** | [Activity of CD101, a long-acting echinocandin, against clinical isolates of Candida 
auris](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Micafungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Caspofungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Anidulafungin** | [A multicentre study of antifungal susceptibility patterns among 350 _Candida auris_ isolates (2009–17) in India: role of the ERG11 and FKS1 genes in azole and echinocandin resistance](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | + | **Candida auris** | **Human** | **FUR1** | **CAMJ_004922** | **F211I** | **5-flucytosine** | [Genomic epidemiology of the UK outbreak of the emerging human fungal pathogen Candida auris](https://doi.org/10.1038/s41426-018-0045-x) | + + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | + +??? toggle "_Candida albicans_" + When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. - Two tools are deployed when _Candida auris is_ identified. First, the Cladetyping tool is launched to determine the clade of the specimen by comparing the sequence to five clade-specific reference files. The output of the clade typing task will be used to specify the reference genome for the antifungal resistance detection tool. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, then these variants are queried for product names associated with resistance according to the MARDy database (). - - The genes in which there are known resistance-conferring mutations for this pathogen are: - - - FKS1 - - ERG11 (lanosterol 14-alpha demethylase) - - FUR1 (uracil phosphoribosyltransferase) - - We query `Snippy` results to see if any mutations were identified in those genes. In addition, _C. auris_ automatically checks for the following loci. 
You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name followings: - - | **TheiaEuk Search Term** | **Corresponding Gene Name** | - |---|---| - | B9J08_005340 | ERG6 | - | B9J08_000401 | FLO8 | - | B9J08_005343 | Hypothetical protein (PSK74852) | - | B9J08_003102 | MEC3 | - | B9J08_003737 | ERG3 | - | lanosterol.14-alpha.demethylase | ERG11 | - | uracil.phosphoribosyltransferase | FUR1 | - | FKS1 | FKS1 | - - For example, one sample may have the following output for the `theiaeuk_snippy_variants_hits` column: - - ```plaintext - lanosterol.14-alpha.demethylase: lanosterol 14-alpha demethylase (missense_variant c.428A>G p.Lys143Arg; C:266 T:0),B9J08_000401: hypothetical protein (stop_gained c.424C>T p.Gln142*; A:70 G:0) - ``` - - Based on this, we can tell that ERG11 has a missense variant at position 143 (Lysine to Arginine) and B9J08_000401 (which is FLO8) has a stop-gained variant at position 142 (Glutamine to Stop). - - ??? toggle "Default reference genomes used for clade typing and antimicrobial resistance gene detection of _C. auris_" - | Clade | Genome Accession | Assembly Name | Strain | NCBI Submitter | Included mutations in AMR genes (not comprehensive) | - | --- | --- | --- | --- | --- | --- | - | Candida auris Clade I | GCA_002759435.2 | Cand_auris_B8441_V2 | B8441 | Centers for Disease Control and Prevention | | - | Candida auris Clade II | GCA_003013715.2 | ASM301371v2 | B11220 | Centers for Disease Control and Prevention | | - | Candida auris Clade III | GCA_002775015.1 | Cand_auris_B11221_V1 | B11221 | Centers for Disease Control and Prevention | _ERG11_ V125A/F126L | - | Candida auris Clade IV | GCA_003014415.1 | Cand_auris_B11243 | B11243 | Centers for Disease Control and Prevention | _ERG11_ Y132F | - | Candida auris Clade V | GCA_016809505.1 | ASM1680950v1 | IFRC2087 | Centers for Disease Control and Prevention | | - - ??? 
toggle "Known resistance-conferring mutations for _Candida auris_" - Mutations in these genes that are known to confer resistance are shown below (source: MARDy database http://mardy.dide.ic.ac.uk/index.php) - - | **Organism** | **Found in** | **Gene name** | **Gene locus** | **AA mutation** | **Drug** | **Tandem repeat name** | **Tandem repeat sequence** | **Reference** | - | --- | --- | --- | --- | --- | --- | --- | --- | --- | - | **Candida auris** | **Human** | **ERG11** | | **Y132F** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **ERG11** | | **K143R** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **ERG11** | | **F126T** | **Fluconazole** | | | [**10.1093/cid/ciw691**](https://academic.oup.com/cid/article/64/2/134/2706620/Simultaneous-Emergence-of-Multidrug-Resistant) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Micafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Caspofungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639P** | **Anidulafungin** | | | [**10.1016/j.diagmicrobio.2017.10.021**](https://www.sciencedirect.com/science/article/pii/S0732889317303498) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Micafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Caspofungin** | | | 
[**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FKS1** | | **S639F** | **Anidulafungin** | | | [**10.1093/jac/dkx480**](https://academic.oup.com/jac/advance-article/doi/10.1093/jac/dkx480/4794718) | - | **Candida auris** | **Human** | **FUR1** | **CAMJ_004922** | **F211I** | **5-flucytosine** | | | [**https://doi.org/10.1038/s41426-018-0045-x**](https://www.nature.com/articles/s41426-018-0045-x) | + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. -??? toggle "_Candida albicans_" + The genes in which there are known resistance-conferring mutations for this pathogen are: - When this species is detected by the taxon ID tool, an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + - ERG11 + - GCS1 (FKS1) + - FUR1 + - RTA2 - The genes in which there are known resistance-conferring mutations for this pathogen are: + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): - - ERG11 - - GCS1 (FKS1) - - FUR1 - - RTA2 + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | ERG11 | ERG11 | + | GCS1 | FKS1 | + | FUR1 | FUR1 | + | RTA2 | RTA2 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? toggle "_Aspergillus fumigatus_" + When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. + + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. - When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + The genes in which there are known resistance-conferring mutations for this pathogen are: - The genes in which there are known resistance-conferring mutations for this pathogen are: + - Cyp51A + - HapE + - COX10 (AFUA_4G08340) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): - - Cyp51A - - HapE - - COX10 (AFUA_4G08340) + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | Cyp51A | Cyp51A | + | HapE | HapE | + | AFUA_4G08340 | COX10 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? toggle "_Cryptococcus neoformans_" + When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. - When this species is detected by the taxon ID tool an antifungal resistance detection task is deployed. To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance according to the MARDy database (). + ??? task "Snippy Variants: antifungal resistance detection" + To detect mutations that may confer antifungal resistance, `Snippy` is used to find all variants relative to the clade-specific reference, and these variants are queried for product names associated with resistance. - The gene in which there are known resistance-conferring mutations for this pathogen is: + The genes in which there are known resistance-conferring mutations for this pathogen are: - - ERG11 (CNA00300) + - ERG11 (CNA00300) + + We query `Snippy` results to see if any mutations were identified in those genes. By default, we automatically check for the following loci (which can be overwritten by the user). You will find the mutations next to the locus tag in the `theiaeuk_snippy_variants_hits` column corresponding gene name (see below): + + | **TheiaEuk Search Term** | **Corresponding Gene Name** | + |---|---| + | CNA00300 | ERG11 | + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs @@ -484,6 +602,10 @@ The TheiaEuk workflow automatically activates taxa-specific tasks after identifi | cg_pipeline_report | File | TSV file of read metrics from raw reads, including average read length, number of reads, and estimated genome coverage | | est_coverage_clean | Float | Estimated coverage calculated from clean reads and genome length | | est_coverage_raw | Float | Estimated coverage calculated from raw reads and genome length | +| fastq_scan_clean1_json | File | JSON file output from `fastq-scan` containing summary stats about clean forward read quality and length | +| fastq_scan_clean2_json | File | JSON file output from `fastq-scan` containing summary stats about clean reverse read quality and length | +| fastq_scan_raw1_json | File | JSON file output from `fastq-scan` containing summary stats about raw forward read quality and length | +| fastq_scan_raw2_json | File | JSON file output from `fastq-scan` containing summary stats about raw reverse read quality and length | | r1_mean_q_clean | Float | Mean quality score of clean forward reads | | r1_mean_q_raw | Float | Mean quality score of raw forward reads | | r2_mean_q_clean | Float | Mean quality score of clean reverse reads | @@ -536,4 +658,4 @@ The TheiaEuk workflow automatically activates taxa-specific tasks after identifi | theiaeuk_illumina_pe_analysis_date | String | Date of TheiaProk workflow execution | | theiaeuk_illumina_pe_version | String | TheiaProk workflow version used | -
\ No newline at end of file + diff --git a/docs/workflows/genomic_characterization/theiameta.md b/docs/workflows/genomic_characterization/theiameta.md index 55c26d9a6..d6b55e80a 100644 --- a/docs/workflows/genomic_characterization/theiameta.md +++ b/docs/workflows/genomic_characterization/theiameta.md @@ -149,7 +149,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_ncbi_scrub.wdl) | + | Task | [task_ncbi_scrub.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_ncbi_scrub.wdl) | | Software Source Code | [NCBI Scrub on GitHub](https://github.com/ncbi/sra-human-scrubber) | | Software Documentation | | @@ -214,7 +214,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -233,7 +233,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | @@ -242,21 +242,62 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge ??? task "`metaspades`: _De Novo_ Metagenomic Assembly" - While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. 
metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + + `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. !!! techdetails "MetaSPAdes Technical Details" | | Links | | --- | --- | | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | - | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | - | Software Documentation | | - | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411777/) | + | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | + | Software Documentation | [SPAdes Manual](https://ablab.github.io/spades/index.html) | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | -??? task "`minimap2`: Assembly Alignment and Contig Filtering (if a reference is provided)" +??? 
task "`minimap2`: Assembly Alignment and Contig Filtering" If a reference genome is provided through the **`reference`** optional input, the assembly produced with `metaspades` will be mapped to the reference genome with `minimap2`. The contigs which align to the reference are retrieved and returned in the **`assembly_fasta`** output. + `minimap2` is a popular aligner that is used for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly or a reference genome. + + In minimap2, "modes" are a group of preset options. Two different modes are used in this task depending on whether a reference genome is provided. + + If a reference genome is _not_ provided, the only mode used in this task is `sr` which is intended for "short single-end reads without splicing". The `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. The output file is in SAM format. + + If a reference genome is provided, then after the draft assembly polishing with `pilon`, this task runs again with the mode set to `asm20` which is intended for "long assembly to reference mapping". The `asm20` mode indicates the following parameters should be used: `-k19 -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50`. The output file is in PAF format. + + For more information, please see the [minimap2 manpage](https://lh3.github.io/minimap2/minimap2.html) + + !!! 
techdetails "minimap2 Technical Details" + | | Links | + |---|---| + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | [minimap2 on GitHub](https://github.com/lh3/minimap2) | + | Software Documentation | [minimap2](https://lh3.github.io/minimap2) | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) | + +??? task "`samtools`: SAM File Conversion " + This task converts the output SAM file from minimap2 and converts it to a BAM file. It then sorts the BAM based on the read names, and then generates an index file. + + !!! techdetails "samtools Technical Details" + | | Links | + |---|---| + | Task | [task_samtools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_parse_mapping.wdl) | + | Software Source Code | [samtools on GitHub](https://github.com/samtools/samtools) | + | Software Documentation | [samtools](https://www.htslib.org/doc/samtools.html) | + | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | + +??? task "`pilon`: Assembly Polishing" + `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. + + !!! techdetails "pilon Technical Details" + | | Links | + |---|---| + | Task | [task_pilon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_pilon.wdl) | + | Software Source Code | [Pilon on GitHub](https://github.com/broadinstitute/pilon) | + | Software Documentation | [Pilon Wiki](https://github.com/broadinstitute/pilon/wiki) | + | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | #### Assembly QC ??? task "`quast`: Assembly Quality Assessment" @@ -267,7 +308,7 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | | Links | | --- | --- | - | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | | Software Documentation | | | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | @@ -295,12 +336,16 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge | fastp_html_report | File | Report file for fastp in HTML format | | fastp_version | String | Version of fastp used | | fastq_scan_docker | String | Docker image of fastq_scan | +| 
fastq_scan_clean1_json | File | JSON file output from `fastq-scan` containing summary stats about clean forward read quality and length | +| fastq_scan_clean2_json | File | JSON file output from `fastq-scan` containing summary stats about clean reverse read quality and length | | fastq_scan_num_reads_clean_pairs | String | Number of read pairs after cleaning as calculated by fastq_scan | | fastq_scan_num_reads_clean1 | Int | Number of forward reads after cleaning as calculated by fastq_scan | | fastq_scan_num_reads_clean2 | Int | Number of reverse reads after cleaning as calculated by fastq_scan | | fastq_scan_num_reads_raw_pairs | String | Number of input read pairs as calculated by fastq_scan | | fastq_scan_num_reads_raw1 | Int | Number of input forward reads as calculated by fastq_scan | | fastq_scan_num_reads_raw2 | Int | Number of input reserve reads as calculated by fastq_scan | +| fastq_scan_raw1_json | File | JSON file output from `fastq-scan` containing summary stats about raw forward read quality and length | +| fastq_scan_raw2_json | File | JSON file output from `fastq-scan` containing summary stats about raw reverse read quality and length | | fastq_scan_version | String | fastq_scan version | | fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | | fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 2b5f5308d..8808caab2 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -722,7 +722,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Sub-workflow | 
[wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Sub-workflow | [wf_read_QC_trim_pe.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_pe.wdl)
[wf_read_QC_trim_se.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_se.wdl) | | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl)| | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | @@ -737,7 +737,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_cg_pipeline.wdl) | + | Task | [task_cg_pipeline.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_cg_pipeline.wdl) | | Software Source Code | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Software Documentation | [CG-Pipeline on GitHub](https://github.com/lskatz/CG-Pipeline/) | | Original Publication(s) | [A computational genomics pipeline for prokaryotic sequencing projects](https://academic.oup.com/bioinformatics/article/26/15/1819/188418) | @@ -746,7 +746,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al De Novo assembly will be undertaken only for samples that have sufficient read quantity and quality, as determined by the `screen` task assessment of clean reads. - In TheiaEuk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. 
This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaEuk workflow. + In TheiaProk, assembly is performed using the [Shovill](https://github.com/tseemann/shovill) pipeline. This undertakes the assembly with one of four assemblers ([SKESA](https://github.com/ncbi/SKESA) (default), [SPAdes](https://github.com/ablab/spades), [Velvet](https://github.com/dzerbino/velvet/), [Megahit](https://github.com/voutcn/megahit)), but also performs [a number of pre- and post-processing steps](https://github.com/tseemann/shovill#main-steps) to improve the resulting genome assembly. Shovill uses an estimated genome size (see [here](https://github.com/tseemann/shovill#--gsize)). If this is not provided by the user as an optional input, Shovill will estimate the genome size using [mash](https://mash.readthedocs.io/en/latest/index.html). Adaptor trimming can be undertaken with Shovill by setting the `trim` option to "true", but this is set to "false" by default as [alternative adapter trimming](https://www.notion.so/TheiaProk-Workflow-Series-89b9c08406094ec78d08a578fe861626?pvs=21) is undertaken in the TheiaProk workflow. ??? 
toggle "What is _de novo_ assembly?" _De novo_ assembly is the process or product of attempting to reconstruct a genome from scratch (without prior knowledge of the genome) using sequence reads. Assembly of fungal genomes from short-reads will produce multiple contigs per chromosome rather than a single contiguous sequence for each chromosome. @@ -754,8 +754,9 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al !!! techdetails "Shovill Technical Details" | | Links | | --- | --- | - | TheiaProk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | - | Software code repository and documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | TheiaEuk WDL Task | [task_shovill.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_shovill.wdl#L3) | + | Software Source Code | [Shovill on GitHub](https://github.com/tseemann/shovill) | + | Software Documentation | [Shovill on GitHub](https://github.com/tseemann/shovill) | #### ONT Data Core Tasks @@ -765,7 +766,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al **Estimated genome length**: - By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). This estimate can be overwritten by the user, and is used by `RASUSA` and `dragonflye`. + By default, an estimated genome length is set to 5 Mb, which is around 0.7 Mb higher than the average bacterial genome length, according to the information collated [here](https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt). 
This estimate can be overwritten by the user, and is used by `Rasusa` and `dragonflye`. **Plotting and quantifying long-read sequencing data:** `nanoplot` @@ -784,7 +785,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | Workflow | **TheiaProk_ONT** | | --- | --- | | Sub-workflow | [wf_read_QC_trim_ont.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim_ont.wdl) | - | Tasks | [task_nanoplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_nanoplot.wdl) [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/b481ce48f3d527ab8f31e4ad8171769212cc091a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) [task_nanoq.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_nanoq.wdl) [task_tiptoft.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_tiptoft.wdl) | + | Tasks | [task_nanoplot.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_nanoplot.wdl) [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) [task_rasusa.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_rasusa.wdl) [task_nanoq.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_nanoq.wdl) [task_tiptoft.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_tiptoft.wdl) | | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan), 
[NanoPlot](https://github.com/wdecoster/NanoPlot), [RASUSA](https://github.com/mbhall88/rasusa), [tiptoft](https://github.com/andrewjpage/tiptoft), [nanoq](https://github.com/esteinig/nanoq) | | Original Publication(s) | [NanoPlot paper](https://academic.oup.com/bioinformatics/article/39/5/btad311/7160911)
[RASUSA paper](https://doi.org/10.21105/joss.03941)
[Nanoq Paper](https://doi.org/10.21105/joss.02991)
[Tiptoft paper](https://doi.org/10.21105/joss.01021) | @@ -808,7 +809,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | --- | --- | | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_quast.wdl) | | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | - | Software Documentation | | + | Software Documentation | | | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | ??? task "`BUSCO`: Assembly Quality Assessment" @@ -892,7 +893,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al ??? task "`AMRFinderPlus`: AMR Genotyping (default)" - NCBI's [AMRFinderPlus](https://github.com/ncbi/amr/wiki) is the default antimicrobial resistance (AMR) detection tool used in TheiaProk. [ResFinder](https://www.notion.so/TheiaProk-Workflow-Series-68c34aca2a0240ef94fef0acd33651b9?pvs=21) may be used alternatively and if so, AMRFinderPlus is not run. + NCBI's [AMRFinderPlus](https://github.com/ncbi/amr/wiki) is the default antimicrobial resistance (AMR) detection tool used in TheiaProk. ResFinder may be used alternatively and if so, AMRFinderPlus is not run. AMRFinderPlus identifies acquired antimicrobial resistance (AMR) genes, virulence genes, and stress genes. Such AMR genes confer resistance to antibiotics, metals, biocides, heat, or acid. For some taxa (see [here](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option)), AMRFinderPlus will provide taxa-specific results including filtering out genes that are almost ubiquitous in the taxa (intrinsic genes) and identifying resistance-associated point mutations. In TheiaProk, the taxon used by AMRFinderPlus is specified based on the `gambit_predicted_taxon` or a user-provided `expected_taxon`. 
@@ -1047,7 +1048,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_plasmidfinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_typing/task_plasmidfinder.wdl) | + | Task | [task_plasmidfinder.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/plasmid_detection/task_plasmidfinder.wdl) | | Software Source Code | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | | Software Documentation | https://bitbucket.org/genomicepidemiology/plasmidfinder/src/master/ | | Original Publication(s) | [In Silico Detection and Typing of Plasmids using PlasmidFinder and Plasmid Multilocus Sequence Typing](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4068535/) | @@ -1076,7 +1077,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | | Links | | --- | --- | - | Task | [task_qc_check.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check.wdl) | + | Task | [task_qc_check_phb.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_qc_check_phb.wdl) | ??? 
task "`Taxon Tables`: Copy outputs to new data tables based on taxonomic assignment (optional)" @@ -1323,7 +1324,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | | Links | | --- | --- | - | Task | [task_kleborate.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/haemophilus/task_kleborate.wdl) | + | Task | [task_kleborate.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/klebsiella/task_kleborate.wdl) | | Software Source Code | [kleborate on GitHub](https://github.com/katholt/Kleborate) | | Software Documentation | https://github.com/katholt/Kleborate/wiki | | Orginal publication | [A genomic surveillance framework and genotyping tool for Klebsiella pneumoniae and its related species complex](https://www.nature.com/articles/s41467-021-24448-3)
[Identification of Klebsiella capsule synthesis loci from whole genome data](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000102) | @@ -1534,7 +1535,7 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after ??? task "`PopPUNK`: Global Pneumococcal Sequence Cluster typing" - Global Pneumococcal Sequence Clusters (GPSC) define and name pneumococcal strains. GPSC designation is undertaken using the PopPUNK software and GPSC database as described in the file below, obtained from [here](https://www.pneumogen.net/gps/training_command_line.html). + Global Pneumococcal Sequence Clusters (GPSC) define and name pneumococcal strains. GPSC designation is undertaken using the PopPUNK software and GPSC database as described in the file below, obtained from [here](https://www.pneumogen.net/gps/#/training#command-line). :file: [GPSC_README_PopPUNK2.txt](../../assets/files/GPSC_README_PopPUNK2.txt) @@ -1547,9 +1548,9 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | | Links | | --- | --- | | Task | [task_poppunk_streppneumo.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/streptococcus/task_poppunk_streppneumo.wdl) | - | GPSC database | https://www.pneumogen.net/gps/training_command_line.html | + | GPSC database | | | Software Source Code | [PopPunk](https://github.com/bacpop/PopPUNK) | - | Software Documentation | https://poppunk.readthedocs.io/en/latest/ | + | Software Documentation | | | Original Publication(s) | [Fast and flexible bacterial genomic epidemiology with PopPUNK](https://genome.cshlp.org/content/29/2/304) | ??? 
task "`SeroBA`: Serotyping ==_for Illumina_PE only_==" @@ -1731,12 +1732,16 @@ The TheiaProk workflows automatically activate taxa-specific sub-workflows after | est_coverage_raw | Float | Estimated coverage calculated from raw reads and genome length | ONT, PE, SE | | fastp_html_report | File | The HTML report made with fastp | PE, SE | | fastp_version | String | Version of fastp software used | PE, SE | +| fastq_scan_clean1_json | File | JSON file output from `fastq-scan` containing summary stats about clean forward read quality and length | PE, SE | +| fastq_scan_clean2_json | File | JSON file output from `fastq-scan` containing summary stats about clean reverse read quality and length | PE | | fastq_scan_num_reads_clean_pairs | String | Number of read pairs after cleaning as calculated by fastq_scan | PE | | fastq_scan_num_reads_clean1 | Int | Number of forward reads after cleaning as calculated by fastq_scan | PE, SE | | fastq_scan_num_reads_clean2 | Int | Number of reverse reads after cleaning as calculated by fastq_scan | PE | | fastq_scan_num_reads_raw_pairs | String | Number of input read pairs calculated by fastq_scan | PE | | fastq_scan_num_reads_raw1 | Int | Number of input forward reads calculated by fastq_scan | PE, SE | | fastq_scan_num_reads_raw2 | Int | Number of input reverse reads calculated by fastq_scan | PE | +| fastq_scan_raw1_json | File | JSON file output from `fastq-scan` containing summary stats about raw forward read quality and length | PE, SE | +| fastq_scan_raw2_json | File | JSON file output from `fastq-scan` containing summary stats about raw reverse read quality and length | PE | | fastq_scan_version | String | Version of fastq-scan software used | PE, SE | | fastqc_clean1_html | File | Graphical visualization of clean forward read quality from fastqc to open in an internet browser | PE, SE | | fastqc_clean2_html | File | Graphical visualization of clean reverse read quality from fastqc to open in an internet browser | PE | diff 
--git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index 7ccf78d56..c9d144997 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -14,10 +14,10 @@ Two workflows are offered: **Augur_Prep_PHB** and **Augur_PHB**. These must be r !!! dna "**Helpful resources for epidemiological interpretation**" - - [introduction to Nextstrain](https://www.cdc.gov/amd/training/covid-toolkit/module3-1.html) (which includes Auspice) - - guide to Nextstrain [interactive trees](https://www.cdc.gov/amd/training/covid-toolkit/module3-4.html) - - an [introduction to UShER](https://www.cdc.gov/amd/training/covid-toolkit/module3-3.html) - - a video about [how to read trees](https://www.cdc.gov/amd/training/covid-toolkit/module1-3.html) if this is new to you + - [introduction to Nextstrain](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-1.html) (which includes Auspice) + - guide to Nextstrain [interactive trees](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-4.html) + - an [introduction to UShER](https://www.cdc.gov/advanced-molecular-detection/php/training/module-3-3.html) + - a video about [how to read trees](https://www.cdc.gov/advanced-molecular-detection/php/training/module-1-3.html) if this is new to you - documentation on [how to identify SARS-CoV-2 recombinants](https://github.com/pha4ge/pipeline-resources/blob/main/docs/sc2-recombinants.md) ### Augur_Prep_PHB @@ -174,7 +174,7 @@ The Augur_PHB workflow takes in a ***set*** of SARS-CoV-2 (or any other viral This workflow runs on the set level. Please note that for every task, runtime parameters are modifiable (cpu, disk_size, docker, and memory); most of these values have been excluded from the table below for convenience. -
+
| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| @@ -198,7 +198,7 @@ This workflow runs on the set level. Please note that for every task, runtime pa | augur_ancestral | **inference** | String | Calculate joint or marginal maximum likelihood ancestral sequence states; options: "joint", "marginal" | joint | Optional | | augur_ancestral | **keep_ambiguous** | Boolean | If true, do not infer nucleotides at ambiguous (N) sides | FALSE | Optional | | augur_ancestral | **keep_overhangs** | Boolean | If true, do not infer nucleotides for gaps on either side of the alignment | FALSE | Optional | -| augur_export | **colors_tsv** | File | Custom color definitions, one per line in the format TRAIT_TYPE \| TRAIT_VALUE\tHEX_CODE | | Optional | +| augur_export | **colors_tsv** | File | Custom color definitions, one per line in TSV format with the following fields: TRAIT_TYPE TRAIT_VALUE HEX_CODE | | Optional | | augur_export | **description_md** | File | Markdown file with description of build and/or acknowledgements | | Optional | | augur_export | **include_root_sequence** | Boolean | Export an additional JSON containing the root sequence used to identify mutations | FALSE | Optional | | augur_export | **title** | String | Title to be displayed by Auspice | | Optional | @@ -220,7 +220,7 @@ This workflow runs on the set level. 
Please note that for every task, runtime pa | augur_tree | **exclude_sites** | File | File of one-based sites to exclude for raw tree building (BED format in .bed files, DRM format in tab-delimited files, or one position per line) | | Optional | | augur_tree | **method** | String | Which method to use to build the tree; options: "fasttree", "raxml", "iqtree" | iqtree | Optional | | augur_tree | **override_default_args** | Boolean | If true, override default tree builder arguments instead of augmenting them | FALSE | Optional | -| augur_tree | **substitution_model** | String | The substitution model to use; only available for iqtree. Specify "auto" to run ModelTest; options: "GTR" | GTR | Optional | +| augur_tree | **substitution_model** | String | The substitution model to use; only available for iqtree. Specify "auto" to run ModelTest; model options can be found [here](http://www.iqtree.org/doc/Substitution-Models) | GTR | Optional | | augur_tree | **tree_builder_args** | String | Additional tree builder arguments either augmenting or overriding the default arguments. FastTree defaults: "-nt -nosupport". RAxML defaults: "-f d -m GTRCAT -c 25 -p 235813". IQ-TREE defaults: "-ninit 2 -n 2 -me 0.05 -nt AUTO -redo" | | Optional | | sc2_defaults | **nextstrain_ncov_repo_commit** | String | The version of the from which to draw default values for SARS-CoV-2. | `23d1243127e8838a61b7e5c1a72bc419bf8c5a0d` | Optional | | organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Defaults are organism-specific. Please find default values for some organisms here: . For an organism without set defaults, an empty file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.bed", but will not be as useful as an organism specific gene locations bed file. 
| Optional | @@ -284,6 +284,7 @@ The Nextstrain team hosts documentation surrounding the Augur workflow → Auspi | **Variable** | **Type** | **Description** | | --- | --- | --- | | aligned_fastas | File | A FASTA file of the aligned genomes | +| augur_iqtree_model_used | String | The iqtree model used during augur tree | | augur_phb_analysis_date | String | The date the analysis was run | | augur_phb_version | String | The version of the Public Health Bioinformatics (PHB) repository used | | augur_version | String | Version of Augur used | diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline.md b/docs/workflows/phylogenetic_construction/snippy_streamline.md index c794be4c8..aa04198b3 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline.md @@ -173,11 +173,7 @@ For all cases: `Snippy_Variants` aligns reads for each sample against the reference genome. As part of `Snippy_Streamline`, the only output from this workflow is the `snippy_variants_outdir_tarball` which is provided in the set-level data table. Please see the full documentation for [Snippy_Variants](./snippy_variants.md) for more information. -??? task "snippy_variants (qc_metrics output)" - - ##### snippy_variants {#snippy_variants} - - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + This task also extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. 
@@ -195,9 +191,17 @@ For all cases: - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`). The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + + !!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ??? 
task "Snippy_Tree workflow" diff --git a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md index 352d5a55c..890674b3f 100644 --- a/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md +++ b/docs/workflows/phylogenetic_construction/snippy_streamline_fasta.md @@ -39,11 +39,11 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a ### Workflow Tasks -??? task "snippy_variants (qc_metrics output)" +??? task "Snippy_Variants QC Metrics Concatenation (optional)" - ##### snippy_variants {#snippy_variants} + ##### Snippy_Variants QC Metric Concatenation (optional) {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + Optionally, the user can provide the `snippy_variants_qc_metrics` file produced by the Snippy_Variants workflow as input to the workflow to concatenate the reports for each sample in the tree. These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. @@ -61,9 +61,17 @@ The `Snippy_Streamline_FASTA` workflow is an all-in-one approach to generating a - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. 
+ The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + !!! techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Inputs diff --git a/docs/workflows/phylogenetic_construction/snippy_tree.md b/docs/workflows/phylogenetic_construction/snippy_tree.md index d6c0a272b..d28160bbb 100644 --- a/docs/workflows/phylogenetic_construction/snippy_tree.md +++ b/docs/workflows/phylogenetic_construction/snippy_tree.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | 
[Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.1.0 | Yes; some optional features incompatible | Set-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria) | PHB v2.3.0 | Yes; some optional features incompatible | Set-level | ## Snippy_Tree_PHB @@ -266,7 +266,7 @@ Sequencing data used in the Snippy_Tree workflow must: | | Links | | --- | --- | - | Task | [task_summarize_data.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/task_summarize_data.wdl) | + | Task | [task_summarize_data.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_summarize_data.wdl) | ??? task "Concatenate Variants (optional)" @@ -310,11 +310,11 @@ Sequencing data used in the Snippy_Tree workflow must: | Task | task_shared_variants.wdl | | Software Source Code | [task_shared_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/phylogenetic_inference/utilities/task_shared_variants.wdl) | -??? task "snippy_variants (qc_metrics output)" +??? task "Snippy_Variants QC Metrics Concatenation (optional)" - ##### snippy_variants {#snippy_variants} + ##### Snippy_Variants QC Metric Concatenation (optional) {#snippy_variants} - This task runs Snippy to perform SNP analysis on individual samples. It extracts QC metrics from the Snippy output for each sample and saves them in per-sample TSV files (`snippy_variants_qc_metrics`). These per-sample QC metrics include the following columns: + Optionally, the user can provide the `snippy_variants_qc_metrics` file produced by the Snippy_Variants workflow as input to the workflow to concatenate the reports for each sample in the tree. These per-sample QC metrics include the following columns: - **samplename**: The name of the sample. 
- **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. @@ -332,9 +332,17 @@ Sequencing data used in the Snippy_Tree workflow must: - **meanbaseq**: Mean base quality over the reference sequence. - **meanmapq**: Mean mapping quality over the reference sequence. - These per-sample QC metrics are then combined into a single file (`snippy_combined_qc_metrics`) in the downstream `snippy_tree_wf` workflow. The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + The combined QC metrics file includes the same columns as above for all samples. Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. - **Note:** The per-sample QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses. + !!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + !!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs diff --git a/docs/workflows/phylogenetic_construction/snippy_variants.md b/docs/workflows/phylogenetic_construction/snippy_variants.md index 4ec73569a..f4fc65a37 100644 --- a/docs/workflows/phylogenetic_construction/snippy_variants.md +++ b/docs/workflows/phylogenetic_construction/snippy_variants.md @@ -4,7 +4,7 @@ | **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | |---|---|---|---|---| -| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.2.0 | Yes | Sample-level | +| [Phylogenetic Construction](../../workflows_overview/workflows_type.md/#phylogenetic-construction) | [Bacteria](../../workflows_overview/workflows_kingdom.md/#bacteria), [Mycotics](../../workflows_overview/workflows_kingdom.md#mycotics), [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.3.0 | Yes | Sample-level | ## Snippy_Variants_PHB @@ -60,14 +60,40 @@ The `Snippy_Variants` workflow aligns single-end or paired-end reads (in FASTQ f ### Workflow Tasks -`Snippy_Variants` uses the snippy tool to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. 
The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. - -Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: - -- **Percentage of reads aligned to the reference genome** (`snippy_variants_percent_reads_aligned`). -- **Percentage of the reference genome covered at or above the specified depth threshold** (`snippy_variants_percent_ref_coverage`). - -These per-sample QC metrics can be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree_wf`, providing an overview of QC metrics across all samples. +`Snippy_Variants` uses Snippy to align reads to the reference and call SNPs, MNPs and INDELs according to optional input parameters. The output includes a file of variants that is then queried using the `grep` bash command to identify any mutations in specified genes or annotations of interest. The query string MUST match the gene name or annotation as specified in the GenBank file and provided in the output variant file in the `snippy_results` column. + +!!! info "Quality Control Metrics" + Additionally, `Snippy_Variants` extracts quality control (QC) metrics from the Snippy output for each sample. These per-sample QC metrics are saved in TSV files (`snippy_variants_qc_metrics`). The QC metrics include: + + - **samplename**: The name of the sample. + - **reads_aligned_to_reference**: The number of reads that aligned to the reference genome. + - **total_reads**: The total number of reads in the sample. 
+ - **percent_reads_aligned**: The percentage of reads that aligned to the reference genome; also available in the `snippy_variants_percent_reads_aligned` output column. + - **variants_total**: The total number of variants detected between the sample and the reference genome. + - **percent_ref_coverage**: The percentage of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10); also available in the `snippy_variants_percent_ref_coverage` output column. + - **#rname**: Reference sequence name (e.g., chromosome or contig name). + - **startpos**: Starting position of the reference sequence. + - **endpos**: Ending position of the reference sequence. + - **numreads**: Number of reads covering the reference sequence. + - **covbases**: Number of bases with coverage. + - **coverage**: Percentage of the reference sequence covered (depth ≥ 1). + - **meandepth**: Mean depth of coverage over the reference sequence. + - **meanbaseq**: Mean base quality over the reference sequence. + - **meanmapq**: Mean mapping quality over the reference sequence. + + Note that the last set of columns (`#rname` to `meanmapq`) may repeat for each chromosome or contig in the reference genome. + +!!! tip "QC Metrics for Phylogenetic Analysis" + These QC metrics provide valuable insights into the quality and coverage of your sequencing data relative to the reference genome. Monitoring these metrics can help identify samples with low coverage, poor alignment, or potential issues that may affect downstream analyses, and we recommend examining them before proceeding with phylogenetic analysis if performing Snippy_Variants and Snippy_Tree separately. + + These per-sample QC metrics can also be combined into a single file (`snippy_combined_qc_metrics`) in downstream workflows, such as `snippy_tree`, providing an overview of QC metrics across all samples. + +!!! 
techdetails "Snippy Variants Technical Details" + | | Links | + | --- | --- | + | Task | [task_snippy_variants.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_variants.wdl)
[task_snippy_gene_query.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/gene_typing/variant_detection/task_snippy_gene_query.wdl) | + | Software Source Code | [Snippy on GitHub](https://github.com/tseemann/snippy) | + | Software Documentation | [Snippy on GitHub](https://github.com/tseemann/snippy) | ### Outputs @@ -92,6 +118,7 @@ These per-sample QC metrics can be combined into a single file (`snippy_combined | snippy_variants_outdir_tarball | File | A compressed file containing the whole directory of snippy output files. This is used when running Snippy_Tree | | snippy_variants_percent_reads_aligned | Float | Percentage of reads aligned to the reference genome | | snippy_variants_percent_ref_coverage| Float | Proportion of the reference genome covered by reads with a depth greater than or equal to the `min_coverage` threshold (default is 10). | +| snippy_variants_qc_metrics | File | TSV file containing quality control metrics for the sample | | snippy_variants_query | String | Query strings specified by the user when running the workflow | | snippy_variants_query_check | String | Verification that query strings are found in the reference genome | | snippy_variants_results | File | CSV file detailing results for all mutations identified in the query sequence relative to the reference | @@ -99,4 +126,4 @@ These per-sample QC metrics can be combined into a single file (`snippy_combined | snippy_variants_version | String | Version of Snippy used | | snippy_variants_wf_version | String | Version of Snippy_Variants used | -
\ No newline at end of file +
diff --git a/docs/workflows/public_data_sharing/fetch_srr_accession.md b/docs/workflows/public_data_sharing/fetch_srr_accession.md new file mode 100644 index 000000000..aa18c6438 --- /dev/null +++ b/docs/workflows/public_data_sharing/fetch_srr_accession.md @@ -0,0 +1,52 @@ +# Fetch SRR Accession Workflow + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Public Data Sharing](../../workflows_overview/workflows_type.md/#public-data-sharing) | [Any Taxa](../../workflows_overview/workflows_kingdom.md/#any-taxa) | PHB v2.3.0 | Yes | Sample-level | + +## Fetch SRR Accession + +This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. The primary inputs are BioSample IDs (e.g., SAMN00000000) or SRA Experiment IDs (e.g., SRX000000), which link to sequencing data in the SRA repository. + +The workflow uses the fastq-dl tool to fetch metadata from SRA and specifically parses this metadata to extract the associated SRR accession and outputs the SRR accession. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description**| **Default Value** | **Terra Status** | +| --- | --- | --- | --- | --- | --- | +| fetch_srr_metadata | **sample_accession** | String | SRA-compatible accession, such as a **BioSample ID** (e.g., "SAMN00000000") or **SRA Experiment ID** (e.g., "SRX000000"), used to retrieve SRR metadata. | | Required | +| fetch_srr_metadata | **cpu** | Int | Number of CPUs allocated for the task. | 2 | Optional | +| fetch_srr_metadata | **disk_size** | Int | Disk space in GB allocated for the task. | 10 | Optional | +| fetch_srr_metadata | **docker**| String | Docker image for metadata retrieval. | `us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0` | Optional | +| fetch_srr_metadata | **memory** | Int | Memory in GB allocated for the task. 
| 8 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +### Workflow Tasks + +This workflow has a single task that performs metadata retrieval for the specified sample accession. + +??? task "`fastq-dl`: Fetches SRR metadata for sample accession" + When provided a BioSample accession or SRA experiment ID, 'fastq-dl' collects metadata and returns the appropriate SRR accession. + + !!! techdetails "fastq-dl Technical Details" + | | Links | + | --- | --- | + | Task | [task_fetch_srr_accession.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_fetch_srr_accession.wdl) | + | Software Source Code | [fastq-dl on GitHub](https://github.com/rpetit3/fastq-dl) | + | Software Documentation | [fastq-dl README](https://github.com/rpetit3/fastq-dl#readme) | + +### Outputs + +| **Variable** | **Type** | **Description**| +|---|---|---| +| srr_accession| String | The SRR accession(s) associated with the input sample accession.| +| fetch_srr_accession_version | String | The version of the fetch_srr_accession workflow. | +| fetch_srr_accession_analysis_date | String | The date the fetch_srr_accession analysis was run. | + +## References + +> Petit III, R. A. "fastq-dl: Download FASTQ files from the ENA or SRA." Available at: https://github.com/rpetit3/fastq-dl 
diff --git a/docs/workflows/standalone/ncbi_scrub.md b/docs/workflows/standalone/ncbi_scrub.md index 0ae60c49b..e82b3feea 100644 --- a/docs/workflows/standalone/ncbi_scrub.md +++ b/docs/workflows/standalone/ncbi_scrub.md @@ -66,7 +66,7 @@ This workflow is composed of two tasks, one to dehost the input reads and anothe | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | diff --git a/docs/workflows_overview/workflows_alphabetically.md b/docs/workflows_overview/workflows_alphabetically.md index 3543d3cb9..c937e815b 100644 --- a/docs/workflows_overview/workflows_alphabetically.md +++ b/docs/workflows_overview/workflows_alphabetically.md @@ -47,6 +47,7 @@ title: Alphabetical Workflows | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. 
| Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | | [**Samples_to_Ref_Tree**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/public_data_sharing/fetch_srr_accession.md)| Fetch the SRR accession(s) associated with a given BioSample or SRA experiment accession | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | | [**Usher_PHB**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | | [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_kingdom.md 
b/docs/workflows_overview/workflows_kingdom.md index c77c7bc3d..d10fa2afd 100644 --- a/docs/workflows_overview/workflows_kingdom.md +++ b/docs/workflows_overview/workflows_kingdom.md @@ -24,6 +24,7 @@ title: Workflows by Kingdom | [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. | Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/public_data_sharing/fetch_srr_accession.md)| Fetch the SRR accession(s) associated with a given BioSample or SRA experiment accession | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | | [**Zip_Column_Content**](../workflows/data_export/zip_column_content.md)| Zip contents of a specified Terra data table column for many samples ("entities") | Any taxa | Set-level | Yes | v2.1.0 | [Zip_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Zip_Column_Content_PHB:main?tab=info) | diff --git 
a/docs/workflows_overview/workflows_type.md b/docs/workflows_overview/workflows_type.md index 53623d7ee..14f23fd92 100644 --- a/docs/workflows_overview/workflows_type.md +++ b/docs/workflows_overview/workflows_type.md @@ -75,6 +75,7 @@ title: Workflows by Type | [**Mercury_Prep_N_Batch**](../workflows/public_data_sharing/mercury_prep_n_batch.md)| Prepare metadata and sequence data for submission to NCBI and GISAID | Influenza, Monkeypox virus, SARS-CoV-2, Viral | Set-level | No | v2.2.0 | [Mercury_Prep_N_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Mercury_Prep_N_Batch_PHB:main?tab=info) | | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | | [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | +| [**Fetch_SRR_Accession**](../workflows/public_data_sharing/fetch_srr_accession.md)| Fetch the SRR accession(s) associated with a given BioSample or SRA experiment accession | Any taxa | Sample-level | Yes | v2.3.0 | [Fetch_SRR_Accession_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Fetch_SRR_Accession_PHB:main?tab=info) | diff --git a/mkdocs.yml b/mkdocs.yml index cc90e4e3d..613f81b15 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,6 +43,7 @@ nav: - Samples_to_Ref_Tree: workflows/phylogenetic_placement/samples_to_ref_tree.md - Usher_PHB: workflows/phylogenetic_placement/usher.md - Public Data Sharing: + - Fetch_SRR_Accession: workflows/public_data_sharing/fetch_srr_accession.md - Mercury_Prep_N_Batch: 
workflows/public_data_sharing/mercury_prep_n_batch.md - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl index f16c73618..22bd469e7 100644 --- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl +++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl @@ -28,10 +28,26 @@ task augur_tree { ~{"--tree-builder-args " + tree_builder_args} \ ~{true="--override-default-args" false="" override_default_args} \ --nthreads auto + + # If iqtree, get the model used + if [ "~{method}" == "iqtree" ]; then + if [ "~{substitution_model}" == "auto" ]; then + FASTA_BASENAME=$(basename ~{aligned_fasta} .fasta) + FASTA_DIR=$(dirname ~{aligned_fasta}) + MODEL=$(grep "Best-fit model:" ${FASTA_DIR}/${FASTA_BASENAME}-delim.iqtree.log | sed 's|Best-fit model: ||g;s|chosen.*||' | tr -d '\n\r') + else + MODEL="~{substitution_model}" + fi + echo "$MODEL" > FINAL_MODEL.txt + else + echo "" > FINAL_MODEL.txt + fi >>> + output { File aligned_tree = "~{build_name}_~{method}.nwk" String augur_version = read_string("VERSION") + String iqtree_model_used = read_string("FINAL_MODEL.txt") } runtime { docker: docker diff --git a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl index 029b94917..e2f4a2d4d 100644 --- a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl +++ b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl @@ -6,14 +6,16 @@ task fastq_scan_pe { File read2 String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") - Int disk_size = 100 - String docker = "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" + Int disk_size = 50 + String docker = 
"us-docker.pkg.dev/general-theiagen/biocontainers/fastq-scan:1.0.1--h4ac6f70_3" Int memory = 2 - Int cpu = 2 + Int cpu = 1 } command <<< - # capture date and version - date | tee DATE + # exit task in case anything fails in one-liners or variables are unset + set -euo pipefail + + # capture version fastq-scan -v | tee VERSION # set cat command based on compression @@ -24,11 +26,21 @@ task fastq_scan_pe { fi # capture forward read stats + echo "DEBUG: running fastq-scan on $(basename ~{read1})" eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json - cat ~{read1_name}_fastq-scan.json | jq .qc_stats.read_total | tee READ1_SEQS + # using simple redirect so STDOUT is not confusing + jq .qc_stats.read_total ~{read1_name}_fastq-scan.json > READ1_SEQS + echo "DEBUG: number of reads in $(basename ~{read1}): $(cat READ1_SEQS)" read1_seqs=$(cat READ1_SEQS) + echo + + # capture reverse read stats + echo "DEBUG: running fastq-scan on $(basename ~{read2})" eval "${cat_reads} ~{read2}" | fastq-scan | tee ~{read2_name}_fastq-scan.json - cat ~{read2_name}_fastq-scan.json | jq .qc_stats.read_total | tee READ2_SEQS + + # using simple redirect so STDOUT is not confusing + jq .qc_stats.read_total ~{read2_name}_fastq-scan.json > READ2_SEQS + echo "DEBUG: number of reads in $(basename ~{read2}): $(cat READ2_SEQS)" read2_seqs=$(cat READ2_SEQS) # capture number of read pairs @@ -37,17 +49,18 @@ task fastq_scan_pe { else read_pairs="Uneven pairs: R1=${read1_seqs}, R2=${read2_seqs}" fi - - echo $read_pairs | tee READ_PAIRS + + # use simple redirect so STDOUT is not confusing + echo "$read_pairs" > READ_PAIRS + echo "DEBUG: number of read pairs: $(cat READ_PAIRS)" >>> output { - File read1_fastq_scan_report = "~{read1_name}_fastq-scan.json" - File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json" + File read1_fastq_scan_json = "~{read1_name}_fastq-scan.json" + File read2_fastq_scan_json = "~{read2_name}_fastq-scan.json" Int read1_seq = read_int("READ1_SEQS") 
Int read2_seq = read_int("READ2_SEQS") String read_pairs = read_string("READ_PAIRS") String version = read_string("VERSION") - String pipeline_date = read_string("DATE") String fastq_scan_docker = docker } runtime { @@ -55,8 +68,8 @@ task fastq_scan_pe { memory: memory + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" # TES - preemptible: 0 + disk: disk_size + " GB" + preemptible: 1 maxRetries: 3 } } @@ -65,14 +78,16 @@ task fastq_scan_se { input { File read1 String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - Int disk_size = 100 + Int disk_size = 50 Int memory = 2 - Int cpu = 2 - String docker = "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" + Int cpu = 1 + String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-scan:1.0.1--h4ac6f70_3" } command <<< - # capture date and version - date | tee DATE + # exit task in case anything fails in one-liners or variables are unset + set -euo pipefail + + # capture version fastq-scan -v | tee VERSION # set cat command based on compression @@ -83,14 +98,16 @@ task fastq_scan_se { fi # capture forward read stats + echo "DEBUG: running fastq-scan on $(basename ~{read1})" eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json - cat ~{read1_name}_fastq-scan.json | jq .qc_stats.read_total | tee READ1_SEQS + # using simple redirect so STDOUT is not confusing + jq .qc_stats.read_total ~{read1_name}_fastq-scan.json > READ1_SEQS + echo "DEBUG: number of reads in $(basename ~{read1}): $(cat READ1_SEQS)" >>> output { - File fastq_scan_report = "~{read1_name}_fastq-scan.json" + File fastq_scan_json = "~{read1_name}_fastq-scan.json" Int read1_seq = read_int("READ1_SEQS") String version = read_string("VERSION") - String pipeline_date = read_string("DATE") String fastq_scan_docker = docker } runtime { @@ -98,8 +115,8 @@ task fastq_scan_se { memory: memory + " GB" cpu: cpu disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " 
GB" # TES - preemptible: 0 + disk: disk_size + " GB" + preemptible: 1 maxRetries: 3 } } diff --git a/tasks/utilities/data_export/task_broad_terra_tools.wdl b/tasks/utilities/data_export/task_broad_terra_tools.wdl index 3a3fba0fd..8a54d6bad 100644 --- a/tasks/utilities/data_export/task_broad_terra_tools.wdl +++ b/tasks/utilities/data_export/task_broad_terra_tools.wdl @@ -35,6 +35,10 @@ task export_taxon_tables { Int? num_reads_raw2 String? num_reads_raw_pairs String? fastq_scan_version + File? fastq_scan_raw1_json + File? fastq_scan_raw2_json + File? fastq_scan_clean1_json + File? fastq_scan_clean2_json Int? num_reads_clean1 Int? num_reads_clean2 String? num_reads_clean_pairs @@ -390,7 +394,8 @@ task export_taxon_tables { volatile: true } command <<< - + set -euo pipefail + # capture taxon and corresponding table names from input taxon_tables taxon_array=($(cut -f1 ~{taxon_tables} | tail +2)) echo "Taxon array: ${taxon_array[*]}" @@ -446,6 +451,10 @@ task export_taxon_tables { "num_reads_raw2": "~{num_reads_raw2}", "num_reads_raw_pairs": "~{num_reads_raw_pairs}", "fastq_scan_version": "~{fastq_scan_version}", + "fastq_scan_raw1_json": "~{fastq_scan_raw1_json}", + "fastq_scan_raw2_json": "~{fastq_scan_raw2_json}", + "fastq_scan_clean1_json": "~{fastq_scan_clean1_json}", + "fastq_scan_clean2_json": "~{fastq_scan_clean2_json}", "num_reads_clean1": "~{num_reads_clean1}", "num_reads_clean2": "~{num_reads_clean2}", "num_reads_clean_pairs": "~{num_reads_clean_pairs}", @@ -778,7 +787,7 @@ task export_taxon_tables { "agrvate_version": "~{agrvate_version}", "agrvate_docker": "~{agrvate_docker}", "srst2_vibrio_detailed_tsv": "~{srst2_vibrio_detailed_tsv}", - "srst2_vibrio_version": "~{srst2_vibrio_version}",~ + "srst2_vibrio_version": "~{srst2_vibrio_version}", "srst2_vibrio_docker": "~{srst2_vibrio_docker}", "srst2_vibrio_database": "~{srst2_vibrio_database}", "srst2_vibrio_ctxA": "~{srst2_vibrio_ctxA}", diff --git a/tasks/utilities/data_export/task_export_two_tsvs.wdl 
b/tasks/utilities/data_export/task_export_two_tsvs.wdl index d3707441f..4410e29a8 100644 --- a/tasks/utilities/data_export/task_export_two_tsvs.wdl +++ b/tasks/utilities/data_export/task_export_two_tsvs.wdl @@ -18,6 +18,7 @@ task export_two_tsvs { volatile: true } command <<< + set -euo pipefail python3 /scripts/export_large_tsv/export_large_tsv.py --project ~{terra_project1} --workspace ~{terra_workspace1} --entity_type ~{datatable1} --tsv_filename "~{datatable1}_table1.tsv" # check if second project is provided; if not, use first diff --git a/tasks/utilities/data_handling/task_fetch_srr_accession.wdl b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl new file mode 100644 index 000000000..ab8f98440 --- /dev/null +++ b/tasks/utilities/data_handling/task_fetch_srr_accession.wdl @@ -0,0 +1,62 @@ +version 1.0 + +task fetch_srr_accession { + input { + String sample_accession + String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-dl:2.0.4--pyhdfd78af_0" + Int disk_size = 10 + Int cpu = 2 + Int memory = 8 + } + meta { + volatile: true + } + command <<< + set -euo pipefail + + # Output the current date and fastq-dl version for debugging + date -u | tee DATE + fastq-dl --version | tee VERSION + + echo "Fetching metadata for accession: ~{sample_accession}" + + # Run fastq-dl and capture stderr + fastq-dl --accession ~{sample_accession} --only-download-metadata -m 2 --verbose 2> stderr.log || true + + # Handle whether the ID/accession is valid and contains SRR metadata based on stderr + if grep -q "No results found for" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "received an empty response" stderr.log; then + echo "No SRR accession found" > srr_accession.txt + echo "No SRR accession found for accession: ~{sample_accession}" + elif grep -q "is not a Study, Sample, Experiment, or Run accession" stderr.log; then + echo "Invalid accession: 
~{sample_accession}" >&2 + exit 1 + elif [[ ! -f fastq-run-info.tsv ]]; then + echo "No metadata file found for accession: ~{sample_accession}" >&2 + exit 1 + else + # Extract SRR accessions from the TSV file if it exists + SRR_accessions=$(awk -F'\t' 'NR>1 {print $1}' fastq-run-info.tsv | paste -sd ',' -) + if [[ -z "${SRR_accessions}" ]]; then + echo "No SRR accession found" > srr_accession.txt + else + echo "Extracted SRR accessions: ${SRR_accessions}" + echo "${SRR_accessions}" > srr_accession.txt + fi + fi + >>> + output { + String srr_accession = read_string("srr_accession.txt") + String fastq_dl_version = read_string("VERSION") + } + runtime { + docker: docker + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + preemptible: 1 + } +} diff --git a/tasks/utilities/data_handling/task_summarize_data.wdl b/tasks/utilities/data_handling/task_summarize_data.wdl index 40586fbf3..5e5f64468 100644 --- a/tasks/utilities/data_handling/task_summarize_data.wdl +++ b/tasks/utilities/data_handling/task_summarize_data.wdl @@ -23,6 +23,8 @@ task summarize_data { volatile: true } command <<< + set -euo pipefail + # when running on terra, comment out all input_table mentions python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv diff --git a/tasks/utilities/data_handling/task_theiacov_fasta_batch.wdl b/tasks/utilities/data_handling/task_theiacov_fasta_batch.wdl index 5ab9247ad..4eb101b2e 100644 --- a/tasks/utilities/data_handling/task_theiacov_fasta_batch.wdl +++ b/tasks/utilities/data_handling/task_theiacov_fasta_batch.wdl @@ -28,6 +28,8 @@ task sm_theiacov_fasta_wrangling { # the sm stands for supermassive Int memory = 4 } command <<< + set -euo pipefail + # check if nextclade json file exists if [ -f ~{nextclade_json} ]; then # this line splits into individual json files diff --git 
a/tasks/utilities/data_import/task_create_terra_table.wdl b/tasks/utilities/data_import/task_create_terra_table.wdl index 638052ab0..22f95453a 100644 --- a/tasks/utilities/data_import/task_create_terra_table.wdl +++ b/tasks/utilities/data_import/task_create_terra_table.wdl @@ -146,6 +146,10 @@ task create_terra_table { done >> output { diff --git a/tasks/utilities/file_handling/task_transfer_files.wdl b/tasks/utilities/file_handling/task_transfer_files.wdl index 28cfbebb9..1115df119 100644 --- a/tasks/utilities/file_handling/task_transfer_files.wdl +++ b/tasks/utilities/file_handling/task_transfer_files.wdl @@ -14,6 +14,8 @@ task transfer_files { volatile: true } command <<< + set -euo pipefail + file_path_array="~{sep=' ' files_to_transfer}" gsutil -m cp -n ${file_path_array[@]} ~{target_bucket} diff --git a/tasks/utilities/submission/task_submission.wdl b/tasks/utilities/submission/task_submission.wdl index 694b4f0e8..effa28619 100644 --- a/tasks/utilities/submission/task_submission.wdl +++ b/tasks/utilities/submission/task_submission.wdl @@ -23,6 +23,8 @@ task prune_table { volatile: true } command <<< + set -euo pipefail + # when running on terra, comment out all input_table mentions python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{project_name}" --workspace "~{workspace_name}" --entity_type ~{table_name} --tsv_filename ~{table_name}-data.tsv @@ -54,7 +56,7 @@ task prune_table { # read export table into pandas tablename = "~{table_name}-data.tsv" - table = pd.read_csv(tablename, delimiter='\t', header=0, dtype={"~{table_name}_id": 'str'}) # ensure sample_id is always a string) + table = pd.read_csv(tablename, delimiter='\t', header=0, dtype={"~{table_name}_id": 'str', "collection_date": 'str'}) # ensure sample_id is always a string) # extract the samples for upload from the entire table table = table[table["~{table_name}_id"].isin("~{sep='*' sample_names}".split("*"))] diff --git a/tests/config/environment.yml b/tests/config/environment.yml 
index 0aed07151..c4016d3ae 100644 --- a/tests/config/environment.yml +++ b/tests/config/environment.yml @@ -2,7 +2,6 @@ name: pytest-env-CI channels: - conda-forge - bioconda - - defaults dependencies: - python >=3.7 - cromwell=86 diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index efee9c22a..83d78611b 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -115,17 +115,16 @@ - path: miniwdl_run/call-fastq_scan_clean_reads/inputs.json contains: ["read1", "clearlabs"] - path: miniwdl_run/call-fastq_scan_clean_reads/outputs.json - contains: ["fastq_scan_se", "pipeline_date", "read1_seq"] + contains: ["fastq_scan_se", "read1_seq"] - path: miniwdl_run/call-fastq_scan_clean_reads/stderr.txt - path: miniwdl_run/call-fastq_scan_clean_reads/stderr.txt.offset - path: miniwdl_run/call-fastq_scan_clean_reads/stdout.txt - path: miniwdl_run/call-fastq_scan_clean_reads/task.log contains: ["wdl", "theiacov_clearlabs", "fastq_scan_clean_reads", "done"] - - path: miniwdl_run/call-fastq_scan_clean_reads/work/DATE - path: miniwdl_run/call-fastq_scan_clean_reads/work/READ1_SEQS md5sum: 097e79b36919c8377c56088363e3d8b7 - path: miniwdl_run/call-fastq_scan_clean_reads/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-fastq_scan_clean_reads/work/_miniwdl_inputs/0/clearlabs_R1_dehosted.fastq.gz - path: miniwdl_run/call-fastq_scan_clean_reads/work/clearlabs_R1_dehosted_fastq-scan.json md5sum: 869dd2e934c600bba35f30f08e2da7c9 @@ -134,17 +133,16 @@ - path: miniwdl_run/call-fastq_scan_raw_reads/inputs.json contains: ["read1", "clearlabs"] - path: miniwdl_run/call-fastq_scan_raw_reads/outputs.json - contains: ["fastq_scan_se", "pipeline_date", "read1_seq"] + contains: ["fastq_scan_se", "read1_seq"] - path: miniwdl_run/call-fastq_scan_raw_reads/stderr.txt - path: 
miniwdl_run/call-fastq_scan_raw_reads/stderr.txt.offset - path: miniwdl_run/call-fastq_scan_raw_reads/stdout.txt - path: miniwdl_run/call-fastq_scan_raw_reads/task.log contains: ["wdl", "theiacov_clearlabs", "fastq_scan_raw_reads", "done"] - - path: miniwdl_run/call-fastq_scan_raw_reads/work/DATE - path: miniwdl_run/call-fastq_scan_raw_reads/work/READ1_SEQS md5sum: 097e79b36919c8377c56088363e3d8b7 - path: miniwdl_run/call-fastq_scan_raw_reads/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-fastq_scan_raw_reads/work/_miniwdl_inputs/0/clearlabs.fastq.gz - path: miniwdl_run/call-fastq_scan_raw_reads/work/clearlabs_fastq-scan.json md5sum: 869dd2e934c600bba35f30f08e2da7c9 @@ -236,7 +234,7 @@ - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/genome_annotation.gff3 md5sum: 4dff84d2d6ada820e0e3a8bc6798d402 - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/reference.fasta md5sum: c7ce05f28e4ec0322c96f24e064ef55c - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta @@ -310,13 +308,13 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 59478efddde2191ead1b46b1f121bbc9 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/clearlabs.medaka.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv - md5sum: 151390c419b00ca44eb83e2bbfb96996 + md5sum: 0370f24c270c44f6023dd98af79501e7 - path: 
miniwdl_run/call-stats_n_coverage/command md5sum: ac020678f99ac145b11d3dbc7b9fe9ba - path: miniwdl_run/call-stats_n_coverage/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_fasta.yml b/tests/workflows/theiacov/test_wf_theiacov_fasta.yml index e688eb726..df82166e4 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_fasta.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_fasta.yml @@ -37,7 +37,7 @@ md5sum: 6808ca805661622ad65ae014a4b2a094 - path: miniwdl_run/call-consensus_qc/work/_miniwdl_inputs/0/clearlabs.fasta.gz - path: miniwdl_run/call-nextclade_v3/command - md5sum: 59868097729a0dac73f93a62d57ecd4c + md5sum: 5f142285394dd5432eeda69c8db06444 - path: miniwdl_run/call-nextclade_v3/inputs.json - path: miniwdl_run/call-nextclade_v3/outputs.json - path: miniwdl_run/call-nextclade_v3/stderr.txt @@ -50,22 +50,22 @@ - path: miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.auspice.json - path: miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.json - path: miniwdl_run/call-nextclade_v3/work/clearlabs.fasta.gz.nextclade.tsv - md5sum: 3aeae954ba64b8ad7db55e08f9c7131c + md5sum: 6f73969f56007a50f230d9768d95daf1 - path: miniwdl_run/call-nextclade_v3/work/nextclade.aligned.fasta md5sum: bf487271d506418ea23fe30fc033e44d - path: miniwdl_run/call-nextclade_v3/work/nextclade.csv - md5sum: 50ca5404982b62cbdf077c5d16543e6f + md5sum: d03e4ca908ab966f2a5c4e6a2a346c74 - path: miniwdl_run/call-nextclade_v3/work/nextclade.ndjson - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/genome_annotation.gff3 md5sum: 4dff84d2d6ada820e0e3a8bc6798d402 - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/reference.fasta md5sum: c7ce05f28e4ec0322c96f24e064ef55c - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta md5sum: 
c2a4d6cbb837dce22d81f9c36dd0629e - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/tree.json - md5sum: f5a645741d65a60de34373e9e912b8a1 + md5sum: 82d588f58ef37c713bdc1eb8d2c5c22d - path: miniwdl_run/call-nextclade_v3/work/nextclade.cds_translation.E.fasta md5sum: dc43b1e98245a25c142aec52b29a07df - path: miniwdl_run/call-nextclade_v3/work/nextclade.cds_translation.M.fasta @@ -111,7 +111,7 @@ - path: miniwdl_run/call-nextclade_output_parser/work/_miniwdl_inputs/0/clearlabs.fasta.gz.nextclade.tsv md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-nextclade_output_parser/work/input.tsv - md5sum: 3aeae954ba64b8ad7db55e08f9c7131c + md5sum: 6f73969f56007a50f230d9768d95daf1 - path: miniwdl_run/call-pangolin4/command md5sum: b9c36681b77c5e007bf7e890265d70eb - path: miniwdl_run/call-pangolin4/inputs.json @@ -130,12 +130,12 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 71eba5c871bca955ab2a69dbd2c3c62e - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: e01f9468a9a5490f5743cc0ca76286a7 + md5sum: e5d1adcf421ec6306f35626a6f7c9961 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/clearlabs.fasta.gz - path: miniwdl_run/call-pangolin4/work/fasta.pangolin_report.csv - md5sum: 163d8390eb18b50c7d871edf815d029f + md5sum: 87c7b2dbd5d507949ff6cfddfee22766 - path: miniwdl_run/call-vadr/command md5sum: 9e4318eb5b452da239723882bbcfe352 - path: miniwdl_run/call-vadr/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 070887778..d2e0c64f9 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -60,11 +60,11 @@ md5sum: d41d8cd98f00b204e9800998ecf8427e # fastq scan raw - path: 
miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/command - md5sum: 9b2cc0107f1a90972482d7b3a658d242 + md5sum: 56bcc1ba5d2a9c94f4704fc4b8e6b7ba - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/inputs.json contains: ["read1", "read2", "illumina_pe"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/outputs.json - contains: ["fastq_scan_pe", "pipeline_date", "read1_seq", "read2_seq"] + contains: ["fastq_scan_pe", "read1_seq", "read2_seq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stdout.txt @@ -74,7 +74,6 @@ md5sum: 2a77387b247176aa5fcc9aed228699c9 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/SRR13687078_2_fastq-scan.json md5sum: d0eebdd4e14cf0a0b371fee1338474c9 - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ1_SEQS md5sum: 4e4a08422dbf7001fd09ad5126e13b44 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ2_SEQS @@ -365,14 +364,14 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: e98d2fc28664c0622f6b490433286e32 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/SRR13687078.ivar.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e # nextclade - path: miniwdl_run/call-nextclade_v3/command - md5sum: 75c10b0cc6a7c826b84f6b3fa8be5a26 + md5sum: 113378e9114fde0abcf359fda49de568 - path: miniwdl_run/call-nextclade_v3/inputs.json contains: ["dataset_name", "dataset_tag", "genome_fasta"] - path: miniwdl_run/call-nextclade_v3/outputs.json diff --git 
a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 2398707c0..362fa45d0 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -56,11 +56,11 @@ md5sum: d41d8cd98f00b204e9800998ecf8427e # fastq scan raw - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/command - md5sum: 56f66a4ef82d3ae03c17db6a26f59528 + md5sum: f96c3103490fff3560fc930a84bd459d - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/inputs.json contains: ["read1", "illumina_se"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/outputs.json - contains: ["fastq_scan_se", "pipeline_date", "read1_seq"] + contains: ["fastq_scan_se", "read1_seq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stdout.txt @@ -68,7 +68,6 @@ contains: ["wdl", "theiacov_illumina_se", "fastq_scan_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/ERR6319327_fastq-scan.json md5sum: 66b2f7c60b74de654f590d77bdd2231e - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ1_SEQS md5sum: 87f1a9ed69127009aa0c173cd74c9d31 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION @@ -317,14 +316,14 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 0b1f8fb5b938fe71631f61234cbf7ab3 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/ERR6319327.ivar.consensus.fasta md5sum: 
d41d8cd98f00b204e9800998ecf8427e # nextclade - path: miniwdl_run/call-nextclade_v3/command - md5sum: a98129345713c75ac2e51ffa465c1703 + md5sum: c5d644127d8eae3f8fb3e3eaecb7fd2e - path: miniwdl_run/call-nextclade_v3/inputs.json contains: ["dataset_name", "dataset_tag", "genome_fasta"] - path: miniwdl_run/call-nextclade_v3/outputs.json @@ -373,7 +372,7 @@ - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/sequences.fasta - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/tree.json - path: miniwdl_run/call-nextclade_v3/work/nextclade_dataset_dir/pathogen.json - md5sum: a51a91e0b5e16590c1afd0c7897ad071 + md5sum: 32f20640f926d5b59fed6b954541792d - path: miniwdl_run/call-nextclade_v3/work/_miniwdl_inputs/0/ERR6319327.ivar.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e # nextclade output parsing diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml index 333f39b90..1348bce94 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml @@ -205,9 +205,9 @@ - path: miniwdl_run/call-pangolin4/work/PANGOLIN_NOTES md5sum: 35aa27af5fb90d54561ee9d45a3163d5 - path: miniwdl_run/call-pangolin4/work/PANGO_ASSIGNMENT_VERSION - md5sum: 0803245359027bd3017d2bd9a9c9c8e3 + md5sum: 36f64a1cd7c6844309e8ad2121358088 - path: miniwdl_run/call-pangolin4/work/VERSION_PANGOLIN_ALL - md5sum: b5dbf2ba7480effea8c656099df0e78e + md5sum: dfd90750c8776f46bad1de214c1d1a57 - path: miniwdl_run/call-pangolin4/work/_miniwdl_inputs/0/ont.medaka.consensus.fasta md5sum: d41d8cd98f00b204e9800998ecf8427e - path: miniwdl_run/call-pangolin4/work/ont.pangolin_report.csv diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index f9cfc3f44..0ddf01645 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ 
b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -416,14 +416,13 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/inputs.json contains: ["read", "fastq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/outputs.json - contains: ["read", "fastq", "fastq_scan_report"] + contains: ["read", "fastq", "fastq_scan_json"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stderr.txt - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stdout.txt contains: ["fastq", "qc_stats", "read_lengths"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/task.log contains: ["wdl", "theiaprok_illumina_pe", "fastq_scan_clean", "done"] - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/READ1_SEQS md5sum: 5fcafec683df465a99878ceaffe8a294 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/READ2_SEQS @@ -431,7 +430,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/READ_PAIRS md5sum: 5fcafec683df465a99878ceaffe8a294 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/_miniwdl_inputs/0/test_1.clean.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/_miniwdl_inputs/0/test_2.clean.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/test_1.clean_fastq-scan.json @@ -443,14 +442,13 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/inputs.json contains: ["read", "fastq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/outputs.json - contains: ["read", "fastq", "fastq_scan_report"] + contains: ["read", "fastq", "fastq_scan_json"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt - path: 
miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stdout.txt contains: ["fastq", "qc_stats", "read_lengths"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/task.log contains: ["wdl", "theiaprok_illumina_pe", "fastq_scan_raw", "done"] - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ1_SEQS md5sum: 75fa2f47fecb5dec8d244366881e76ec - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ2_SEQS @@ -462,7 +460,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/SRR2838702_R2_fastq-scan.json md5sum: e81f34050c11995771de79182f06d793 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/_miniwdl_inputs/0/SRR2838702_R1.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/_miniwdl_inputs/0/SRR2838702_R2.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_pe/command @@ -629,9 +627,9 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 4d69a6539b68503af9f3f1c2787ff920 + md5sum: 850ad97598aca5c28eb36e6a5c13c2fc - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: 3cb5c86b15e931b0c0b98ed784386438 + md5sum: d8db687487a45536d4837a540ed2a135 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index ad9ee1090..c8b49bc78 100644 --- 
a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -400,18 +400,17 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/inputs.json contains: ["read", "fastq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/outputs.json - contains: ["read", "fastq", "fastq_scan_report"] + contains: ["read", "fastq", "fastq_scan_json"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stderr.txt - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/stdout.txt contains: ["fastq", "qc_stats", "read_lengths"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/task.log contains: ["wdl", "theiaprok_illumina_se", "fastq_scan_clean", "done"] - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/READ1_SEQS md5sum: 499f7af0d267a13f5523ec9a60ec46e3 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/_miniwdl_inputs/0/test_1.clean.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_clean/work/test_1.clean_fastq-scan.json md5sum: eb30273b3f19578fec5360da8b255e28 @@ -420,20 +419,19 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/inputs.json contains: ["read", "fastq"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/outputs.json - contains: ["read", "fastq", "fastq_scan_report"] + contains: ["read", "fastq", "fastq_scan_json"] - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stderr.txt.offset - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/stdout.txt contains: ["fastq", "qc_stats", "read_lengths"] - path: 
miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/task.log contains: ["wdl", "theiaprok_illumina_se", "fastq_scan_raw", "done"] - - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/DATE - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/READ1_SEQS md5sum: 75fa2f47fecb5dec8d244366881e76ec - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/SRR2838702_R1_fastq-scan.json md5sum: c4a64c8fd27fa357206e0d41b74866e2 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION - md5sum: 8e4e9cdfbacc9021a3175ccbbbde002b + md5sum: a59bb42644e35c09b8fa8087156fa4c2 - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/_miniwdl_inputs/0/SRR2838702_R1.fastq.gz - path: miniwdl_run/call-read_QC_trim/call-trimmomatic_se/command md5sum: a317f1a2182fe1a3b26812b54eff088e @@ -592,9 +590,9 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 4d69a6539b68503af9f3f1c2787ff920 + md5sum: 850ad97598aca5c28eb36e6a5c13c2fc - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: fdb66b59ac886501a4ae90a25cefd633 + md5sum: 4111a758490174325ae8ea52a95319e9 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl diff --git a/workflows/freyja/wf_freyja_fastq.wdl b/workflows/freyja/wf_freyja_fastq.wdl index ca81d16d0..2e0fe755e 100644 --- a/workflows/freyja/wf_freyja_fastq.wdl +++ b/workflows/freyja/wf_freyja_fastq.wdl @@ -120,10 +120,14 @@ workflow freyja_fastq { String fastq_scan_num_reads_raw1 = select_first([read_QC_trim_pe.fastq_scan_raw1, read_QC_trim_se.fastq_scan_raw1, ""]) Int? fastq_scan_num_reads_raw2 = read_QC_trim_pe.fastq_scan_raw2 String? 
fastq_scan_num_reads_raw_pairs = read_QC_trim_pe.fastq_scan_raw_pairs + String fastq_scan_raw1_json = select_first([read_QC_trim_pe.fastq_scan_raw1_json, read_QC_trim_se.fastq_scan_raw1_json, ""]) + File? fastq_scan_raw2_json = read_QC_trim_pe.fastq_scan_raw2_json String fastq_scan_version = select_first([read_QC_trim_pe.fastq_scan_version, read_QC_trim_se.fastq_scan_version, ""]) String fastq_scan_num_reads_clean1 = select_first([read_QC_trim_pe.fastq_scan_clean1, read_QC_trim_se.fastq_scan_clean1, ""]) Int? fastq_scan_num_reads_clean2 = read_QC_trim_pe.fastq_scan_clean2 String? fastq_scan_num_reads_clean_pairs = read_QC_trim_pe.fastq_scan_clean_pairs + String fastq_scan_clean1_json = select_first([read_QC_trim_pe.fastq_scan_clean1_json, read_QC_trim_se.fastq_scan_clean1_json, ""]) + File? fastq_scan_clean2_json = read_QC_trim_pe.fastq_scan_clean2_json # Read QC - fastqc outputs - Illumina PE and SE String fastqc_num_reads_raw1 = select_first([read_QC_trim_pe.fastqc_raw1, read_QC_trim_se.fastqc_raw1, ""]) Int? fastqc_num_reads_raw2 = read_QC_trim_pe.fastqc_raw2 diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl index 3398d430f..bb003b705 100644 --- a/workflows/phylogenetics/wf_augur.wdl +++ b/workflows/phylogenetics/wf_augur.wdl @@ -203,6 +203,7 @@ workflow augur { File? auspice_input_json = augur_export.auspice_json File? time_tree = augur_refine.refined_tree File distance_tree = augur_tree.aligned_tree + String augur_iqtree_model_used = augur_tree.iqtree_model_used File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta]) File combined_assemblies = filter_sequences_by_length.filtered_fasta File? 
metadata_merged = tsv_join.out_tsv diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 8fd04b389..d63f61c0f 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -171,6 +171,8 @@ workflow theiacov_clearlabs { Int fastq_scan_num_reads_raw1 = fastq_scan_raw_reads.read1_seq Int fastq_scan_num_reads_clean1 = fastq_scan_clean_reads.read1_seq String fastq_scan_version = fastq_scan_raw_reads.version + File fastq_scan_raw1_json = fastq_scan_raw_reads.fastq_scan_json + File fastq_scan_clean1_json = fastq_scan_clean_reads.fastq_scan_json # Read QC - kraken outputs String kraken_version = kraken2_raw.version Float kraken_human = kraken2_raw.percent_human diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 5b19bcc23..7bf1fc36a 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -260,6 +260,10 @@ workflow theiacov_illumina_pe { Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 Int? fastq_scan_num_reads_clean2 = read_QC_trim.fastq_scan_clean2 String? fastq_scan_num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? fastq_scan_raw2_json = read_QC_trim.fastq_scan_raw2_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json + File? fastq_scan_clean2_json = read_QC_trim.fastq_scan_clean2_json # Read QC - fastqc outputs Int? fastqc_num_reads_raw1 = read_QC_trim.fastqc_raw1 Int? fastqc_num_reads_raw2 = read_QC_trim.fastqc_raw2 diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index a183015a7..0a92ef2fc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -215,6 +215,8 @@ workflow theiacov_illumina_se { Int? 
fastq_scan_num_reads_raw1 = read_QC_trim.fastq_scan_raw1 String? fastq_scan_version = read_QC_trim.fastq_scan_version Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json # Read QC - fastqc outputs Int? fastqc_num_reads_raw1 = read_QC_trim.fastqc_raw1 Int? fastqc_num_reads_clean1 = read_QC_trim.fastqc_clean1 diff --git a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl index 67c5c5464..3e792345d 100644 --- a/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl +++ b/workflows/theiaeuk/wf_theiaeuk_illumina_pe.wdl @@ -208,6 +208,10 @@ workflow theiaeuk_illumina_pe { Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 Int? fastq_scan_num_reads_clean2 = read_QC_trim.fastq_scan_clean2 String? fastq_scan_num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? fastq_scan_raw2_json = read_QC_trim.fastq_scan_raw2_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json + File? fastq_scan_clean2_json = read_QC_trim.fastq_scan_clean2_json # Read QC - trimmomatic outputs String? trimmomatic_version = read_QC_trim.trimmomatic_version String? trimmomatic_docker = read_QC_trim.trimmomatic_docker diff --git a/workflows/theiameta/wf_theiameta_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_illumina_pe.wdl index 51f1a0054..2a6a23488 100644 --- a/workflows/theiameta/wf_theiameta_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_illumina_pe.wdl @@ -207,6 +207,10 @@ workflow theiameta_illumina_pe { Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 Int? fastq_scan_num_reads_clean2 = read_QC_trim.fastq_scan_clean2 String? fastq_scan_num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? 
fastq_scan_raw2_json = read_QC_trim.fastq_scan_raw2_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json + File? fastq_scan_clean2_json = read_QC_trim.fastq_scan_clean2_json # Read QC - fastqc outputs Int? fastqc_num_reads_raw1 = read_QC_trim.fastqc_raw1 Int? fastqc_num_reads_raw2 = read_QC_trim.fastqc_raw2 diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index d71c5e324..32271224e 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -277,6 +277,10 @@ workflow theiaprok_illumina_pe { num_reads_clean1 = read_QC_trim.fastq_scan_clean1, num_reads_clean2 = read_QC_trim.fastq_scan_clean2, num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs, + fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json, + fastq_scan_raw2_json = read_QC_trim.fastq_scan_raw2_json, + fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json, + fastq_scan_clean2_json = read_QC_trim.fastq_scan_clean2_json, trimmomatic_version = read_QC_trim.trimmomatic_version, fastp_version = read_QC_trim.fastp_version, bbduk_docker = read_QC_trim.bbduk_docker, @@ -615,6 +619,10 @@ workflow theiaprok_illumina_pe { Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 Int? fastq_scan_num_reads_clean2 = read_QC_trim.fastq_scan_clean2 String? fastq_scan_num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? fastq_scan_raw2_json = read_QC_trim.fastq_scan_raw2_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json + File? fastq_scan_clean2_json = read_QC_trim.fastq_scan_clean2_json # Read QC - fastqc outputs Int? fastqc_num_reads_raw1 = read_QC_trim.fastqc_raw1 Int? 
fastqc_num_reads_raw2 = read_QC_trim.fastqc_raw2 diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index 1c3eee081..e743ecbce 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -254,6 +254,8 @@ workflow theiaprok_illumina_se { num_reads_raw1 = read_QC_trim.fastq_scan_raw1, fastq_scan_version = read_QC_trim.fastq_scan_version, num_reads_clean1 = read_QC_trim.fastq_scan_clean1, + fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json, + fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json, trimmomatic_version = read_QC_trim.trimmomatic_version, fastp_version = read_QC_trim.fastp_version, bbduk_docker = read_QC_trim.bbduk_docker, @@ -571,6 +573,8 @@ workflow theiaprok_illumina_se { Int? fastq_scan_num_reads_raw1 = read_QC_trim.fastq_scan_raw1 String? fastq_scan_version = read_QC_trim.fastq_scan_version Int? fastq_scan_num_reads_clean1 = read_QC_trim.fastq_scan_clean1 + File? fastq_scan_raw1_json = read_QC_trim.fastq_scan_raw1_json + File? fastq_scan_clean1_json = read_QC_trim.fastq_scan_clean1_json # Read QC - fastqc outputs Int? fastqc_num_reads_raw1 = read_QC_trim.fastqc_raw1 Int? fastqc_num_reads_clean1 = read_QC_trim.fastqc_clean1 diff --git a/workflows/utilities/data_import/wf_fetch_srr_accession.wdl b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl new file mode 100644 index 000000000..e40e54a0f --- /dev/null +++ b/workflows/utilities/data_import/wf_fetch_srr_accession.wdl @@ -0,0 +1,26 @@ +version 1.0 + +import "../../../tasks/utilities/data_handling/task_fetch_srr_accession.wdl" as srr_task +import "../../../tasks/task_versioning.wdl" as versioning_task + +workflow fetch_srr_accession { + meta { + description: "This workflow retrieves the Sequence Read Archive (SRA) accession (SRR) associated with a given sample accession. It uses the fastq-dl tool to fetch metadata from SRA and outputs the SRR accession." 
+ } + input { + String sample_accession + } + call versioning_task.version_capture { + input: + } + call srr_task.fetch_srr_accession as fetch_srr { + input: + sample_accession = sample_accession + } + output { + String srr_accession = fetch_srr.srr_accession + # Version Captures + String fetch_srr_accession_version = version_capture.phb_version + String fetch_srr_accession_analysis_date = version_capture.date + } +} diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index f01954654..65c513cb8 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -52,10 +52,10 @@ workflow organism_parameters { String sc2_org_name = "sars-cov-2" String sc2_reference_genome = "gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta" String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" - String sc2_nextclade_ds_tag = "2024-07-17--12-57-03Z" + String sc2_nextclade_ds_tag = "2024-11-19--14-18-53Z" String sc2_nextclade_ds_name = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" String sc2_kraken_target_organism = "Severe acute respiratory syndrome coronavirus 2" - String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29" + String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.31" Int sc2_genome_len = 29903 Int sc2_vadr_max_length = 30000 Int sc2_vadr_skip_length = 10000 @@ -66,7 +66,7 @@ workflow organism_parameters { String mpox_org_name = "MPXV" String mpox_reference_genome = "gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta" String mpox_gene_locations_bed = "gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed" - String mpox_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String mpox_nextclade_ds_tag = "2024-11-19--14-18-53Z" String mpox_nextclade_ds_name = 
"nextstrain/mpox/lineage-b.1" String mpox_kraken_target_organism = "Monkeypox virus" String mpox_primer_bed_file = "gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed" @@ -125,7 +125,7 @@ workflow organism_parameters { if (flu_subtype == "H1N1") { String h1n1_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.fasta" String h1n1_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_ha.gb" - String h1n1_ha_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String h1n1_ha_nextclade_ds_tag = "2024-11-27--02-51-00Z" String h1n1_ha_nextclade_ds_name = "nextstrain/flu/h1n1pdm/ha/MW626062" String h1n1_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_h1n1pdm_ha.tsv" String h1n1_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h1n1pdm.json" @@ -133,7 +133,7 @@ workflow organism_parameters { if (flu_subtype == "H3N2") { String h3n2_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.fasta" String h3n2_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_ha.gb" - String h3n2_ha_nextclade_ds_tag = "2024-08-08--05-08-21Z" + String h3n2_ha_nextclade_ds_tag = "2024-11-27--02-51-00Z" String h3n2_ha_nextclade_ds_name = "nextstrain/flu/h3n2/ha/EPI1857216" String h3n2_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_h3n2_ha.tsv" String h3n2_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h3n2.json" @@ -141,7 +141,7 @@ workflow organism_parameters { if (flu_subtype == "Victoria") { String vic_ha_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.fasta" String vic_ha_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_ha.gb" - String vic_ha_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String vic_ha_nextclade_ds_tag = "2024-11-05--09-19-52Z" String 
vic_ha_nextclade_ds_name = "nextstrain/flu/vic/ha/KX058884" String vic_ha_clades_tsv = "gs://theiagen-public-files-rp/terra/flu-references/clades_vic_ha.tsv" String vic_ha_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_vic.json" @@ -168,21 +168,21 @@ workflow organism_parameters { if (flu_subtype == "H1N1") { String h1n1_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.fasta" String h1n1_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h1n1pdm_na.gb" - String h1n1_na_nextclade_ds_tag = "2024-07-03--08-29-55Z" + String h1n1_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String h1n1_na_nextclade_ds_name = "nextstrain/flu/h1n1pdm/na/MW626056" String h1n1_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h1n1pdm.json" } if (flu_subtype == "H3N2") { String h3n2_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.fasta" String h3n2_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_h3n2_na.gb" - String h3n2_na_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String h3n2_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String h3n2_na_nextclade_ds_name = "nextstrain/flu/h3n2/na/EPI1857215" String h3n2_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_h3n2.json" } if (flu_subtype == "Victoria") { String vic_na_reference = "gs://theiagen-public-files-rp/terra/flu-references/reference_vic_na.fasta" String vic_na_reference_gbk = "gs://theiagen-public-files-rp/terra/flu-references/reference_yam_na.gb" - String vic_na_nextclade_ds_tag = "2024-04-19--07-50-39Z" + String vic_na_nextclade_ds_tag = "2024-11-05--09-19-52Z" String vic_na_nextclade_ds_name = "nextstrain/flu/vic/na/CY073894" String vic_na_auspice_config = "gs://theiagen-public-files-rp/terra/flu-references/auspice_config_vic.json" } @@ -198,7 +198,7 @@ workflow organism_parameters { if 
(organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A") { String rsv_a_org_name = "rsv_a" String rsv_a_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta" - String rsv_a_nextclade_ds_tag = "2024-08-01--22-31-31Z" + String rsv_a_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_a_nextclade_ds_name = "nextstrain/rsv/a/EPI_ISL_412866" Int rsv_a_genome_len = 15500 String rsv_a_kraken_target_organism = "Human respiratory syncytial virus A" @@ -222,7 +222,7 @@ workflow organism_parameters { if (organism == "rsv_b" || organism == "rsv-b" || organism == "RSV-B" || organism == "RSV_B") { String rsv_b_org_name = "rsv_b" String rsv_b_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta" - String rsv_b_nextclade_ds_tag = "2024-08-01--22-31-31Z" + String rsv_b_nextclade_ds_tag = "2024-11-27--02-51-00Z" String rsv_b_nextclade_ds_name = "nextstrain/rsv/b/EPI_ISL_1653999" Int rsv_b_genome_len = 15500 String rsv_b_kraken_target_organism = "human respiratory syncytial virus" diff --git a/workflows/utilities/wf_read_QC_trim_ont.wdl b/workflows/utilities/wf_read_QC_trim_ont.wdl index 07dec7a35..8cc609346 100644 --- a/workflows/utilities/wf_read_QC_trim_ont.wdl +++ b/workflows/utilities/wf_read_QC_trim_ont.wdl @@ -9,7 +9,7 @@ import "../../tasks/utilities/task_rasusa.wdl" as rasusa_task workflow read_QC_trim_ont { meta { - description: "Runs basic QC on Oxford Nanopore (ONT) reads with (1) fastq_scan, (2) nanoplot, (3) rasusa downsampling, (4) tiptoft plasmid detection, and (5) nanoq filtering" + description: "Runs basic QC on Oxford Nanopore (ONT) reads with nanoplot, rasusa downsampling, tiptoft plasmid detection, and nanoq filtering" } input { String samplename diff --git a/workflows/utilities/wf_read_QC_trim_pe.wdl b/workflows/utilities/wf_read_QC_trim_pe.wdl index 3718a9f17..ee921bc12 100644 --- a/workflows/utilities/wf_read_QC_trim_pe.wdl +++ 
b/workflows/utilities/wf_read_QC_trim_pe.wdl @@ -182,6 +182,10 @@ workflow read_QC_trim_pe { String? fastq_scan_clean_pairs = fastq_scan_clean.read_pairs String? fastq_scan_version = fastq_scan_raw.version String? fastq_scan_docker = fastq_scan_raw.fastq_scan_docker + File? fastq_scan_raw1_json = fastq_scan_raw.read1_fastq_scan_json + File? fastq_scan_raw2_json = fastq_scan_raw.read2_fastq_scan_json + File? fastq_scan_clean1_json = fastq_scan_clean.read1_fastq_scan_json + File? fastq_scan_clean2_json = fastq_scan_clean.read2_fastq_scan_json # fastqc Int? fastqc_raw1 = fastqc_raw.read1_seq diff --git a/workflows/utilities/wf_read_QC_trim_se.wdl b/workflows/utilities/wf_read_QC_trim_se.wdl index 9808f09e9..f82d3aae3 100644 --- a/workflows/utilities/wf_read_QC_trim_se.wdl +++ b/workflows/utilities/wf_read_QC_trim_se.wdl @@ -157,6 +157,8 @@ workflow read_QC_trim_se { Int? fastq_scan_clean1 = fastq_scan_clean.read1_seq String? fastq_scan_version = fastq_scan_raw.version String? fastq_scan_docker = fastq_scan_raw.fastq_scan_docker + File? fastq_scan_raw1_json = fastq_scan_raw.fastq_scan_json + File? fastq_scan_clean1_json = fastq_scan_clean.fastq_scan_json # fastqc Int? fastqc_raw1 = fastqc_raw.read1_seq