diff --git a/.dockstore.yml b/.dockstore.yml index 146638eb7..b69a14ca6 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -230,6 +230,11 @@ workflows: primaryDescriptorPath: /workflows/theiameta/wf_theiameta_illumina_pe.wdl testParameterFiles: - /tests/inputs/empty.json + - name: TheiaMeta_Panel_Illumina_PE_PHB + subclass: WDL + primaryDescriptorPath: /workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl + testParameterFiles: + - /tests/inputs/empty.json - name: Snippy_Streamline_PHB subclass: WDL primaryDescriptorPath: /workflows/phylogenetics/wf_snippy_streamline.wdl diff --git a/docs/workflows/genomic_characterization/theiameta.md b/docs/workflows/genomic_characterization/theiameta.md index d6b55e80a..ffcbe294a 100644 --- a/docs/workflows/genomic_characterization/theiameta.md +++ b/docs/workflows/genomic_characterization/theiameta.md @@ -247,7 +247,6 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. !!! 
techdetails "MetaSPAdes Technical Details" - | | Links | | --- | --- | | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md new file mode 100644 index 000000000..56458276d --- /dev/null +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -0,0 +1,687 @@ +# TheiaMeta Panel + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic_characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.X.X | Yes | Sample-level | + +## TheiaMeta_Panel_Illumina_PE_PHB + +TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Surveillance Panel](https://www.illumina.com/products/by-type/sequencing-kits/library-prep-kits/viral-surveillance-panel.html)[^1]; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. + +[^1]: We are not affiliated with Illumina, Inc. The mention of the Illumina Viral Surveillance Panel is for informational purposes only. + +??? toggle "**What organisms and taxon IDs are identified by default?**" + The Illumina VSP panel contains over 224 viral species, of which 163 can be identified in the default Kraken2 viral database. + + Accordingly, the following 163 taxon IDs are used by default in TheiaMeta_Panel_Illumina_PE. Feel free to search this table to see if your organism of interest is included. + +
+ + | **Taxon ID** | **Organism Name in Illumina VSP Panel** | + |---|---| + | 10804 | Adeno-associated virus 2 (AAV2) | + | 1313215 | Aichi virus 1 (AiV-A1) | + | 2849717  | Aigai virus (AIGV) | + | 1980456 | Andes virus (ANDV) | + | 1424613 | Anjozorobe virus (ANJV) | + | 90961 | Australian bat lyssavirus (ABLV) | + | 3052470 | Bayou virus (BAYV) | + | 3052490 | Black Creek Canal virus (BCCV) | + | 2010960 | Bombali virus (BOMV) | + | 1618189 | Bourbon virus (BRBV) | + | 565995 | Bundibugyo virus (BDBV) | + | 80935 | Cache Valley virus (CVV) | + | 35305 | California encephalitis virus (CEV) | + | 1221391 | Cedar virus (CedV) | + | 3052302 | Chapare virus (CHAPV) | + | 37124 | Chikungunya virus (CHIKV) | + | 169173 | Choclo virus (CHOV) | + | 46839 | Colorado tick fever virus (CTFV) | + | 138948 | Coxsackievirus A | + | 138949 | Coxsackievirus B | + | 3052518 | Crimean-Congo hemorrhagic fever virus (CCHFV) | + | 11053 | Dengue Virus 1 | + | 11060 | Dengue Virus 2 | + | 11069 | Dengue Virus 3 | + | 11070 | Dengue Virus 4 | + | 3052477 | Dobrava virus (DOBV) | + | 38767 | Duvenhage virus (DUVV) | + | 11021 | Eastern equine encephalitis virus (EEEV) | + | 138951 | Enterovirus D | + | 10376 | Epstein-Barr virus (EBV) | + | 57482 | European bat lyssavirus 1 | + | 57483 | European bat lyssavirus 2 | + | 2847089 | Ghana virus (GhV) | + | 3052307 | Guanarito virus (GTOV) | + | 3052480 | Hantaan virus (HTNV) | + | 1216928 | Heartland virus (HRTV) | + | 3052223 | Hendra virus (HeV) | + | 12092 | Hepatitis A virus (HAV) | + | 3052230 | Hepatitis C virus (HCV) | + | 12475 | Hepatitis D virus (HDV) | + | 10298 | Herpes simplex virus 1 (HSV1) | + | 129875 | Human adenovirus A | + | 108098 | Human adenovirus B | + | 129951 | Human adenovirus C | + | 130310 | Human adenovirus D | + | 130308 | Human adenovirus E | + | 130309 | Human adenovirus F | + | 536079 | Human adenovirus G | + | 11137 | Human coronavirus 229E (HCoV_229E) | + | 290028 | Human coronavirus HKU1 (HCoV_HKU1) | + 
| 277944 | Human coronavirus NL63 (HCoV_NL63) | + | 31631 | Human coronavirus OC43 (HCoV_OC43) | + | 10359 | Human cytomegalovirus (HCMV) | + | 11676 | Human immunodeficiency virus 1 (HIV-1) | + | 11709 | Human immunodeficiency virus 2 (HIV-2) | + | 162145 | Human metapneumovirus (HMPV) | + | 333760 | Human papillomavirus 16 (HPV16; high-risk) | + | 333761 | Human papillomavirus 18 (HPV18; high-risk) | + | 333762 | Human papillomavirus 26 (HPV26) | + | 12730 | Human parainfluenza virus 1 (HPIV-1) | + | 2560525 | Human parainfluenza virus 2 (HPIV-2) | + | 11216 | Human parainfluenza virus 3 (HPIV-3) | + | 2560526  | Human parainfluenza virus 4 (HPIV-4) | + | 1803956  | Human parechovirus (HPeV) | + | 10798  | Human parvovirus B19 (B19V) | + | 746830 | Human polyomavirus 6 (HPyV6) | + | 746831 | Human polyomavirus 7 (HPyV7) | + | 943908 | Human polyomavirus 9 (HPyV9) | + | 208893 | Human respiratory syncytial virus A (HRSV-A) | + | 114727 | Influenza A virus (H1N1) | + | 114729 | Influenza A virus (H2N2) | + | 119210 | Influenza A virus (H3N2) | + | 102793 | Influenza A virus (H5N1) | + | 333278 | Influenza A virus (H7N9) | + | 102796 | Influenza A virus (H9N2) | + | 11520 | Influenza B virus | + | 11552 | Influenza C virus | + | 35511 | Jamestown Canyon virus (JCV) | + | 11072 | Japanese encephalitis virus (JEV) | + | 10632 | JC polyomavirus (JCPyV) | + | 2169991 | Junin virus (JUNV) | + | 1891764 | KI polyomavirus (KIPyV) | + | 33743 | Kyasanur Forest disease virus (KFDV) | + | 11577 | La Crosse virus (LACV) | + | 38766 | Lagos bat virus (LBV) | + | 3052489 | Laguna Negra virus (LANV) | + | 3052310 | Lassa virus (LASV) | + | 1965344 | LI polyomavirus (LIPyV) | + | 3052148 | Lloviu virus (LLOV) | + | 3052314 | Lujo virus (LUJV) | + | 3052303 | Lymphocytic choriomeningitis virus (LCMV) | + | 3052317 | Machupo virus (MACV) | + | 1239565 | Mamastrovirus 1 (MAstV1) | + | 1239570 | Mamastrovirus 6 (MAstV6) | + | 1239573 | Mamastrovirus 9 (MAstV9) | + | 238817 | Maporal 
virus (MAPV) | + | 3052505 | Marburg virus (MARV) | + | 59301 | Mayaro virus (MAYV) | + | 11234 | Measles virus (MV) | + | 152219 | Menangle virus (MenV) | + | 493803 | Merkel cell polyomavirus (MCPyV) | + | 1335626 | Middle East respiratory syndrome-related coronavirus (MERS-CoV) | + | 1474807 | Mojiang virus (MojV) | + | 12538 | Mokola virus (MOKV) | + | 10244 | Monkeypox virus (MPV) | + | 2560602 | Mumps virus (MuV) | + | 11079 | Murray Valley encephalitis virus (MVEV) | + | 1203539 | MW polyomavirus (MWPyV) | + | 1497391 | New Jersey polyomavirus (NJPyV) | + | 3052225 | Nipah virus (NiV) | + | 142786 | Norovirus | + | 12542 | Omsk hemorrhagic fever virus (OHFV) | + | 2169701 | Onyong-nyong virus (ONNV) | + | 118655 | Oropouche virus (OROV) | + | 138950 | Poliovirus | + | 11083 | Powassan virus (POWV) | + | 11587 | Punta Toro virus (PTV) | + | 3052493 | Puumala virus (PUUV) | + | 11292 | Rabies virus (RABV) | + | 186539 | Reston virus (RESTV) | + | 147711 | Rhinovirus A (RV-A) | + | 147712 | Rhinovirus B (RV-B) | + | 463676 | Rhinovirus C (RV-C) | + | 11588 | Rift Valley fever virus (RVFV) | + | 11029 | Ross River virus (RRV) | + | 28875 | Rotavirus A (RVA) | + | 28876 | Rotavirus B (RVB) | + | 36427 | Rotavirus C (RVC) | + | 1348384 | Rotavirus H (RVH) | + | 11041 | Rubella virus (RuV) | + | 2907957 | Sabia virus (SBAV) | + | 1330524 | Salivirus A (SaV-A) | + | 3052496 | Sangassou virus (SANGV) | + | 95341 | Sapovirus | + | 11033 | Semliki Forest virus (SFV) | + | 3052498 | Seoul virus (SEOV) | + | 2901879 | Severe acute respiratory syndrome coronavirus (SARS-CoV) | + | 2697049 | Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) | + | 1003835 | Severe fever with thrombocytopenia syndrome virus (SFTSV) | + | 1891767 | Simian virus 40 (SV40) | + | 3052499 | Sin nombre virus (SNV) | + | 11034 | Sindbis virus (SINV) | + | 11580 | Snowshoe hare virus (SSHV) | + | 1452514 | Sosuga virus (SoRV) | + | 11080 | St. 
Louis encephalitis virus (SLEV) | + | 1277649 | STL polyomavirus (STLPyV) | + | 186540 | Sudan virus (SUDV) | + | 1608084 | Tacheng tick virus 2 (TcTV-2) | + | 45270 | Tahyna virus (TAHV) | + | 186541 | Tai Forest virus (TAFV) | + | 11084 | Tick-borne encephalitis virus (TBEV) | + | 68887 | Torque teno virus (TTV) | + | 862909 | Trichodysplasia spinulosa-associated polyomavirus (TSPyV) | + | 3052503 | Tula virus (TULV) | + | 64286 | Usutu virus (USUV) | + | 10255 | Variola virus (VARV) | + | 11036 | Venezuelan equine encephalitis virus (VEEV) | + | 11082 | West Nile virus (WNV) | + | 11039 | Western equine encephalitis virus (WEEV) | + | 440266 | WU polyomavirus (WUPyV) | + | 11089 | Yellow fever virus (YFV) | + | 186538 | Zaire ebolavirus (EBOV) | + | 64320 | Zika virus (ZIKV) | + +
+ +!!! tip "Make your own list of taxon IDs" + You may want to make your own list of taxon IDs if you know your sample is likely to contain a specific organism or group of organisms. You can find taxon IDs in the [NCBI Taxonomy Browser](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi). + + In Terra, you provide your created list of taxon IDs as an array of integers for the `taxon_ids` optional input variable, like this: `[1, 2, 3, 4, 5]`. Just replace the numbers in this example with the taxon IDs you want to use. + +### Inputs + +
+ +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| theiameta_panel_illumina_pe | **read1** | File | The forward Illumina read in FASTQ file format (compression optional) | | Required | +| theiameta_panel_illumina_pe | **read2** | File | The reverse Illumina read in FASTQ file format (compression optional) | | Required | +| theiameta_panel_illumina_pe | **samplename** | String | The name of the sample being analyzed | | Required | +| fastq_scan_binned | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| fastq_scan_binned | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| fastq_scan_binned | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | +| fastq_scan_binned | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| gather_scatter | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| gather_scatter | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| gather_scatter | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| gather_scatter | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| kraken2 | **classified_out** | String | Allows user to rename the classified FASTQ files output. Must include .fastq as the suffix | classified#.fastq | Optional | +| kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| kraken2 | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. 
Increase this when using large (>30GB) kraken2 databases, such as the "k2_standard" database | 100 | Optional | +| kraken2 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | +| kraken2 | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | +| kraken2 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| kraken2 | **unclassified_out** | String | Allows user to rename unclassified FASTQ files output. Must include .fastq as the suffix | unclassified#.fastq | Optional | +| krakentools | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| krakentools | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| krakentools | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/krakentools:d4a2fbe | Optional | +| krakentools | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| metaspades | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| metaspades | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| metaspades | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/metaspades:3.15.3 | Optional | +| metaspades | **kmers** | String | The k-mer list to use; if not provided, the value is automatically set | | Optional | +| metaspades | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| metaspades | **metaspades_opts** | String | Additional arguments to pass on to the metaspades command | | Optional | +| metaspades | **phred_offset** | Int | The PHRED quality offset of the input reads; can be either 33 or 64 | 33 | Optional | +| minimap2_assembly_correction | **cpu** | Int | 
Number of CPUs to allocate to the task | 2 | Optional | +| minimap2_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| minimap2_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22 | Optional | +| minimap2_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| morgana_magic | **abricate_flu_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **abricate_flu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **abricate_flu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-insaflu-220727 | Optional | +| morgana_magic | **abricate_flu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **abricate_flu_mincov** | Int | Minimum DNA % coverage | 60 | Optional | +| morgana_magic | **abricate_flu_minid** | Int | Minimum DNA % identity | 70 | Optional | +| morgana_magic | **assembly_metrics_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **assembly_metrics_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **assembly_metrics_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | +| morgana_magic | **assembly_metrics_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| morgana_magic | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| morgana_magic | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **docker** | String | The Docker container to use for the task | 
ngolin | Optional | +| morgana_magic | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| morgana_magic | **genoflu_cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| morgana_magic | **genoflu_cross_reference** | File | An Excel file to cross-reference BLAST findings; probably useful if novel genotypes are not in the default file used by genoflu.py | | Optional | +| morgana_magic | **genoflu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 25 | Optional | +| morgana_magic | **genoflu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/genoflu:1.03 | Optional | +| morgana_magic | **genoflu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| morgana_magic | **irma_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| morgana_magic | **irma_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **irma_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/cdcgov/irma:v1.1.5 | Optional | +| morgana_magic | **irma_keep_ref_deletions** | Boolean | True/False variable that determines if sites missed during read gathering should be deleted by ambiguation. 
| TRUE | Optional | +| morgana_magic | **irma_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| morgana_magic | **nextclade_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **nextclade_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| morgana_magic | **nextclade_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | +| morgana_magic | **nextclade_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **nextclade_output_parser_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **nextclade_output_parser_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| morgana_magic | **nextclade_output_parser_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/python/python:3.8.18-slim | Optional | +| morgana_magic | **nextclade_output_parser_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **pangolin_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| morgana_magic | **pangolin_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 | Optional | +| morgana_magic | **pangolin_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| pilon | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| pilon | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pilon | **docker** | String | The Docker container to use for the task | 
us-docker.pkg.dev/general-theiagen/staphb/pilon:1.24 | Optional | +| pilon | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| quast | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| quast | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| quast | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/quast:5.0.2 | Optional | +| quast | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim | **adapters** | File | A file containing the sequence of the adapters used during library preparation, used in the BBDuk task | | Optional | +| read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim | **call_kraken** | Boolean | Set to true to launch Kraken2; if true, you must provide a kraken_db | FALSE | Optional | +| read_QC_trim | **call_midas** | Boolean | Set to true to launch Midas | TRUE | Optional | +| read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | +| read_QC_trim | **midas_db** | File | Midas database file | gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | +| read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | +| read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | Optional | +| read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. 
| fastq_scan | Optional | +| read_QC_trim | **trim_min_length** | Int | The minimum length of each read after trimming | 75 | Optional | +| read_QC_trim | **trim_primers** | Boolean | A True/False option that determines if primers should be trimmed. | TRUE | Optional | +| read_QC_trim | **trim_quality_min_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | +| read_QC_trim | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 4 | Optional | +| read_QC_trim | **trimmomatic_args** | String | Additional arguments to pass to trimmomatic | -phred33 | Optional | +| sort_bam_assembly_correction | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sort_bam_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sort_bam_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| sort_bam_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| theiameta_panel_illumina_pe | **kraken2_db** | File | A Kraken2 database in .tar.gz format | gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz | Optional | +| theiameta_panel_illumina_pe | **minimum_read_number** | Int | The minimum number of reads in order to attempt assembly on a bin of reads | 1000 | Optional | +| theiameta_panel_illumina_pe | **taxon_ids** | Array[Int] | The taxon IDs to be used for taxonomic binning. 
By default, this array uses the taxon IDs listed above that are intended for the Illumina VSP panel | Illumina VSP panel (see above toggle) | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | + +
+ +### Workflow Tasks + +#### Read QC and Cleaning + +??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" + `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + **Read quality trimming** + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_min_score`. They will both discard the read if it is trimmed below `trim_min_length`. + + By default, `trim_min_length` is set to 75 bp. This is likely _too high_ for data generated using the Illumina VSP panel. We recommend setting this parameter to `50` in this case. + + If fastp is selected for analysis, fastp also implements the additional read-trimming parameters indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + **Adapter removal** + + The `BBDuk` task removes adapters from sequence reads. To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. 
+ - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. + + **Read Quantification** + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In paired-end workflows, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality in an HTML file. + + **Read Identification (optional)** + + The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. 
+ + The MIDAS reference database, located at **`gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz`**, is provided as the default. It is possible to provide a custom database. More information is available [here](https://github.com/snayfach/MIDAS/blob/master/docs/ref_db.md). + + ??? toggle "How are the MIDAS output columns determined?" + + Example MIDAS report in the `midas_report` column: + + | species_id | count_reads | coverage | relative_abundance | + | --- | --- | --- | --- | + | Salmonella_enterica_58156 | 3309 | 89.88006645 | 0.855888033 | + | Salmonella_enterica_58266 | 501 | 11.60606061 | 0.110519371 | + | Salmonella_enterica_53987 | 99 | 2.232896237 | 0.021262881 | + | Citrobacter_youngae_61659 | 46 | 0.995216227 | 0.009477003 | + | Escherichia_coli_58110 | 5 | 0.123668877 | 0.001177644 | + + MIDAS report column descriptions: + + - species_id: species identifier + - count_reads: number of reads mapped to marker genes + - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome + - relative_abundance: estimated relative abundance of species in metagenome + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS) | + | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/) | + +#### Taxonomic Classification and Read Binning + +??? task "`kraken2`: Taxonomic Classification" + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + + Kraken2 is run on the clean reads that result from the `read_QC_trim` subworkflow. By default, the Kraken2 database is set to the `k2_viral_20240112` database, located at `"gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz"`. + + !!! info "Database-dependent" + The Kraken2 software is database-dependent and **taxonomic assignments are highly sensitive to the database used**. An appropriate database should contain the expected organism(s) (e.g. _Escherichia coli_) and other taxa that may be present in the reads (e.g. _Citrobacter freundii_, a common contaminant). + + !!! techdetails "Kraken2 Technical Details" + | | Links | + | --- | --- | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`extract_kraken_reads` from KrakenTools: Read Binning" + KrakenTools is a collection of scripts that can be used to help downstream analysis of Kraken2 results. In particular, this task uses the `extract_kraken_reads` script, which extracts reads classified at any user-specified taxonomy IDs. 
All parent and children reads of the specified taxonomic ID are also extracted. + + !!! techdetails "KrakenTools Technical Details" + | | Links | + | --- | --- | + | Task | [task_krakentools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_krakentools.wdl) | + | Software Source Code | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | + | Software Documentation | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | + | Original Publication(s) | [Metagenome analysis using the Kraken software suite](https://doi.org/10.1038/s41596-022-00738-y) | + +??? task "`fastq_scan`: Summarizing Read Bins" + `fastq_scan` is used to summarize the read bins generated by the `extract_kraken_reads` task. It provides basic statistics about the read bins, such as the number of forward and reverse reads in each bin and the number of read pairs. + + !!! techdetails "fastq_scan Technical Details" + | | Links | + | --- | --- | + | Task | [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) | + | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan) | + | Software Documentation | [fastq-scan](https://github.com/rpetit3/fastq-scan) | + +#### Assembly and Polishing + +??? task "`metaspades`: _De Novo_ Metagenomic Assembly" + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. 
+ + `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. + + !!! techdetails "MetaSPAdes Technical Details" + | | Links | + | --- | --- | + | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | + | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | + | Software Documentation | [SPAdes Manual](https://ablab.github.io/spades/index.html) | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | + +??? task "`minimap2`: Assembly Alignment and Contig Filtering" + `minimap2` is a popular aligner that is used in TheiaMeta_Panel for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly. + + The default mode used in this task is `sr` which is intended for "short single-end reads without splicing". In minimap2, "modes" are a group of preset options; the `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. + + For more information, please see the [minimap2 manpage](https://lh3.github.io/minimap2/minimap2.html) + + !!! 
techdetails "minimap2 Technical Details" + | | Links | + |---|---| + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | [minimap2 on GitHub](https://github.com/lh3/minimap2) | + | Software Documentation | [minimap2](https://lh3.github.io/minimap2) | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) | + +??? task "`samtools`: SAM File Conversion" + This task converts the output SAM file from minimap2 to a BAM file. It then sorts the BAM based on the read names, and then generates an index file. + + !!! techdetails "samtools Technical Details" + | | Links | + |---|---| + | Task | [task_parse_mapping.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_parse_mapping.wdl) | + | Software Source Code | [samtools on GitHub](https://github.com/samtools/samtools) | + | Software Documentation | [samtools](https://www.htslib.org/doc/samtools.html) | + | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | + +??? task "`pilon`: Assembly Polishing" + `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. + + !!! techdetails "pilon Technical Details" + | | Links | + |---|---| + | Task | [task_pilon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_pilon.wdl) | + | Software Source Code | [Pilon on GitHub](https://github.com/broadinstitute/pilon) | + | Software Documentation | [Pilon Wiki](https://github.com/broadinstitute/pilon/wiki) | + | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | + +??? task "`quast`: Assembly Quality Assessment" + QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. + + !!! techdetails "QUAST Technical Details" + | | Links | + | --- | --- | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | + | Software Documentation | | + | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | + +#### Morgana Magic + +??? task "`morgana_magic`: Genomic Characterization" + Morgana Magic is the viral equivalent of the `merlin_magic` subworkflow used in the TheiaProk workflows. 
This workflow launches several tasks to characterize the viral genome, including Pangolin4, Nextclade, and others. + + This subworkflow currently only supports the organisms that are natively supported by the [TheiaCoV workflows](./theiacov.md). + + The following tasks only run for the appropriate taxon ID if sufficient reads were extracted. The following table illustrates which characterization tools are run for the indicated organism. + + | | SARS-CoV-2 | MPXV | WNV | Influenza | RSV-A | RSV-B | + | --- | --- | --- | --- | --- | --- | --- | + | Pangolin | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | + | Nextclade | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | + | IRMA | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | Abricate | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | GenoFLU | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + + ??? task "`pangolin`" + Pangolin designates SARS-CoV-2 lineage assignments. + + !!! techdetails "Pangolin Technical Details" + + | | Links | + | --- | --- | + | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | + | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | + | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + | Original Publication(s) | [A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology](https://doi.org/10.1038/s41564-020-0770-5) | + + ??? task "`nextclade`" + ["Nextclade is an open-source project for viral genome alignment, mutation calling, clade assignment, quality checks and phylogenetic placement."](https://docs.nextstrain.org/projects/nextclade/en/stable/) + + !!! 
techdetails "Nextclade Technical Details" + + | | Links | + | --- | --- | + | Task | [task_nextclade.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_nextclade.wdl#L63) | + | Software Source Code | | + | Software Documentation | [Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/) | + | Original Publication(s) | [Nextclade: clade assignment, mutation calling and quality control for viral genomes.](https://doi.org/10.21105/joss.03773) | + + ??? task "`irma`" + Cleaned reads are re-assembled using `irma` which does not use a reference due to the rapid evolution and high variability of influenza. Assemblies produced by `irma` will be ordered from largest to smallest assembled flu segment. `irma` also performs typing and subtyping as part of the assembly process. + + General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)). + + !!! techdetails "IRMA Technical Details" + | | Links | + | --- | --- | + | Task | [task_irma.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_irma.wdl) | + | Software Documentation | [IRMA website](https://wonder.cdc.gov/amd/flu/irma/) | + | Original Publication(s) | [Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) | + + ??? task "`abricate`" + Abricate assigns types and subtype/lineages for flu samples. + + !!! 
techdetails "Abricate Technical Details" + | | Links | + | --- | --- | + | Task | [task_abricate.wdl (abricate_flu subtask)](https://github.com/theiagen/public_health_bioinformatics/blob/2dff853defc6ea540a058873f6fe6a78cc2350c7/tasks/gene_typing/drug_resistance/task_abricate.wdl#L59) | + | Software Source Code | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + | Software Documentation | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + + ??? task "`genoflu`" + This sub-workflow determines the whole-genome genotype of an H5N1 flu sample. + + !!! techdetails "GenoFLU Technical Details" + | | Links | + | --- | --- | + | Task | [task_genoflu.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/orthomyxoviridae/task_genoflu.wdl) | + | Software Source Code | [GenoFLU on GitHub](https://github.com/USDA-VS/GenoFLU) | + +??? task "`gather_scatter`: Generate Summary File" + The `gather_scatter` task generates a summary file with all the results for all taxon IDs with identified reads. Please see the [`results_by_taxon_tsv`](#results_by_taxon_tsv) section below for more information. + + !!! techdetails "gather_scatter Technical Details" + | | Links | + | --- | --- | + | Task | [task_gather_scatter.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/data_handling/task_gather_scatter.wdl) | + +### Outputs + +
+ +| **Variable** | **Type** | **Description** | +|---|---|---| +| identified_organisms | Array[String] | A list of organisms that were able to be identified in the sample with the specified Kraken2 database | +| kraken2_classified_report | File | Standard Kraken2 output report. TXT filetype, but can be opened in Excel as a TSV file | +| kraken2_database | String | The name of the database used to run Kraken2 | +| kraken2_docker | String | Docker image used to run kraken2 | +| kraken2_report | File | Text document describing taxonomic prediction of every FASTQ record. This file can be very large and cumbersome to open and view | +| kraken2_version | String | The version of Kraken2 used in the analysis | +| results_by_taxon_tsv | File | A TSV file that contains the results for every taxon ID provided in the taxon_ids input variable that had reads identified; characterization (if applicable) and basic statistics regarding read count, assembly generation (if applicable), and general quality, are also associated with each bin; see below for more details. | +| theiameta_panel_illumina_pe_analysis_date | String | Date the workflow was run | +| theiameta_panel_illumina_pe_version | String | Version of PHB used to run the workflow | + +
+ +#### The `results_by_taxon_tsv` Output File {#results_by_taxon_tsv} + +This TSV file contains a summary of all of the taxon IDs provided in the `taxon_ids` input variable that had reads identified, with each row representing a taxon ID. + +Depending on whether reads could be extracted for the taxon ID, the `organism` column will contain the name of the organism. This column will be blank if no reads were able to be extracted for the taxon ID in the sample. + +??? toggle "What columns are included?" + The following columns are included in the `results_by_taxon_tsv` file: + + - `taxon_id`: The taxon ID used for the binning, generated for all taxon IDs provided in the `taxon_ids` input variable + - `organism`: The name of the organism associated with the taxon ID if reads were able to be extracted; the following columns are blank if no reads were able to be extracted for the taxon ID in the sample + - `extracted_read1`: The GSURI of the extracted read1 FASTQ file + - `extracted_read2`: The GSURI of the extracted read2 FASTQ file + - `krakentools_docker`: The Docker image used to run KrakenTools' `extract_kraken_reads` + - `fastq_scan_num_reads_binned1`: The number of reads in the extracted read1 FASTQ file + - `fastq_scan_num_reads_binned2`: The number of reads in the extracted read2 FASTQ file + - `fastq_scan_num_reads_binned_pairs`: The number of read pairs in the extracted read1 and read2 FASTQ files + - `fastq_scan_docker`: The Docker image used to run the `fastq_scan` task + - `fastq_scan_version`: The version of the `fastq_scan` tool used in the analysis + - `metaspades_warning`: A warning message if an empty assembly was produced for the taxon ID; blank if assembly was successful + - `pilon_warning`: A warning message if Pilon failed, blank if assembly polishing was successful + - `assembly_fasta`: A GSURI to the assembly FASTA file + - `quast_genome_length`: The length of the assembly + - `quast_number_contigs`: The number of contigs in the assembly + - 
`quast_n50`: The N50 value of the assembly + - `quast_gc_percent`: The GC content of the assembly + - `number_N`: The number of Ns in the assembly + - `number_ATCG`: The number of ATCGs in the assembly + - `number_Degenerate`: The number of degenerate bases in the assembly + - `number_Total`: The total number of bases in the assembly + - `percent_reference_coverage`: The percent of the reference genome covered by the assembly; only applicable if the taxon ID is already supported by TheiaCoV (additional assembly files may be added in the future) + + Any subsequent columns are specific to the identified organism and taxon ID; typically, values for these columns are only produced if the organism is natively supported by the TheiaCoV workflows. + + ??? toggle "SARS-CoV-2: _Pangolin_" + - `pango_lineage`: The Pango lineage of the assembly + - `pango_lineage_expanded`: The Pango lineage of the assembly without aliases + - `pangolin_conflicts`: The number of conflicts in the Pango lineage + - `pangolin_notes`: Any notes generated by Pangolin about the lineage + - `pangolin_assignment_version`: The version of the assignment module used to assign the Pango lineage + - `pangolin_version`: The version of Pangolin used to generate the Pango lineage + - `pangolin_docker`: The Docker image used to run Pangolin + + ??? toggle "Mpox, SARS-CoV-2, RSV-A, RSV-B: _Nextclade_" + - `nextclade_version`: The version of Nextclade used + - `nextclade_docker`: The Docker image used to run Nextclade + - `nextclade_ds_tag`: The dataset tag used to run Nextclade + - `nextclade_aa_subs`: Amino-acid substitutions as detected by Nextclade + - `nextclade_aa_dels`: Amino-acid deletions as detected by Nextclade + - `nextclade_clade`: Nextclade clade designation + - `nextclade_lineage`: Nextclade lineage designation + - `nextclade_qc`: QC metric as determined by Nextclade + + ??? 
toggle "Flu: _Nextclade_, _IRMA_, _GenoFLU_, _ABRicate_" + - `nextclade_version`: The version of Nextclade used + - `nextclade_docker`: The Docker image used to run Nextclade + - `nextclade_ds_tag_flu_ha`: The dataset tag used to run Nextclade for the HA segment + - `nextclade_aa_subs_flu_ha`: Amino-acid substitutions as detected by Nextclade for the HA segment + - `nextclade_aa_dels_flu_ha`: Amino-acid deletions as detected by Nextclade for the HA segment + - `nextclade_clade_flu_ha`: Nextclade clade designation for the HA segment + - `nextclade_lineage_flu_ha`: Nextclade lineage designation for the HA segment + - `nextclade_qc_flu_ha`: QC metric as determined by Nextclade for the HA segment + - `nextclade_ds_tag_flu_na`: The dataset tag used to run Nextclade for the NA segment + - `nextclade_aa_subs_na`: Amino-acid substitutions as detected by Nextclade for the NA segment + - `nextclade_aa_dels_na`: Amino-acid deletions as detected by Nextclade for the NA segment + - `nextclade_clade_flu_na`: Nextclade clade designation for the NA segment + - `nextclade_lineage_flu_na`: Nextclade lineage designation for the NA segment + - `nextclade_qc_flu_na`: QC metric as determined by Nextclade for the NA segment + - `irma_version`: The version of IRMA used + - `irma_docker`: The Docker image used to run IRMA + - `irma_type`: The flu type identified by IRMA + - `irma_subtype`: The flu subtype identified by IRMA + - `irma_subtype_notes`: Any notes generated by IRMA about the subtype + - `genoflu_version`: The version of GenoFLU used + - `genoflu_genotype`: The complete genotype of the flu sample + - `genoflu_all_segments`: The genotype of each flu segment in the sample + - `abricate_flu_type`: The flu type identified by ABRicate + - `abricate_flu_subtype`: The flu subtype identified by ABRicate + - `abricate_flu_database`: The flu database used by ABRicate + - `abricate_flu_version`: The version of ABRicate used + +This file can be downloaded and opened in Excel to view the 
full result summary for the sample. Due to the nature of the TheiaMeta_Panel workflow and Terra, displaying this information in the Terra table would be challenging to view, which is why we have generated this file. If you have any suggestions on formatting or additional outputs, please let us know at or by submitting an issue. + +## References + +> **Trimmomatic**: Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PMID: 24695404; PMCID: PMC4103590. + +> **fastp**: Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018 Sep 1;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560. PMID: 30423086; PMCID: PMC6129281. + +> **MIDAS**: Nayfach S, Rodriguez-Mueller B, Garud N, Pollard KS. An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography. Genome Res. 2016 Nov;26(11):1612-1625. doi: 10.1101/gr.201863.115. Epub 2016 Oct 18. PMID: 27803195; PMCID: PMC5088602. + +> **Kraken2**: Wood DE, Lu J, Langmead B. Improved metagenomic analysis with Kraken 2. Genome Biol. 2019 Nov 28;20(1):257. doi: 10.1186/s13059-019-1891-0. PMID: 31779668; PMCID: PMC6883579. + +> **KrakenTools**: Lu J, Rincon N, Wood DE, Breitwieser FP, Pockrandt C, Langmead B, Salzberg SL, Steinegger M. Metagenome analysis using the Kraken software suite. Nat Protoc. 2022 Dec;17(12):2815-2839. doi: 10.1038/s41596-022-00738-y. Epub 2022 Sep 28. Erratum in: Nat Protoc. 2024 Aug 29. doi: 10.1038/s41596-024-01064-1. PMID: 36171387; PMCID: PMC9725748. + +> **metaSPAdes**: Nurk S, Meleshko D, Korobeynikov A, Pevzner PA. metaSPAdes: a new versatile metagenomic assembler. Genome Res. 2017 May;27(5):824-834. doi: 10.1101/gr.213959.116. Epub 2017 Mar 15. PMID: 28298430; PMCID: PMC5411777. + +> **minimap2**: Li H. Minimap2: pairwise alignment for nucleotide sequences. 
Bioinformatics. 2018 Sep 15;34(18):3094-3100. doi: 10.1093/bioinformatics/bty191. PMID: 29750242; PMCID: PMC6137996. + +> **SAMtools**: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PMID: 19505943; PMCID: PMC2723002. + +> **SAMtools**: Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. + +> **Pilon**: Walker BJ, Abeel T, Shea T, Priest M, Abouelliel A, Sakthikumar S, Cuomo CA, Zeng Q, Wortman J, Young SK, Earl AM. Pilon: an integrated tool for comprehensive microbial variant detection and genome assembly improvement. PLoS One. 2014 Nov 19;9(11):e112963. doi: 10.1371/journal.pone.0112963. PMID: 25409509; PMCID: PMC4237348. + +> **QUAST**: Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. 2013 Apr 15;29(8):1072-5. doi: 10.1093/bioinformatics/btt086. Epub 2013 Feb 19. PMID: 23422339; PMCID: PMC3624806. + +> **Pangolin**: Rambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. + +> **Nextclade**: Aksamentov et al., (2021). Nextclade: clade assignment, mutation calling and quality control for viral genomes. Journal of Open Source Software, 6(67), 3773, https://doi.org/10.21105/joss.03773 + +> **IRMA**: Shepard SS, Meno S, Bahl J, Wilson MM, Barnes J, Neuhaus E. Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. 
BMC Genomics. 2016 Sep 5;17(1):708. doi: 10.1186/s12864-016-3030-6. Erratum in: BMC Genomics. 2016 Oct 13;17(1):801. doi: 10.1186/s12864-016-3138-8. PMID: 27595578; PMCID: PMC5011931. + diff --git a/docs/workflows_overview/workflows_alphabetically.md b/docs/workflows_overview/workflows_alphabetically.md index c937e815b..5501459f4 100644 --- a/docs/workflows_overview/workflows_alphabetically.md +++ b/docs/workflows_overview/workflows_alphabetically.md @@ -43,6 +43,7 @@ title: Alphabetical Workflows | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | | [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | 
[TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | | [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_Illumina_PE_PHB:main?tab=info) | | [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. 
| Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_kingdom.md b/docs/workflows_overview/workflows_kingdom.md index d10fa2afd..85b0b4714 100644 --- a/docs/workflows_overview/workflows_kingdom.md +++ b/docs/workflows_overview/workflows_kingdom.md @@ -88,6 +88,7 @@ title: Workflows by Kingdom | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | | [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), 
[TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_Illumina_PE_PHB:main?tab=info) | | [**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | | [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_type.md b/docs/workflows_overview/workflows_type.md index 14f23fd92..5f8c20bd9 100644 --- a/docs/workflows_overview/workflows_type.md +++ b/docs/workflows_overview/workflows_type.md @@ -30,6 +30,7 @@ title: Workflows by 
Type | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | | [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | | [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing 
using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_Illumina_PE_PHB:main?tab=info) | | [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | | [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | diff --git a/mkdocs.yml b/mkdocs.yml index 613f81b15..0bb95cfcf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,7 @@ nav: - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md - TheiaEuk Workflow Series: workflows/genomic_characterization/theiaeuk.md - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md - VADR_Update: workflows/genomic_characterization/vadr_update.md - Phylogenetic Construction: @@ -115,6 +116,7 @@ nav: - Terra_2_GISAID: 
workflows/public_data_sharing/terra_2_gisaid.md - Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - Usher_PHB: workflows/phylogenetic_placement/usher.md - VADR_Update: workflows/genomic_characterization/vadr_update.md - Workflows Alphabetically: @@ -152,6 +154,7 @@ nav: - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md - TheiaEuk Workflow Series: workflows/genomic_characterization/theiaeuk.md - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md - TheiaValidate: workflows/standalone/theiavalidate.md - Transfer_Column_Content: workflows/data_export/transfer_column_content.md diff --git a/tasks/assembly/task_metaspades.wdl b/tasks/assembly/task_metaspades.wdl index a6473ae14..4982e3521 100644 --- a/tasks/assembly/task_metaspades.wdl +++ b/tasks/assembly/task_metaspades.wdl @@ -15,7 +15,9 @@ task metaspades_pe { } command <<< metaspades.py --version | head -1 | cut -d ' ' -f 2 | tee VERSION - metaspades.py \ + touch WARNING + + if metaspades.py \ -1 ~{read1_cleaned} \ -2 ~{read2_cleaned} \ ~{'-k ' + kmers} \ @@ -23,15 +25,25 @@ task metaspades_pe { -t ~{cpu} \ -o metaspades \ --phred-offset ~{phred_offset} \ - ~{metaspades_opts} + ~{metaspades_opts}; then + + mv metaspades/contigs.fasta ~{samplename}_contigs.fasta - mv metaspades/contigs.fasta ~{samplename}_contigs.fasta + if [ ! -s ~{samplename}_contigs.fasta ]; then + echo "Metaspades produced an empty assembly for ~{samplename}" > WARNING + rm -f ~{samplename}_contigs.fasta + fi + + else + echo "Metaspades failed to assemble for ~{samplename}" > WARNING + fi >>> output { - File assembly_fasta = "~{samplename}_contigs.fasta" + File? 
assembly_fasta = "~{samplename}_contigs.fasta" String metaspades_version = read_string("VERSION") String metaspades_docker = '~{docker}' + String metaspades_warning = read_string("WARNING") } runtime { docker: "~{docker}" diff --git a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl index e2f4a2d4d..7d614e57c 100644 --- a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl +++ b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl @@ -4,13 +4,14 @@ task fastq_scan_pe { input { File read1 File read2 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") - Int disk_size = 50 + + Int disk_size = 100 String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-scan:1.0.1--h4ac6f70_3" Int memory = 2 Int cpu = 1 } + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") command <<< # exit task in case anything fails in one-liners or variables are unset set -euo pipefail @@ -77,12 +78,13 @@ task fastq_scan_pe { task fastq_scan_se { input { File read1 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - Int disk_size = 50 + + Int disk_size = 100 Int memory = 2 Int cpu = 1 String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/fastq-scan:1.0.1--h4ac6f70_3" } + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") command <<< # exit task in case anything fails in one-liners or variables are unset set -euo pipefail diff --git a/tasks/quality_control/read_filtering/task_pilon.wdl b/tasks/quality_control/read_filtering/task_pilon.wdl index 2e869832d..b68f64612 100644 --- a/tasks/quality_control/read_filtering/task_pilon.wdl +++ b/tasks/quality_control/read_filtering/task_pilon.wdl @@ -16,20 +16,26 @@ task 
pilon { pilon --version | cut -d' ' -f3 | tee VERSION # run pilon - pilon \ + if pilon \ --genome ~{assembly} \ --frags ~{bam} \ --output ~{samplename} \ --outdir pilon \ - --changes --vcf + --changes --vcf; then + touch WARNING + else + echo "Pilon failed to run for ~{samplename}" > WARNING + exit 1 + fi >>> output { - File assembly_fasta = "pilon/~{samplename}.fasta" - File changes = "pilon/~{samplename}.changes" - File vcf = "pilon/~{samplename}.vcf" + File? assembly_fasta = "pilon/~{samplename}.fasta" + File? changes = "pilon/~{samplename}.changes" + File? vcf = "pilon/~{samplename}.vcf" String pilon_version = read_string("VERSION") String pilon_docker = "~{docker}" + String pilon_warning = read_string("WARNING") } runtime { docker: "~{docker}" diff --git a/tasks/species_typing/betacoronavirus/task_pangolin.wdl b/tasks/species_typing/betacoronavirus/task_pangolin.wdl index 848a2fdc5..fa2fc9868 100644 --- a/tasks/species_typing/betacoronavirus/task_pangolin.wdl +++ b/tasks/species_typing/betacoronavirus/task_pangolin.wdl @@ -8,9 +8,9 @@ task pangolin4 { Float max_ambig = 0.5 String docker String? analysis_mode - Boolean expanded_lineage=true - Boolean skip_scorpio=false - Boolean skip_designation_cache=false + Boolean expanded_lineage = true + Boolean skip_scorpio = false + Boolean skip_designation_cache = false String? pangolin_arguments Int disk_size = 100 Int memory = 8 diff --git a/tasks/taxon_id/contamination/task_kraken2.wdl b/tasks/taxon_id/contamination/task_kraken2.wdl index 4a43106f6..ad54432c7 100644 --- a/tasks/taxon_id/contamination/task_kraken2.wdl +++ b/tasks/taxon_id/contamination/task_kraken2.wdl @@ -168,8 +168,8 @@ task kraken2_standalone { File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz" File? kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz" File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz" - Float kraken2_percent_human = read_float("PERCENT_HUMAN") File? 
kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz" + Float kraken2_percent_human = read_float("PERCENT_HUMAN") String kraken2_database = kraken2_db } runtime { diff --git a/tasks/taxon_id/task_krakentools.wdl b/tasks/taxon_id/task_krakentools.wdl new file mode 100644 index 000000000..4788bdea6 --- /dev/null +++ b/tasks/taxon_id/task_krakentools.wdl @@ -0,0 +1,61 @@ +version 1.0 + +task extract_kraken_reads { + input { + File kraken2_output + File kraken2_report + File read1 + File read2 + Int taxon_id + + Int cpu = 1 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/krakentools:d4a2fbe" + Int memory = 4 + } + command <<< + gunzip -c ~{kraken2_output} > kraken2_output_unzipped.txt + + python3 /KrakenTools/extract_kraken_reads.py \ + -k kraken2_output_unzipped.txt \ + -s1 ~{read1} \ + -s2 ~{read2} \ + --taxid ~{taxon_id} \ + --report ~{kraken2_report} \ + --include-parents \ + --include-children \ + --fastq-output \ + --output ~{taxon_id}_1.fastq \ + --output2 ~{taxon_id}_2.fastq + + if [ -s ~{taxon_id}_1.fastq ]; then + echo "DEBUG: Taxon ~{taxon_id} reads extracted" + echo "true" > CONTINUE + + gzip ~{taxon_id}_1.fastq + gzip ~{taxon_id}_2.fastq + else + echo "DEBUG: No reads were extracted for taxon ~{taxon_id}, removing empty files" + echo "false" > CONTINUE + fi + + awk -v id=~{taxon_id} '$5 == id {for (i=6; i <= NF; ++i) print $i}' ~{kraken2_report} | tr '\n' ' ' | xargs > ORGANISM_NAME + + >>> + output { + File? extracted_read1 = "~{taxon_id}_1.fastq.gz" + File? 
extracted_read2 = "~{taxon_id}_2.fastq.gz" + String organism_name = read_string("ORGANISM_NAME") + String krakentools_docker = docker + Boolean success = read_boolean("CONTINUE") + } + runtime { + cpu: cpu + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " SSD" + docker: docker + memory: "~{memory} GB" + preemptible: 1 + maxRetries: 3 + } +} \ No newline at end of file diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl new file mode 100644 index 000000000..09523e4f3 --- /dev/null +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -0,0 +1,189 @@ +version 1.0 + +task gather_scatter { + input { + String samplename + File? taxon_ids + # krakentools outputs + File? organism + File? extracted_read1 + File? extracted_read2 + File? krakentools_docker + # fastq_scan outputs + File? fastq_scan_num_reads_binned1 + File? fastq_scan_num_reads_binned2 + File? fastq_scan_num_reads_binned_pairs + File? fastq_scan_docker + File? fastq_scan_version + # Assembly + File? metaspades_warning + File? pilon_warning + File? assembly_fasta + # quast outputs + File? quast_genome_length + File? quast_number_contigs + File? quast_n50 + File? quast_gc_percent + # consensus qc outputs + File? number_N + File? number_ATCG + File? number_Degenerate + File? number_Total + File? percent_reference_coverage + # pangolin outputs + File? pango_lineage + File? pango_lineage_expanded + File? pangolin_conflicts + File? pangolin_notes + File? pangolin_assignment_version + File? pangolin_versions + File? pangolin_docker + # Nextclade outputs for non-flu + File? nextclade_version + File? nextclade_docker + File? nextclade_ds_tag + File? nextclade_aa_subs + File? nextclade_aa_dels + File? nextclade_clade + File? nextclade_lineage + File? nextclade_qc + # Nextclade outputs for flu HA + File? nextclade_ds_tag_flu_ha + File? nextclade_aa_subs_flu_ha + File? nextclade_aa_dels_flu_ha + File? 
nextclade_clade_flu_ha + File? nextclade_qc_flu_ha + # Nextclade outputs for flu NA + File? nextclade_ds_tag_flu_na + File? nextclade_aa_subs_flu_na + File? nextclade_aa_dels_flu_na + File? nextclade_clade_flu_na + File? nextclade_qc_flu_na + # IRMA outputs + File? irma_version + File? irma_docker + File? irma_type + File? irma_subtype + File? irma_subtype_notes + # GenoFLU outputs + File? genoflu_version + File? genoflu_genotype + File? genoflu_all_segments + # abricate outputs + File? abricate_flu_type + File? abricate_flu_subtype + File? abricate_flu_database + File? abricate_flu_version + + # runtime parameters + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" + Int disk_size = 50 + Int cpu = 2 + Int memory = 8 + } + command <<< + python3<>> + output { + File gathered_results = "~{samplename}.results.tsv" + Array[String] organism_names = read_lines("~{samplename}.organism_names.tsv") + } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 0 + preemptible: 0 + } +} \ No newline at end of file diff --git a/workflows/theiameta/wf_theiameta_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_illumina_pe.wdl index 2a6a23488..7cf9d0fd0 100644 --- a/workflows/theiameta/wf_theiameta_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_illumina_pe.wdl @@ -71,110 +71,114 @@ workflow theiameta_illumina_pe { read2_cleaned = read_QC_trim.read2_clean, samplename = samplename } - call minimap2_task.minimap2 as minimap2_assembly_correction { - input: - query1 = read_QC_trim.read1_clean, - query2 = read_QC_trim.read2_clean, - reference = metaspades_pe.assembly_fasta, - samplename = samplename, - mode = "sr", - output_sam = true - } - call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { - input: - sam = minimap2_assembly_correction.minimap2_out, - samplename = samplename - } - call pilon_task.pilon { - input: - 
assembly = metaspades_pe.assembly_fasta, - bam = sort_bam_assembly_correction.bam, - bai = sort_bam_assembly_correction.bai, - samplename = samplename - } - # if reference is provided, perform mapping of assembled contigs to - # reference with minimap2, and extract those as final assembly - if (defined(reference)) { - call minimap2_task.minimap2 as minimap2_assembly { - input: - query1 = pilon.assembly_fasta, - reference = select_first([reference]), - samplename = samplename, - mode = "asm20", - output_sam = false - } - call parse_mapping_task.retrieve_aligned_contig_paf { - input: - paf = minimap2_assembly.minimap2_out, - assembly = pilon.assembly_fasta, - samplename = samplename - } - call parse_mapping_task.calculate_coverage_paf { - input: - paf = minimap2_assembly.minimap2_out - } - } - call quast_task.quast { + if (defined(metaspades_pe.assembly_fasta)) { + call minimap2_task.minimap2 as minimap2_assembly_correction { input: - assembly = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), + query1 = read_QC_trim.read1_clean, + query2 = read_QC_trim.read2_clean, + reference = select_first([metaspades_pe.assembly_fasta]), samplename = samplename, - min_contig_length = 1 - } - if (output_additional_files) { - call minimap2_task.minimap2 as minimap2_reads { - input: - query1 = read_QC_trim.read1_clean, - query2 = read_QC_trim.read2_clean, - reference = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), - samplename = samplename, - mode = "sr", - output_sam = true - } - call parse_mapping_task.sam_to_sorted_bam { - input: - sam = minimap2_reads.minimap2_out, - samplename = samplename - } - call parse_mapping_task.calculate_coverage { - input: - bam = sam_to_sorted_bam.bam, - bai = sam_to_sorted_bam.bai - } - call parse_mapping_task.retrieve_pe_reads_bam as retrieve_unaligned_pe_reads_sam { - input: - bam = sam_to_sorted_bam.bam, - samplename = samplename, - prefix = "unassembled", - sam_flag = 4 + mode = 
"sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { + input: + sam = minimap2_assembly_correction.minimap2_out, + samplename = samplename + } + call pilon_task.pilon { + input: + assembly = select_first([metaspades_pe.assembly_fasta]), + bam = sort_bam_assembly_correction.bam, + bai = sort_bam_assembly_correction.bai, + samplename = samplename + } + if (defined(pilon.assembly_fasta)) { + # if reference is provided, perform mapping of assembled contigs to + # reference with minimap2, and extract those as final assembly + if (defined(reference)) { + call minimap2_task.minimap2 as minimap2_assembly { + input: + query1 = select_first([pilon.assembly_fasta]), + reference = select_first([reference]), + samplename = samplename, + mode = "asm20", + output_sam = false + } + call parse_mapping_task.retrieve_aligned_contig_paf { + input: + paf = minimap2_assembly.minimap2_out, + assembly = select_first([pilon.assembly_fasta]), + samplename = samplename + } + call parse_mapping_task.calculate_coverage_paf { + input: + paf = minimap2_assembly.minimap2_out + } } - call parse_mapping_task.retrieve_pe_reads_bam as retrieve_aligned_pe_reads_sam { + call quast_task.quast { input: - bam = sam_to_sorted_bam.bam, + assembly = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), samplename = samplename, - sam_flag = 2, - prefix = "assembled" + min_contig_length = 1 + } + if (output_additional_files) { + call minimap2_task.minimap2 as minimap2_reads { + input: + query1 = read_QC_trim.read1_clean, + query2 = read_QC_trim.read2_clean, + reference = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), + samplename = samplename, + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam { + input: + sam = minimap2_reads.minimap2_out, + samplename = samplename + } + call parse_mapping_task.calculate_coverage { + input: + bam = sam_to_sorted_bam.bam, + bai = 
sam_to_sorted_bam.bai + } + call parse_mapping_task.retrieve_pe_reads_bam as retrieve_unaligned_pe_reads_sam { + input: + bam = sam_to_sorted_bam.bam, + samplename = samplename, + prefix = "unassembled", + sam_flag = 4 + } + call parse_mapping_task.retrieve_pe_reads_bam as retrieve_aligned_pe_reads_sam { + input: + bam = sam_to_sorted_bam.bam, + samplename = samplename, + sam_flag = 2, + prefix = "assembled" + } + call parse_mapping_task.assembled_reads_percent { + input: + bam = sam_to_sorted_bam.bam, + } } - call parse_mapping_task.assembled_reads_percent { - input: - bam = sam_to_sorted_bam.bam, - } - } - if (! defined(reference)) { - call bwa_task.bwa as bwa { - input: - read1 = read_QC_trim.read1_clean, - read2 = read_QC_trim.read2_clean, - reference_genome = pilon.assembly_fasta, - samplename = samplename - } - call semibin_task.semibin as semibin { - input: - sorted_bam = bwa.sorted_bam, - sorted_bai = bwa.sorted_bai, - assembly_fasta = pilon.assembly_fasta, - samplename = samplename + if (! defined(reference)) { + call bwa_task.bwa as bwa { + input: + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + reference_genome = pilon.assembly_fasta, + samplename = samplename + } + call semibin_task.semibin as semibin { + input: + sorted_bam = bwa.sorted_bam, + sorted_bai = bwa.sorted_bai, + assembly_fasta = select_first([pilon.assembly_fasta]), + samplename = samplename + } } } + } call versioning.version_capture { input: } @@ -244,20 +248,20 @@ workflow theiameta_illumina_pe { String metaspades_version = metaspades_pe.metaspades_version String metaspades_docker = metaspades_pe.metaspades_docker # Assembly - minimap2 - String minimap2_version = minimap2_assembly_correction.minimap2_version - String minimap2_docker = minimap2_assembly_correction.minimap2_docker + String? minimap2_version = minimap2_assembly_correction.minimap2_version + String? 
minimap2_docker = minimap2_assembly_correction.minimap2_docker # Assembly - samtools - String samtools_version = sort_bam_assembly_correction.samtools_version - String samtools_docker = sort_bam_assembly_correction.samtools_docker + String? samtools_version = sort_bam_assembly_correction.samtools_version + String? samtools_docker = sort_bam_assembly_correction.samtools_docker # Assembly - pilon - String pilon_version = pilon.pilon_version - String pilon_docker = pilon.pilon_docker + String? pilon_version = pilon.pilon_version + String? pilon_docker = pilon.pilon_docker # Assembly QC - quast - Int assembly_length = quast.genome_length - Int contig_number = quast.number_contigs - Int largest_contig = quast.largest_contig - String quast_version = quast.version - String quast_docker = quast.quast_docker + Int? assembly_length = quast.genome_length + Int? contig_number = quast.number_contigs + Int? largest_contig = quast.largest_contig + String? quast_version = quast.version + String? quast_docker = quast.quast_docker # Assembly QC - minimap2 Float? 
percent_coverage = calculate_coverage_paf.percent_coverage # Assembly QC - bedtools diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl new file mode 100644 index 000000000..95a71f46a --- /dev/null +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -0,0 +1,195 @@ +version 1.0 + +import "../../tasks/alignment/task_minimap2.wdl" as minimap2_task +import "../../tasks/assembly/task_metaspades.wdl" as metaspades_task +import "../../tasks/quality_control/basic_statistics/task_fastq_scan.wdl" as fastq_scan +import "../../tasks/quality_control/basic_statistics/task_quast.wdl" as quast_task +import "../../tasks/quality_control/read_filtering/task_pilon.wdl" as pilon_task +import "../../tasks/task_versioning.wdl" as versioning +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task +import "../../tasks/taxon_id/task_krakentools.wdl" as krakentools_task +import "../../tasks/utilities/data_handling/task_gather_scatter.wdl" as gather_scatter_task +import "../../tasks/utilities/data_handling/task_parse_mapping.wdl" as parse_mapping_task +import "../utilities/wf_morgana_magic.wdl" as morgana_magic_workflow +import "../utilities/wf_read_QC_trim_pe.wdl" as read_qc_trim_pe + +workflow theiameta_panel_illumina_pe { + input { + String samplename + File read1 + File read2 + # default taxon IDs for Illumina VSP panel + Array[Int] taxon_ids = [10244, 10255, 10298, 10359, 10376, 10632, 10804, 11021, 11029, 11033, 11034, 11036, 11039, 11041, 11053, 11060, 11069, 11070, 11072, 11079, 11080, 11082, 11083, 11084, 11089, 11137, 11234, 11292, 11520, 11552, 11577, 11580, 11587, 11588, 11676, 11709, 12092, 12475, 12538, 12542, 28875, 28876, 31631, 33743, 35305, 35511, 36427, 37124, 38766, 38767, 45270, 46839, 57482, 57483, 59301, 64286, 64320, 68887, 80935, 90961, 95341, 102793, 102796, 108098, 114727, 114729, 118655, 119210, 129875, 129951, 130308, 130309, 130310, 138948, 138949, 138950, 
138951, 147711, 147712, 152219, 162145, 169173, 186538, 186539, 186540, 186541, 238817, 277944, 290028, 333278, 333760, 333761, 333762, 440266, 463676, 493803, 536079, 565995, 862909, 1003835, 1216928, 1221391, 1239565, 1239570, 1239573, 1277649, 1313215, 1330524, 1335626, 1348384, 1424613, 1452514, 1474807, 1497391, 1608084, 1618189, 1891764, 1891767, 1965344, 1980456, 2010960, 2169701, 2169991, 2560525, 2560602, 2697049, 2847089, 2901879, 2907957, 3052148, 3052223, 3052225, 3052230, 3052302, 3052307, 3052310, 3052314, 3052470, 3052477, 3052480, 3052489, 3052490, 3052493, 3052496, 3052499, 3052503, 3052505, 3052518, 10798, 11216, 1203539, 12730, 142786, 1803956, 208893, 2560526, 2849717, 3052303, 3052317, 3052484, 3052498, 746830, 746831, 943908] + + Int minimum_read_number = 1000 + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" + } + call versioning.version_capture { + input: + } + call read_qc_trim_pe.read_QC_trim_pe as read_QC_trim { + input: + samplename = samplename, + read1 = read1, + read2 = read2, + workflow_series = "theiameta", + # adding these additional inputs to hide them from Terra; these are not used and we don't want the user to modiy them + call_kraken = false, + kraken_disk_size = 0, + kraken_memory = 0, + kraken_cpu = 0, + kraken_db = kraken2_db, + target_organism = "" + } + # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that + # if we do change that, we will want to change the inputs to read_QC_trim to no longer have defaults hiding them from Terra + call kraken_task.kraken2_standalone as kraken2 { + input: + samplename = samplename, + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + kraken2_db = kraken2_db + } + scatter (taxon_id in taxon_ids) { + call krakentools_task.extract_kraken_reads as krakentools { + input: + kraken2_output = kraken2.kraken2_classified_report, + kraken2_report = kraken2.kraken2_report, + read1 
= read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + taxon_id = taxon_id + } + if (krakentools.success) { + call fastq_scan.fastq_scan_pe as fastq_scan_binned { + input: + read1 = select_first([krakentools.extracted_read1]), + read2 = select_first([krakentools.extracted_read2]) + } + if (fastq_scan_binned.read1_seq > minimum_read_number) { + call metaspades_task.metaspades_pe { + input: + read1_cleaned = select_first([krakentools.extracted_read1]), + read2_cleaned = select_first([krakentools.extracted_read2]), + samplename = "~{samplename}_~{taxon_id}" + } + if (defined(metaspades_pe.assembly_fasta)) { + call minimap2_task.minimap2 as minimap2_assembly_correction { + input: + query1 = select_first([krakentools.extracted_read1]), + query2 = select_first([krakentools.extracted_read2]), + reference = select_first([metaspades_pe.assembly_fasta]), + samplename = "~{samplename}_~{taxon_id}", + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { + input: + sam = minimap2_assembly_correction.minimap2_out, + samplename = "~{samplename}_~{taxon_id}" + } + call pilon_task.pilon { + input: + assembly = select_first([metaspades_pe.assembly_fasta]), + bam = sort_bam_assembly_correction.bam, + bai = sort_bam_assembly_correction.bai, + samplename = "~{samplename}_~{taxon_id}" + } + if (defined(pilon.assembly_fasta)) { + call quast_task.quast { + input: + assembly = select_first([pilon.assembly_fasta]), + samplename = "~{samplename}_~{taxon_id}", + min_contig_length = 1 + } + call morgana_magic_workflow.morgana_magic { + input: + samplename = "~{samplename}_~{taxon_id}", + assembly_fasta = select_first([pilon.assembly_fasta]), + read1 = select_first([krakentools.extracted_read1]), + read2 = select_first([krakentools.extracted_read2]), + taxon_id = taxon_id, + seq_method = "ILLUMINA" + } + } + } + } + } + } + call gather_scatter_task.gather_scatter { + input: + samplename = samplename, + taxon_ids = 
write_json(taxon_ids), + organism = write_json(krakentools.organism_name), + extracted_read1 = write_json(krakentools.extracted_read1), + extracted_read2 = write_json(krakentools.extracted_read2), + krakentools_docker = write_json(krakentools.krakentools_docker), + fastq_scan_num_reads_binned1 = write_json(fastq_scan_binned.read1_seq), + fastq_scan_num_reads_binned2 = write_json(fastq_scan_binned.read2_seq), + fastq_scan_num_reads_binned_pairs = write_json(fastq_scan_binned.read_pairs), + fastq_scan_docker = write_json(fastq_scan_binned.fastq_scan_docker), + fastq_scan_version = write_json(fastq_scan_binned.version), + metaspades_warning = write_json(metaspades_pe.metaspades_warning), + pilon_warning = write_json(pilon.pilon_warning), + assembly_fasta = write_json(pilon.assembly_fasta), + quast_genome_length = write_json(quast.genome_length), + quast_number_contigs = write_json(quast.number_contigs), + quast_n50 = write_json(quast.n50_value), + quast_gc_percent = write_json(quast.gc_percent), + number_N = write_json(morgana_magic.number_N), + number_ATCG = write_json(morgana_magic.number_ATCG), + number_Degenerate = write_json(morgana_magic.number_Degenerate), + number_Total = write_json(morgana_magic.number_Total), + percent_reference_coverage = write_json(morgana_magic.percent_reference_coverage), + pango_lineage = write_json(morgana_magic.pango_lineage), + pango_lineage_expanded = write_json(morgana_magic.pango_lineage_expanded), + pangolin_conflicts = write_json(morgana_magic.pangolin_conflicts), + pangolin_notes = write_json(morgana_magic.pangolin_notes), + pangolin_assignment_version = write_json(morgana_magic.pangolin_assignment_version), + pangolin_versions = write_json(morgana_magic.pangolin_versions), + pangolin_docker = write_json(morgana_magic.pangolin_docker), + nextclade_version = write_json(morgana_magic.nextclade_version), + nextclade_docker = write_json(morgana_magic.nextclade_docker), + nextclade_ds_tag = 
write_json(morgana_magic.nextclade_ds_tag), + nextclade_aa_subs = write_json(morgana_magic.nextclade_aa_subs), + nextclade_aa_dels = write_json(morgana_magic.nextclade_aa_dels), + nextclade_clade = write_json(morgana_magic.nextclade_clade), + nextclade_lineage = write_json(morgana_magic.nextclade_lineage), + nextclade_qc = write_json(morgana_magic.nextclade_qc), + nextclade_ds_tag_flu_ha = write_json(morgana_magic.nextclade_ds_tag_flu_ha), + nextclade_aa_subs_flu_ha = write_json(morgana_magic.nextclade_aa_subs_flu_ha), + nextclade_aa_dels_flu_ha = write_json(morgana_magic.nextclade_aa_dels_flu_ha), + nextclade_clade_flu_ha = write_json(morgana_magic.nextclade_clade_flu_ha), + nextclade_qc_flu_ha = write_json(morgana_magic.nextclade_qc_flu_ha), + nextclade_ds_tag_flu_na = write_json(morgana_magic.nextclade_ds_tag_flu_na), + nextclade_aa_subs_flu_na = write_json(morgana_magic.nextclade_aa_subs_flu_na), + nextclade_aa_dels_flu_na = write_json(morgana_magic.nextclade_aa_dels_flu_na), + nextclade_clade_flu_na = write_json(morgana_magic.nextclade_clade_flu_na), + nextclade_qc_flu_na = write_json(morgana_magic.nextclade_qc_flu_na), + irma_version = write_json(morgana_magic.irma_version), + irma_docker = write_json(morgana_magic.irma_docker), + irma_type = write_json(morgana_magic.irma_type), + irma_subtype = write_json(morgana_magic.irma_subtype), + irma_subtype_notes = write_json(morgana_magic.irma_subtype_notes), + genoflu_version = write_json(morgana_magic.genoflu_version), + genoflu_genotype = write_json(morgana_magic.genoflu_genotype), + genoflu_all_segments = write_json(morgana_magic.genoflu_all_segments), + abricate_flu_type = write_json(morgana_magic.abricate_flu_type), + abricate_flu_subtype = write_json(morgana_magic.abricate_flu_subtype), + abricate_flu_database = write_json(morgana_magic.abricate_flu_database), + abricate_flu_version = write_json(morgana_magic.abricate_flu_version) + } + output { + # versioning outputs + String 
theiameta_panel_illumina_pe_version = version_capture.phb_version + String theiameta_panel_illumina_pe_analysis_date = version_capture.date + # kraken2 outputs + String kraken2_version = kraken2.kraken2_version + String kraken2_database = kraken2.kraken2_database + String kraken2_docker = kraken2.kraken2_docker + File kraken2_report = kraken2.kraken2_report + File kraken2_classified_report = kraken2.kraken2_classified_report + # krakentools outputs + Array[String] identified_organisms = gather_scatter.organism_names + File results_by_taxon_tsv = gather_scatter.gathered_results + } +} \ No newline at end of file diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 6bb2f8c85..0db56d2ef 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -53,6 +53,7 @@ workflow flu_track { Int? abricate_flu_disk_size # flu antiviral substitutions subworkflow inputs + Boolean analyze_flu_antiviral_substitutions = true File? flu_h1_ha_ref File? flu_h3_ha_ref File? 
flu_n1_na_ref
@@ -171,7 +172,7 @@ workflow flu_track {
   }
   # if IRMA was run successfully, run the flu_antiviral substitutions task
   # this block must be placed beneath the previous block because it is used in this subworkflow
-  if (defined(irma.irma_assemblies)) {
+  if (defined(irma.irma_assemblies) && analyze_flu_antiviral_substitutions) {
     call flu_antiviral.flu_antiviral_substitutions {
       input:
         na_segment_assembly = irma.seg_na_assembly_padded,
diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl
new file mode 100644
index 000000000..b444c7017
--- /dev/null
+++ b/workflows/utilities/wf_morgana_magic.wdl
@@ -0,0 +1,220 @@
+version 1.0
+
+import "../../tasks/quality_control/basic_statistics/task_consensus_qc.wdl" as consensus_qc_task
+import "../../tasks/species_typing/betacoronavirus/task_pangolin.wdl" as pangolin
+import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task
+import "../utilities/wf_flu_track.wdl" as flu_track_wf
+import "../utilities/wf_organism_parameters.wdl" as set_organism_defaults
+import "../utilities/wf_taxon_id_conversion.wdl" as taxon_id_conversion
+
+# Converts a taxon ID to an organism label, then runs the characterization steps
+# (consensus QC, flu_track, pangolin, nextclade) whose organism condition is met.
+workflow morgana_magic {
+  input {
+    String samplename
+    String taxon_id
+    String seq_method
+    File assembly_fasta
+    File read1
+    File read2
+    # abricate overrides, forwarded to flu_track
+    Int? abricate_flu_cpu
+    Int? abricate_flu_disk_size
+    String? abricate_flu_docker
+    Int? abricate_flu_memory
+    Int? abricate_flu_mincov
+    Int? abricate_flu_minid
+    # assembly metrics overrides, forwarded to flu_track
+    Int? assembly_metrics_cpu
+    Int? assembly_metrics_disk_size
+    String? assembly_metrics_docker
+    Int? assembly_metrics_memory
+    # consensus_qc overrides
+    Int? consensus_qc_cpu
+    Int? consensus_qc_disk_size
+    String? consensus_qc_docker
+    Int? consensus_qc_memory
+    # genoflu overrides, forwarded to flu_track
+    Int? genoflu_cpu
+    File? genoflu_cross_reference
+    Int? genoflu_disk_size
+    String? genoflu_docker
+    Int? genoflu_memory
+    # irma overrides, forwarded to flu_track
+    Int? irma_cpu
+    Int? irma_disk_size
+    String? irma_docker_image
+    Boolean? irma_keep_ref_deletions
+    Int? irma_memory
+    # nextclade overrides, used by flu_track and by the nextclade calls below
+    Int? nextclade_cpu
+    Int? nextclade_disk_size
+    String? nextclade_docker_image
+    Int? nextclade_memory
+    Int? nextclade_output_parser_cpu
+    Int? nextclade_output_parser_disk_size
+    String? nextclade_output_parser_docker
+    Int? nextclade_output_parser_memory
+    # pangolin overrides
+    Int? pangolin_cpu
+    Int? pangolin_disk_size
+    String? pangolin_docker_image
+    Int? pangolin_memory
+  }
+  call taxon_id_conversion.convert_taxon_ids { input: taxon_id = taxon_id }
+  call set_organism_defaults.organism_parameters {
+    input:
+      pangolin_docker_image = pangolin_docker_image,
+      organism = convert_taxon_ids.organism
+  }
+  String detected_organism = organism_parameters.standardized_organism
+  if (detected_organism != "unsupported") { # "unsupported" can occur in theiameta_panel
+    call consensus_qc_task.consensus_qc {
+      input:
+        assembly_fasta = assembly_fasta,
+        genome_length = organism_parameters.genome_length,
+        reference_genome = organism_parameters.reference,
+        cpu = consensus_qc_cpu,
+        memory = consensus_qc_memory,
+        disk_size = consensus_qc_disk_size,
+        docker = consensus_qc_docker
+    }
+  }
+  if (detected_organism == "flu") {
+    call flu_track_wf.flu_track {
+      input:
+        samplename = samplename,
+        read1 = read1,
+        read2 = read2,
+        seq_method = seq_method,
+        standardized_organism = detected_organism,
+        analyze_flu_antiviral_substitutions = false,
+        abricate_flu_cpu = abricate_flu_cpu,
+        abricate_flu_disk_size = abricate_flu_disk_size,
+        abricate_flu_docker = abricate_flu_docker,
+        abricate_flu_memory = abricate_flu_memory,
+        abricate_flu_mincov = abricate_flu_mincov,
+        abricate_flu_minid = abricate_flu_minid,
+        assembly_metrics_cpu = assembly_metrics_cpu,
+        assembly_metrics_disk_size = assembly_metrics_disk_size,
+        assembly_metrics_docker = assembly_metrics_docker,
+        assembly_metrics_memory = assembly_metrics_memory,
+        genoflu_cross_reference = genoflu_cross_reference,
+        genoflu_cpu = genoflu_cpu,
+        genoflu_disk_size = genoflu_disk_size,
+        genoflu_docker = genoflu_docker,
+        genoflu_memory = genoflu_memory,
+        irma_cpu = irma_cpu,
+        irma_disk_size = irma_disk_size,
+        irma_docker_image = irma_docker_image,
+        irma_keep_ref_deletions = irma_keep_ref_deletions,
+        irma_memory = irma_memory,
+        nextclade_cpu = nextclade_cpu,
+        nextclade_disk_size = nextclade_disk_size,
+        nextclade_docker_image = nextclade_docker_image,
+        nextclade_memory = nextclade_memory,
+        nextclade_output_parser_cpu = nextclade_output_parser_cpu,
+        nextclade_output_parser_disk_size = nextclade_output_parser_disk_size,
+        nextclade_output_parser_docker = nextclade_output_parser_docker,
+        nextclade_output_parser_memory = nextclade_output_parser_memory
+    }
+  }
+  if (detected_organism == "sars-cov-2") {
+    call pangolin.pangolin4 {
+      input:
+        samplename = samplename,
+        fasta = assembly_fasta,
+        docker = organism_parameters.pangolin_docker,
+        cpu = pangolin_cpu,
+        memory = pangolin_memory,
+        disk_size = pangolin_disk_size
+    }
+  }
+  if (detected_organism == "sars-cov-2" || detected_organism == "MPXV" || detected_organism == "rsv_a" || detected_organism == "rsv_b") {
+    call nextclade_task.nextclade_v3 {
+      input:
+        genome_fasta = assembly_fasta,
+        dataset_name = organism_parameters.nextclade_dataset_name,
+        dataset_tag = organism_parameters.nextclade_dataset_tag,
+        docker = nextclade_docker_image,
+        cpu = nextclade_cpu,
+        memory = nextclade_memory,
+        disk_size = nextclade_disk_size
+    }
+    call nextclade_task.nextclade_output_parser {
+      input:
+        nextclade_tsv = nextclade_v3.nextclade_tsv,
+        organism = detected_organism,
+        docker = nextclade_output_parser_docker,
+        cpu = nextclade_output_parser_cpu,
+        memory = nextclade_output_parser_memory,
+        disk_size = nextclade_output_parser_disk_size
+    }
+  }
+  output {
+    String organism = detected_organism
+    # consensus_qc results (undefined when the organism is "unsupported")
+    Int? number_N = consensus_qc.number_N
+    Int? number_ATCG = consensus_qc.number_ATCG
+    Int? number_Degenerate = consensus_qc.number_Degenerate
+    Int? number_Total = consensus_qc.number_Total
+    Float? percent_reference_coverage = consensus_qc.percent_reference_coverage
+    # pangolin4 results (only defined for sars-cov-2)
+    String? pango_lineage = pangolin4.pangolin_lineage
+    String? pango_lineage_expanded = pangolin4.pangolin_lineage_expanded
+    String? pangolin_conflicts = pangolin4.pangolin_conflicts
+    String? pangolin_notes = pangolin4.pangolin_notes
+    String? pangolin_assignment_version = pangolin4.pangolin_assignment_version
+    File? pango_lineage_report = pangolin4.pango_lineage_report
+    String? pangolin_docker = pangolin4.pangolin_docker
+    String? pangolin_versions = pangolin4.pangolin_versions
+    # nextclade version/docker: direct call first, then flu_track, else empty string
+    String nextclade_version = select_first([nextclade_v3.nextclade_version, flu_track.nextclade_version, ""])
+    String nextclade_docker = select_first([nextclade_v3.nextclade_docker, flu_track.nextclade_docker, ""])
+    # nextclade results from the direct (non-flu) nextclade_v3 call
+    File? nextclade_json = nextclade_v3.nextclade_json
+    File? auspice_json = nextclade_v3.auspice_json
+    File? nextclade_tsv = nextclade_v3.nextclade_tsv
+    String nextclade_ds_tag = organism_parameters.nextclade_dataset_tag
+    String? nextclade_aa_subs = nextclade_output_parser.nextclade_aa_subs
+    String? nextclade_aa_dels = nextclade_output_parser.nextclade_aa_dels
+    String? nextclade_clade = nextclade_output_parser.nextclade_clade
+    String? nextclade_lineage = nextclade_output_parser.nextclade_lineage
+    String? nextclade_qc = nextclade_output_parser.nextclade_qc
+    # flu_track: IRMA
+    String? irma_version = flu_track.irma_version
+    String? irma_docker = flu_track.irma_docker
+    String? irma_type = flu_track.irma_type
+    String? irma_subtype = flu_track.irma_subtype
+    String? irma_subtype_notes = flu_track.irma_subtype_notes
+    # flu_track: abricate
+    String? abricate_flu_type = flu_track.abricate_flu_type
+    String? abricate_flu_subtype = flu_track.abricate_flu_subtype
+    File? abricate_flu_results = flu_track.abricate_flu_results
+    String? abricate_flu_database = flu_track.abricate_flu_database
+    String? abricate_flu_version = flu_track.abricate_flu_version
+    # flu_track: GenoFLU
+    String? genoflu_version = flu_track.genoflu_version
+    String? genoflu_genotype = flu_track.genoflu_genotype
+    String? genoflu_all_segments = flu_track.genoflu_all_segments
+    File? genoflu_output_tsv = flu_track.genoflu_output_tsv
+    # flu_track: nextclade on the HA segment
+    File? nextclade_json_flu_ha = flu_track.nextclade_json_flu_ha
+    File? auspice_json_flu_ha = flu_track.auspice_json_flu_ha
+    File? nextclade_tsv_flu_ha = flu_track.nextclade_tsv_flu_ha
+    String? nextclade_ds_tag_flu_ha = flu_track.nextclade_ds_tag_flu_ha
+    String? nextclade_aa_subs_flu_ha = flu_track.nextclade_aa_subs_flu_ha
+    String? nextclade_aa_dels_flu_ha = flu_track.nextclade_aa_dels_flu_ha
+    String? nextclade_clade_flu_ha = flu_track.nextclade_clade_flu_ha
+    String? nextclade_qc_flu_ha = flu_track.nextclade_qc_flu_ha
+    # flu_track: nextclade on the NA segment
+    File? nextclade_json_flu_na = flu_track.nextclade_json_flu_na
+    File? auspice_json_flu_na = flu_track.auspice_json_flu_na
+    File? nextclade_tsv_flu_na = flu_track.nextclade_tsv_flu_na
+    String? nextclade_ds_tag_flu_na = flu_track.nextclade_ds_tag_flu_na
+    String? nextclade_aa_subs_flu_na = flu_track.nextclade_aa_subs_flu_na
+    String? nextclade_aa_dels_flu_na = flu_track.nextclade_aa_dels_flu_na
+    String? nextclade_clade_flu_na = flu_track.nextclade_clade_flu_na
+    String? nextclade_qc_flu_na = flu_track.nextclade_qc_flu_na
+  }
+}
\ No newline at end of file
diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl
index 9e2b648fc..0f4e26d14 100644
--- a/workflows/utilities/wf_organism_parameters.wdl
+++ b/workflows/utilities/wf_organism_parameters.wdl
@@ -22,7 +22,7 @@ workflow organism_parameters {
     File? gene_locations_bed_file
     Int? genome_length_input
 
-    # set default nextclade information as NA
+    # set default nextclade information as "NA"
     String? nextclade_dataset_tag_input
     String? nextclade_dataset_name_input
 
@@ -48,6 +48,9 @@ workflow organism_parameters {
     Float? narrow_bandwidth
     Float? proportion_wide
   }
+  if (organism == "unsupported") {
+    Int unsupported_genome_length = 0
+  }
   if (organism == "sars-cov-2" || organism == "SARS-CoV-2") {
     String sc2_org_name = "sars-cov-2"
     String sc2_reference_genome = "gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta"
@@ -212,7 +215,7 @@ workflow organism_parameters {
     File rsv_a_clades_tsv = "gs://theiagen-public-files-rp/terra/rsv_references/rsv_a_clades.tsv"
     File rsv_a_reference_gbk = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.gb"
     File rsv_a_auspice_config = "gs://theiagen-public-files-rp/terra/rsv_references/rsv_auspice_config.json"
-    Int rsv_a_min_num_unambig = 10850 #using 70% of 15500
+    Int rsv_a_min_num_unambig = 10850 # using 70% of 15500
     # inherited from flu defaults
     Float rsv_a_min_date = 2020.0
     Int rsv_a_pivot_interval = 1
@@ -236,7 +239,7 @@ workflow organism_parameters {
     File rsv_b_clades_tsv = "gs://theiagen-public-files-rp/terra/rsv_references/rsv_b_clades.tsv"
     File rsv_b_reference_gbk = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.gb"
     File rsv_b_auspice_config = "gs://theiagen-public-files-rp/terra/rsv_references/rsv_auspice_config.json"
-    Int rsv_b_min_num_unambig = 10850 #using 70% of 15500
+    Int rsv_b_min_num_unambig = 10850 # using 70% of 15500
     # 
inherited from flu defaults
     Float rsv_b_min_date = 2020.0
     Int rsv_b_pivot_interval = 1
@@ -268,7 +271,7 @@ workflow organism_parameters {
     File gene_locations_bed = select_first([gene_locations_bed_file, sc2_gene_locations_bed, mpox_gene_locations_bed, "gs://theiagen-public-files/terra/theiacov-files/empty.bed"])
     File primer_bed = select_first([primer_bed_file, mpox_primer_bed_file, wnv_primer_bed_file, hiv_v1_primer_bed, hiv_v2_primer_bed, "gs://theiagen-public-files/terra/theiacov-files/empty.bed"])
     File reference_gff = select_first([reference_gff_file, mpox_reference_gff_file, hiv_v1_reference_gff, hiv_v2_reference_gff, "gs://theiagen-public-files/terra/theiacov-files/empty.gff3"])
-    Int genome_length = select_first([genome_length_input, sc2_genome_len, mpox_genome_len, wnv_genome_len, flu_genome_len, rsv_a_genome_len, rsv_b_genome_len, hiv_v1_genome_len, hiv_v2_genome_len])
+    Int genome_length = select_first([genome_length_input, sc2_genome_len, mpox_genome_len, wnv_genome_len, flu_genome_len, rsv_a_genome_len, rsv_b_genome_len, hiv_v1_genome_len, hiv_v2_genome_len, unsupported_genome_length])
     # nextclade information
     String nextclade_dataset_tag = select_first([nextclade_dataset_tag_input, sc2_nextclade_ds_tag, mpox_nextclade_ds_tag, wnv_nextclade_ds_tag, h1n1_ha_nextclade_ds_tag, h3n2_ha_nextclade_ds_tag, vic_ha_nextclade_ds_tag, yam_ha_nextclade_ds_tag, h5n1_ha_nextclade_ds_tag, h1n1_na_nextclade_ds_tag, h3n2_na_nextclade_ds_tag, vic_na_nextclade_ds_tag, yam_na_nextclade_ds_tag, rsv_a_nextclade_ds_tag, rsv_b_nextclade_ds_tag, "NA"])
     String nextclade_dataset_name = select_first([nextclade_dataset_name_input, sc2_nextclade_ds_name, mpox_nextclade_ds_name, wnv_nextclade_ds_name, h1n1_ha_nextclade_ds_name, h3n2_ha_nextclade_ds_name, vic_ha_nextclade_ds_name, yam_ha_nextclade_ds_name, h5n1_ha_nextclade_ds_name, h1n1_na_nextclade_ds_name, h3n2_na_nextclade_ds_name, vic_na_nextclade_ds_name, yam_na_nextclade_ds_name, rsv_a_nextclade_ds_name, rsv_b_nextclade_ds_name, "NA"])
diff --git a/workflows/utilities/wf_taxon_id_conversion.wdl b/workflows/utilities/wf_taxon_id_conversion.wdl
new file mode 100644
index 000000000..784dd958a
--- /dev/null
+++ b/workflows/utilities/wf_taxon_id_conversion.wdl
@@ -0,0 +1,35 @@
+version 1.0
+
+# Maps a taxon ID (given as a string) onto an organism label; any taxon ID that is
+# not listed below resolves to "unsupported".
+workflow convert_taxon_ids {
+  input {
+    String taxon_id
+  }
+  if (taxon_id == "2697049") {
+    String sc2_label = "sars-cov-2"
+  }
+  if (taxon_id == "10244") {
+    String mpxv_label = "MPXV"
+  }
+  if (taxon_id == "11082") {
+    String wnv_label = "WNV"
+  }
+  # 11320 is flu A and 11520 is flu B; both resolve to the same label
+  if (taxon_id == "11320" || taxon_id == "11520") {
+    String flu_label = "flu"
+  }
+  if (taxon_id == "12814") {
+    String rsv_a_label = "rsv_a"
+  }
+  if (taxon_id == "12815") {
+    String rsv_b_label = "rsv_b"
+  }
+  if (taxon_id == "11676") {
+    String hiv_label = "HIV"
+  }
+  String fallback_label = "unsupported"
+  output {
+    String organism = select_first([sc2_label, mpxv_label, wnv_label, flu_label, rsv_a_label, rsv_b_label, hiv_label, fallback_label])
+  }
+}
\ No newline at end of file