diff --git a/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt b/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt index 504222f3f..d71473608 100644 --- a/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt +++ b/docs/assets/files/theiavalidate/theiacov-validation-criteria.txt @@ -5,12 +5,12 @@ assembly_length_unambiguous 0.01 assembly_mean_coverage 0.01 irma_subtype EXACT irma_type EXACT -kraken_human EXACT -kraken_human_dehosted EXACT -kraken_sc2 EXACT -kraken_sc2_dehosted EXACT -kraken_target_org EXACT -kraken_target_org_dehosted EXACT +kraken2_human EXACT +kraken2_human_dehosted EXACT +kraken2_sc2 EXACT +kraken2_sc2_dehosted EXACT +kraken2_target_org EXACT +kraken2_target_org_dehosted EXACT nextclade_aa_dels SET nextclade_aa_subs SET nextclade_clade EXACT diff --git a/docs/workflows/genomic_characterization/freyja.md b/docs/workflows/genomic_characterization/freyja.md index b3e2f4b6f..39ed7d714 100644 --- a/docs/workflows/genomic_characterization/freyja.md +++ b/docs/workflows/genomic_characterization/freyja.md @@ -146,13 +146,13 @@ This workflow runs on the sample level. 
| primer_trim | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | read_QC_trim_pe | **adapters** | File | A FASTA file containing adapter sequence | None | Optional | | read_QC_trim_pe | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | -| read_QC_trim_pe | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | Optional | +| read_QC_trim_pe | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken2_db input parameter for this to run successfully | FALSE | Optional | | read_QC_trim_pe | **call_midas** | Boolean | By default this is set to true to run MIDAS; set to false to skip MIDAS | FALSE | Optional | | read_QC_trim_pe | **fastp_args** | String | Additional arguments to use with fastp | "--detect_adapter_for_pe -g -5 20 -3 20" | Optional | -| read_QC_trim_pe | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | -| read_QC_trim_pe | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional, Sometimes required | -| read_QC_trim_pe | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| read_QC_trim_pe | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_pe | **kraken2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_pe | **kraken2_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. 
| None | Optional, Sometimes required | +| read_QC_trim_pe | **kraken2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_pe | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | read_QC_trim_pe | **midas_db** | File | Database to use with MIDAS. Not required as one will be auto-selected when running the MIDAS task. | None | Optional, Sometimes required | | read_QC_trim_pe | **phix** | File | The file containing the phix sequence to be used during bbduk task | None | Optional | | read_QC_trim_pe | **read_processing** | String | Options: "trimmomatic" or "fastp" to indicate which read trimming module to use | "trimmomatic" | Optional | @@ -161,26 +161,26 @@ This workflow runs on the sample level. | read_QC_trim_pe | **trim_quality_trim_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | | read_QC_trim_pe | **trim_window_size** | Int | The window size to use during trimming | 4 | Optional | | read_QC_trim_pe | **trimmomatic_args** | String | Additional command-line arguments to use with trimmomatic | None | Optional | -| read_QC_trim_ont | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | Optional | +| read_QC_trim_ont | **call_kraken2** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken2_db input parameter for this to run successfully | FALSE | Optional | | read_QC_trim_ont | **downsampling_coverage** | Float | The depth to downsample to with Rasusa. Internal component. Do not modify. | 150 | Do not modify, Optional | | read_QC_trim_ont | **genome_length** | Int | Internal component. 
Do not modify | None | Do not modify, Optional | -| read_QC_trim_ont | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | -| read_QC_trim_ont | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional | -| read_QC_trim_ont | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| read_QC_trim_ont | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_ont | **kraken2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_ont | **kraken2_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional | +| read_QC_trim_ont | **kraken2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_ont | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | read_QC_trim_ont | **max_length** | Int | Internal component, do not modify | | Do not modify, Optional | | read_QC_trim_ont | **min_length** | Int | Internal component, do not modify | | Do not modify, Optional | | read_QC_trim_ont | **run_prefix** | String | Internal component, do not modify | | Do not modify, Optional | | read_QC_trim_ont | **target_organism** | String | This string is searched for in the kraken2 outputs to extract the read percentage | | Optional | | read_QC_trim_se | **adapters** | File | A FASTA file containing adapter sequence | None | Optional | | read_QC_trim_se | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | -| read_QC_trim_se | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken_db input parameter for this to run successfully | FALSE | 
Optional | +| read_QC_trim_se | **call_kraken** | Boolean | By default this is set to false to skip kraken2; set to true to run kraken2 but a database must be also provided via the kraken2_db input parameter for this to run successfully | FALSE | Optional | | read_QC_trim_se | **call_midas** | Boolean | By default this is set to true to run MIDAS; set to false to skip MIDAS | FALSE | Optional | | read_QC_trim_se | **fastp_args** | String | Additional arguments to use with fastp | "--detect_adapter_for_pe -g -5 20 -3 20" | Optional | -| read_QC_trim_se | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | -| read_QC_trim_se | **kraken_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional | -| read_QC_trim_se | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | -| read_QC_trim_se | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim_se | **kraken2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| read_QC_trim_se | **kraken2_db** | File | A kraken2 database to use with the kraken2 optional task. The file must be a .tar.gz kraken2 database. | None | Optional | +| read_QC_trim_se | **kraken2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| read_QC_trim_se | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | read_QC_trim_se | **midas_db** | File | Database to use with MIDAS. Not required as one will be auto-selected when running the MIDAS task. 
| None | Optional, Sometimes required | | read_QC_trim_se | **phix** | File | The file containing the phix sequence to be used during bbduk task | None | Optional | | read_QC_trim_se | **read_processing** | String | Options: "trimmomatic" or "fastp" to indicate which read trimming module to use | "trimmomatic" | Optional | @@ -363,13 +363,13 @@ The main output file used in subsequent Freyja workflows is found under the `fre | freyja_variants | File | The TSV file containing the variants identified by Freyja | ONT, PE, SE | | freyja_version | String | version of Freyja used | ONT, PE, SE | | ivar_version_primtrim | String | Version of iVar for running the iVar trim command | ONT, PE, SE | -| kraken_human | Float | Percent of human read data detected using the Kraken2 software | ONT, PE, SE | -| kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | ONT, PE, SE | -| kraken_report | File | Full Kraken report | ONT, PE, SE | -| kraken_report_dehosted | File | Full Kraken report after host removal | ONT, PE, SE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | ONT, PE, SE | -| kraken_version | String | Version of Kraken software used | ONT, PE, SE | +| kraken2_human | Float | Percent of human read data detected using the Kraken2 software | ONT, PE, SE | +| kraken2_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | ONT, PE, SE | +| kraken2_report | File | Full Kraken report | ONT, PE, SE | +| kraken2_report_dehosted | File | Full Kraken report after host removal | ONT, PE, SE | +| kraken2_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | ONT, PE, SE | +| kraken2_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 
software after host removal | ONT, PE, SE | +| kraken2_version | String | Version of Kraken software used | ONT, PE, SE | | minimap2_docker | String | Docker image used to run minimap2 | ONT | | minimap2_version | String | Version of minimap2 used | ONT | | nanoplot_html_clean | File | Clean read file | ONT | diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 2e72b86d4..9265eda62 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -219,19 +219,22 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | ivar_consensus | **stats_n_coverage_primtrim_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | SE,PE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | | ivar_consensus | **stats_n_coverage_primtrim_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | SE,PE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | | ivar_consensus | **stats_n_coverage_primtrim_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | SE,PE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| kraken2_dehosted | **classified_out** | String | Allows user to rename the classified FASTQ files output. 
Must include .fastq as the suffix | classified#.fastq | Optional | CL | sars-cov-2 | | kraken2_dehosted | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | -| kraken2_dehosted | **docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | +| kraken2_dehosted | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | | kraken2_dehosted | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | | kraken2_dehosted | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | | kraken2_dehosted | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| kraken2_dehosted | **unclassified_out** | String | Allows user to rename the unclassified FASTQ files output. Must include .fastq as the suffix | unclassified#.fastq | Optional | CL | sars-cov-2 | +| kraken2_raw | **classified_out** | String | Allows user to rename the classified FASTQ files output. 
Must include .fastq as the suffix | classified#.fastq | Optional | CL | sars-cov-2 | | kraken2_raw | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | CL | sars-cov-2 | | kraken2_raw | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL | sars-cov-2 | -| kraken2_raw | **docker_image** | Int | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | +| kraken2_raw | **docker** | String | Docker container used in this task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv | Optional | CL | sars-cov-2 | | kraken2_raw | **kraken2_db** | String | The database used to run Kraken2 | /kraken2-db | Optional | CL | sars-cov-2 | | kraken2_raw | **memory** | String | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL | sars-cov-2 | -| kraken2_raw | **read_processing** | String | The tool used for trimming of primers from reads. Options are trimmomatic and fastp | trimmomatic | Optional | | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | kraken2_raw | **read2** | File | Internal component, do not modify | | Do not modify, Optional | CL | sars-cov-2 | +| kraken2_raw | **unclassified_out** | String | Allows user to rename the unclassified FASTQ files output. 
Must include .fastq as the suffix | unclassified#.fastq | Optional | CL | sars-cov-2 | | nanoplot_clean | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | nanoplot_clean | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | nanoplot_clean | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/nanoplot:1.40.0 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -265,7 +268,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Default provided for SARS-CoV-2 ("gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed") and mpox ("gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed") | Optional | CL, FASTA | | | organism_parameters | **genome_length_input** | Int | Use to specify the expected genome length; provided by default for all supported organisms | Default provided for SARS-CoV-2 (29903), mpox (197200), WNV (11000), flu (13000), RSV-A (16000), RSV-B (16000), HIV (primer versions 1 [9181] and 2 [9840]) | Optional | CL | | | organism_parameters | **hiv_primer_version** | String | The version of HIV primers used. Options are "https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl#L156" and "https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_organism_parameters.wdl#L164". This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | v1 | Optional | CL, FASTA, ONT, PE, SE | HIV | -| organism_parameters | **kraken_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. 
This should be a proper taxonomic name recognized by the Kraken database. | Default provided for mpox (Monkeypox virus), WNV (West Nile virus), and HIV (Human immunodeficiency virus 1) | Optional | FASTA, ONT, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | +| organism_parameters | **kraken2_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | Default provided for mpox (Monkeypox virus), WNV (West Nile virus), and HIV (Human immunodeficiency virus 1) | Optional | FASTA, ONT, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 | | organism_parameters | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 | Optional | CL, FASTA | | | organism_parameters | **primer_bed_file** | File | The bed file containing the primers used when sequencing was performed | REQUIRED FOR SARS-CoV-2, MPOX, WNV, RSV-A & RSV-B. 
Provided by default only for HIV primer versions 1 ("gs://theiagen-public-files/terra/hivgc-files/HIV-1_v1.0.primer.hyphen.bed" and 2 ("gs://theiagen-public-files/terra/hivgc-files/HIV-1_v2.0.primer.hyphen400.1.bed") | Optional, Sometimes required | CL, FASTA | | | organism_parameters | **reference_gff_file** | File | Reference GFF file for the organism being analyzed | Default provided for mpox ("gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3") and HIV (primer versions 1 ["gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.gff3"] and 2 ["gs://theiagen-public-files/terra/hivgc-files/AY228557.1.gff3"]) | Optional | CL, FASTA, ONT | | @@ -298,12 +301,12 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | qc_check_task | **est_coverage_clean** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **gambit_predicted_taxon** | String | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_human** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | -| qc_check_task | **kraken_human_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | -| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_target_organism** 
| Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken2_human** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | +| qc_check_task | **kraken2_human_dehosted** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, SE | | +| qc_check_task | **kraken2_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken2_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken2_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| qc_check_task | **kraken2_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **midas_secondary_genus_abundance** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | qc_check_task | **midas_secondary_genus_coverage** | Float | Internal component, do not modify | | Do not modify, Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -337,13 +340,13 @@ All 
TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | raw_check_reads | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/bactopia/gather_samples:2.0.2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | raw_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **call_kraken** | Boolean | True/False variable that determines if the Kraken2 task should be called. | FALSE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **call_kraken2** | Boolean | True/False variable that determines if the Kraken2 task should be called. | FALSE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **call_midas** | Boolean | True/False variable that determines if the MIDAS task should be called. 
| TRUE | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **downsampling_coverage** | Float | The desired coverage to sub-sample the reads to with RASUSA | 150 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **fastp_args** | String | Additional fastp task arguments | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **kraken_db** | File | The database used to run Kraken2 | /kraken2-db | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | -| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken2_db** | File | The database used to run Kraken2 | /kraken2-db | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| read_QC_trim | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **midas_db** | File | The database used by the MIDAS task | gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | read_QC_trim | **read_processing** | String | The name of the tool to perform basic read processing; options: "trimmomatic" or "fastp" | trimmomatic | Optional | PE, SE | | | read_QC_trim | **read_qc** | String | The tool used for quality control (QC) of reads. 
Options are fastq_scan and fastqc | fastq_scan | Optional | PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | @@ -375,6 +378,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | workflow name | **flu_segment** | String | Influenza genome segment being analyzed. Options: "HA" or "NA". | HA | Optional, Required | FASTA | | | workflow name | **flu_subtype** | String | The influenza subtype being analyzed. Options: "Yamagata", "Victoria", "H1N1", "H3N2", "H5N1". Automatically determined. | | Optional | FASTA | | | workflow name | **genome_length** | Int | Use to specify the expected genome length | | Optional | FASTA, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | +| workflow name | **kraken2_db** | File | A Kraken2 database in .tar.gz format. Must contain viral and human sequences. | gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz | Optional | CL, ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | workflow name | **max_genome_length** | Int | Maximum genome length able to pass read screening | 2673870 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | workflow name | **max_length** | Int | Maximum length for a read based on the SARS-CoV-2 primer scheme | 700 | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 | | workflow name | **medaka_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/artic-ncov2019:1.3.0-medaka-1.4.3 | Optional | CL | | @@ -447,7 +451,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | organism_parameters | **gene_locations_bed_file** | File | | | Optional | | organism_parameters | **genome_length_input** | Int | | | Optional | | organism_parameters | **hiv_primer_version** | String | | | Optional | - | organism_parameters | **kraken_target_organism_input** | String | | | Optional | + | organism_parameters | **kraken2_target_organism_input** | String | | 
| Optional | | organism_parameters | **primer_bed_file** | File | | | Optional | | organism_parameters | **reference_genome** | File | | | Optional | | organism_parameters | **reference_gff_file** | File | | | Optional | @@ -498,7 +502,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo |---|---|---| | gene_locations_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed"` | | genome_length_input | MPXV | `197200` | - | kraken_target_organism_input | MPXV | `"Monkeypox virus"` | + | kraken2_target_organism_input | MPXV | `"Monkeypox virus"` | | nextclade_dataset_name_input | MPXV | `"nextstrain/mpox/lineage-b.1"` | | nextclade_dataset_tag_input | MPXV | `"2024-04-19--07-50-39Z"` | | primer_bed_file | MPXV | `"gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed"` | @@ -512,7 +516,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| | genome_length_input | WNV | `11000` | | - | kraken_target_organism_input | WNV | `"West Nile virus`" | | + | kraken2_target_organism_input | WNV | `"West Nile virus"` | | | nextclade_dataset_name_input | WNV | `"NA"` | TheiaCoV's Nextclade currently does not support WNV | | nextclade_dataset_tag_input | WNV | `"NA"` | TheiaCoV's Nextclade currently does not support WNV | | primer_bed_file | WNV | `"gs://theiagen-public-files/terra/theiacov-files/WNV/WNV-L1_primer.bed"` | | @@ -562,7 +566,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_a | 16000 | - | kraken_target_organism | rsv_a | Respiratory syncytial virus | + | kraken2_target_organism | rsv_a | Respiratory syncytial virus | | nextclade_dataset_name_input | rsv_a | nextstrain/rsv/a/EPI_ISL_412866 | | nextclade_dataset_tag_input | rsv_a | 
2024-08-01--22-31-31Z | | reference_genome | rsv_a | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta | @@ -574,7 +578,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo | **Overwrite Variable Name** | **Organism** | **Default Value** | |---|---|---| | genome_length_input | rsv_b | 16000 | - | kraken_target_organism | rsv_b | "Human orthopneumovirus" | + | kraken2_target_organism | rsv_b | "Human orthopneumovirus" | | nextclade_dataset_name_input | rsv_b | nextstrain/rsv/b/EPI_ISL_1653999 | | nextclade_dataset_tag_input | rsv_b | "2024-08-01--22-31-31Z" | | reference_genome | rsv_b | gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta | @@ -585,7 +589,7 @@ The `organism_parameters` sub-workflow is the first step in all TheiaCoV workflo ??? toggle "HIV Defaults" | **Overwrite Variable Name** | **Organism** | **Default Value** | **Notes** | |---|---|---|---| - | kraken_target_organism_input | HIV | Human immunodeficiency virus 1 | | + | kraken2_target_organism_input | HIV | Human immunodeficiency virus 1 | | | genome_length_input | HIV-v1 | 9181 | This version of HIV originates from Oregon | | primer_bed_file | HIV-v1 | gs://theiagen-public-files/terra/hivgc-files/HIV-1_v1.0.primer.hyphen.bed | This version of HIV originates from Oregon | | reference_genome | HIV-v1 | gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.fasta | This version of HIV originates from Oregon | @@ -1085,16 +1089,17 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | ivar_vcf | File | iVar tsv output converted to VCF format | PE, SE | | ivar_version_consensus | String | Version of iVar for running the iVar consensus command | PE, SE | | ivar_version_primtrim | String | Version of iVar for running the iVar trim command | PE, SE | -| kraken_human | Float | Percent of human read data detected using the Kraken2 software | CL, ONT, PE, SE | -| kraken_human_dehosted | Float | Percent of human read data detected 
using the Kraken2 software after host removal | CL, ONT, PE | -| kraken_report | File | Full Kraken report | CL, ONT, PE, SE | -| kraken_report_dehosted | File | Full Kraken report after host removal | CL, ONT, PE | -| kraken_sc2 | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | -| kraken_target_organism | String | Percent of target organism read data detected using the Kraken2 software | CL, ONT, PE, SE | -| kraken_target_organism_dehosted | String | Percent of target organism read data detected using the Kraken2 software after host removal | CL, ONT, PE | -| kraken_target_organism_name | String | The name of the target organism; e.g., "Monkeypox" or "Human immunodeficiency virus" | CL, ONT, PE, SE | -| kraken_version | String | Version of Kraken software used | CL, ONT, PE, SE | +| kraken2_database | String | Database file used for Kraken2 analysis | CL, ONT, PE, SE | +| kraken2_human | Float | Percent of human read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken2_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken2_report | File | Full Kraken report | CL, ONT, PE, SE | +| kraken2_report_dehosted | File | Full Kraken report after host removal | CL, ONT, PE | +| kraken2_sc2 | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken2_sc2_dehosted | String | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | CL, ONT, PE | +| kraken2_target_organism | String | Percent of target organism read data detected using the Kraken2 software | CL, ONT, PE, SE | +| kraken2_target_organism_dehosted | String | Percent of target organism read data detected using the Kraken2 software after host removal | CL, ONT, 
PE | +| kraken2_target_organism_name | String | The name of the target organism; e.g., "Monkeypox" or "Human immunodeficiency virus" | CL, ONT, PE, SE | +| kraken2_version | String | Version of Kraken software used | CL, ONT, PE, SE | | meanbaseq_trim | Float | Mean quality of the nucleotide basecalls aligned to the reference genome after primer trimming | CL, ONT, PE, SE | | meanmapq_trim | Float | Mean quality of the mapped reads to the reference genome after primer trimming | CL, ONT, PE, SE | | medaka_reference | String | Reference sequence used in medaka task | CL, ONT | diff --git a/docs/workflows/genomic_characterization/theiaeuk.md b/docs/workflows/genomic_characterization/theiaeuk.md index 19141cd05..ab84d2f0b 100644 --- a/docs/workflows/genomic_characterization/theiaeuk.md +++ b/docs/workflows/genomic_characterization/theiaeuk.md @@ -95,12 +95,12 @@ All input reads are processed through "core tasks" in each workflow. The core ta | qc_check_task | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | | qc_check_task | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | qc_check_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" | Optional | -| qc_check_task | **kraken_human** | Float | Internal component, do not modify | | Do Not Modify, Optional | -| qc_check_task | **kraken_human_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | -| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do Not Modify, Optional | -| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | -| qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do Not Modify, Optional | -| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do 
Not Modify, Optional | +| qc_check_task | **kraken2_human** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken2_human_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken2_sc2** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken2_sc2_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken2_target_organism** | Float | Internal component, do not modify | | Do Not Modify, Optional | +| qc_check_task | **kraken2_target_organism_dehosted** | Float | Internal component, do not modify | | Do Not Modify, Optional | | qc_check_task | **meanbaseq_trim** | String | Internal component, do not modify | | Do Not Modify, Optional | | qc_check_task | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | qc_check_task | **midas_secondary_genus_abundance** | Int | Internal component, do not modify | | Do Not Modify, Optional | @@ -133,9 +133,9 @@ All input reads are processed through "core tasks" in each workflow. 
The core ta | read_QC_trim | **call_kraken** | Boolean | If true, Kraken2 is executed on the dataset | FALSE | Optional | | read_QC_trim | **call_midas** | Boolean | Internal component, do not modify | FALSE | Do Not Modify, Optional | | read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | --detect_adapter_for_pe -g -5 20 -3 20 | Optional | -| read_QC_trim | **kraken_db** | File | Database to use with kraken2 | | Optional | -| read_QC_trim | **kraken_disk_size** | Int | Amount of storage (in GB) to allocate to the task | | Optional | -| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | | Optional | +| read_QC_trim | **kraken2_db** | File | Database to use with kraken2 | | Optional | +| read_QC_trim | **kraken2_disk_size** | Int | Amount of storage (in GB) to allocate to the task | | Optional | +| read_QC_trim | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | | Optional | | read_QC_trim | **midas_db** | File | Internal component, do not modify | | Do Not Modify, Optional | | read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | | read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | Optional | diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md index 35422e658..c0d064623 100644 --- a/docs/workflows/genomic_characterization/theiaprok.md +++ b/docs/workflows/genomic_characterization/theiaprok.md @@ -471,12 +471,12 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | qc_check_task | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" | Optional | FASTA, ONT, PE, SE | | qc_check_task | **est_coverage_clean** | Float | Internal component, 
do not modify | | Do not modify, Optional | FASTA | | qc_check_task | **est_coverage_raw** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA | -| qc_check_task | **kraken_human** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | -| qc_check_task | **kraken_human_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | -| qc_check_task | **kraken_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | -| qc_check_task | **kraken_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | -| qc_check_task | **kraken_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | -| qc_check_task | **kraken_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_human** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_human_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_sc2** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_sc2_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_target_organism** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | +| qc_check_task | **kraken2_target_organism_dehosted** | Float | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | | qc_check_task | **meanbaseq_trim** | String | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT, PE, SE | | qc_check_task | **memory** 
| Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | FASTA, ONT, PE, SE | | qc_check_task | **midas_secondary_genus_abundance** | Int | Internal component, do not modify | | Do not modify, Optional | FASTA, ONT | @@ -512,15 +512,15 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al | raw_check_reads | **workflow_series** | String | Internal component, do not modify | | Do not modify, Optional | ONT, PE, SE | | read_QC_trim | **adapters** | File | A file containing the sequence of the adapters used during library preparation, used in the BBDuk task | | Optional | PE, SE | | read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | PE, SE | -| read_QC_trim | **call_kraken** | Boolean | Set to true to launch Kraken2; if true, you must provide a kraken_db | FALSE | Optional | ONT, PE, SE | +| read_QC_trim | **call_kraken** | Boolean | Set to true to launch Kraken2; if true, you must provide a kraken2_db | FALSE | Optional | ONT, PE, SE | | read_QC_trim | **call_midas** | Boolean | Set to true to launch Midas | TRUE | Optional | PE, SE | | read_QC_trim | **downsampling_coverage** | Float | The depth to downsample to with Rasusa | 150 | Optional | ONT | | read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | -g -5 20 -3 20 | Optional | SE | | read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | "--detect_adapter_for_pe -g -5 20 -3 20 | Optional | PE | -| read_QC_trim | **kraken_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | -| read_QC_trim | **kraken_db** | File | Kraken2 database file; must be provided in call_kraken is true | | Optional | ONT, PE, SE | -| read_QC_trim | **kraken_disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. 
Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | -| read_QC_trim | **kraken_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT, PE, SE | +| read_QC_trim | **kraken2_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | +| read_QC_trim | **kraken2_db** | File | Kraken2 database file; must be provided if call_kraken is true | | Optional | ONT, PE, SE | +| read_QC_trim | **kraken2_disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | +| read_QC_trim | **kraken2_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | ONT, PE, SE | | read_QC_trim | **max_length** | Int | Internal component, do not modify | | Do not modify, Optional | ONT | | read_QC_trim | **midas_db** | File | Midas database file | gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | PE, SE | | read_QC_trim | **min_length** | Int | Internal component, do not modify | | Do not modify, Optional | ONT | @@ -689,7 +689,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al Alternatively to `MIDAS`, the `Kraken2` task can also be turned on through setting the `call_kraken` input variable as `true` for the identification of reads to detect contamination with non-target taxa. - Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken_db optional input. 
A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken2_db optional input. A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). !!! techdetails "read_QC_trim Technical Details" diff --git a/docs/workflows/phylogenetic_construction/augur.md b/docs/workflows/phylogenetic_construction/augur.md index 7ccf78d56..166613893 100644 --- a/docs/workflows/phylogenetic_construction/augur.md +++ b/docs/workflows/phylogenetic_construction/augur.md @@ -226,7 +226,7 @@ This workflow runs on the set level. Please note that for every task, runtime pa | organism_parameters | **gene_locations_bed_file** | File | Use to provide locations of interest where average coverage will be calculated | Defaults are organism-specific. Please find default values for some organisms here: . For an organism without set defaults, an empty file is provided, "gs://theiagen-public-files/terra/theiacov-files/empty.bed", but will not be as useful as an organism specific gene locations bed file. | Optional | | organism_parameters | **genome_length_input** | Int | Use to specify the expected genome length; provided by default for all supported organisms | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the genome length input must be provided otherwise the workflow fails. | Optional, Required | | organism_parameters | **hiv_primer_version** | String | The version of HIV primers used. Options are and . 
This input is ignored if provided for TheiaCoV_Illumina_SE and TheiaCoV_ClearLabs | v1 | Optional | -| organism_parameters | **kraken_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is "". | Optional | +| organism_parameters | **kraken2_target_organism_input** | String | The organism whose abundance the user wants to check in their reads. This should be a proper taxonomic name recognized by the Kraken database. | Defaults are organism-specific. Please find default values for all organisms here: . For an organism without set defaults, the default is "". | Optional | | organism_parameters | **nextclade_dataset_name_input** | String | NextClade organism dataset name | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default is "NA". | Optional | | organism_parameters | **nextclade_dataset_tag_input** | String | NextClade organism dataset tag | Defaults are organism-specific. Please find default values for all organisms (and for Flu - their respective genome segments and subtypes) here: . For an organism without set defaults, the default is "NA". 
| Optional | | organism_parameters | **pangolin_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.26 | Optional | diff --git a/docs/workflows/standalone/kraken2.md b/docs/workflows/standalone/kraken2.md index df36e56a1..570c344c0 100644 --- a/docs/workflows/standalone/kraken2.md +++ b/docs/workflows/standalone/kraken2.md @@ -35,10 +35,11 @@ Besides the data input types, there are minimal differences between these two wo | Database name | Database Description | Suggested Applications | GCP URI (for usage in Terra) | Source | Database Size (GB) | Date of Last Update | | --- | --- | --- | --- | --- | --- | --- | | **Kalamari v5.1** | Kalamari is a database of complete public assemblies, that has been fine-tuned for enteric pathogens and is backed by trusted institutions. [Full list available here ( in chromosomes.tsv and plasmids.tsv)](https://github.com/lskatz/Kalamari/tree/master/src) | Single-isolate enteric bacterial pathogen analysis (Salmonella, Escherichia, Shigella, Listeria, Campylobacter, Vibrio, Yersinia) | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2.kalamari_5.1.tar.gz`** | ‣ | 1.5 | 18/5/2022 | -| **standard 8GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 8GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_08gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 7.5 | 12/1/2024 | -| **standard 16GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 16GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_16gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 15 | 12/1/2024 | | **standard** | Standard 
RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 72 | 18/4/2023 | +| **standard 16GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 16GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_16gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 15 | 12/1/2024 | +| **standard 8GB** | Standard RefSeq database (archaea, bacteria, viral, plasmid, human, UniVec_Core) capped at 8GB | Prokaryotic or viral organisms, but for enteric pathogens, we recommend Kalamari | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_standard_08gb_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 7.5 | 12/1/2024 | | **viral** | RefSeq viral | Viral metagenomics | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 0.6 | 12/1/2024 | +| **viral with human** | Refseq viral plus human (GRCh38) | Viral metagenomics | **`gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz`** | Theiagen Genomics | 2.76 | 10/7/2024 | | **EuPathDB48** | Eukaryotic pathogen genomes with contaminants removed. [Full list available here](https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113/EuPathDB48_Contents.txt) | Eukaryotic organisms (Candida spp., Aspergillus spp., etc) | **`gs://theiagen-public-files-rp/terra/theiaprok-files/k2_eupathdb48_20201113.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 30.3 | 13/11/2020 | | **EuPathDB48** | Eukaryotic pathogen genomes with contaminants removed. 
[Full list available here](https://genome-idx.s3.amazonaws.com/kraken/k2_eupathdb48_20201113/EuPathDB48_Contents.txt) | Eukaryotic organisms (Candida spp., Aspergillus spp., etc) | **`gs://theiagen-large-public-files-rp/terra/databases/kraken/k2_eupathdb48_20230407.tar.gz`** | https://benlangmead.github.io/aws-indexes/k2 | 11 | 7/4/2023 | @@ -54,13 +55,13 @@ Besides the data input types, there are minimal differences between these two wo | *workflow_name | **read1** | File | | | Required | ONT, PE, SE | | *workflow_name | **read2** | File | | | Required for PE only | PE | | *workflow_name | **samplename** | String | | | Required | ONT, PE, SE | -| kraken2_pe or kraken2_se | **classified_out** | String | Allows user to rename the classified FASTQ files output. Must include .fastq as the suffix | classified#.fastq | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT, PE, SE | -| kraken2_pe or kraken2_se | **unclassified_out** | String | Allows user to rename unclassified FASTQ files output. Must include .fastq as the suffix | unclassified#.fastq | Optional | ONT, PE, SE | +| kraken2 | **classified_out** | String | Allows user to rename the classified FASTQ files output. 
Must include .fastq as the suffix | classified#.fastq | Optional | ONT, PE, SE | +| kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | ONT, PE, SE | +| kraken2 | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | ONT, PE, SE | +| kraken2 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | ONT, PE, SE | +| kraken2 | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | ONT, PE, SE | +| kraken2 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | ONT, PE, SE | +| kraken2 | **unclassified_out** | String | Allows user to rename unclassified FASTQ files output. Must include .fastq as the suffix | unclassified#.fastq | Optional | ONT, PE, SE | | krona | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | PE, SE | | krona | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | PE, SE | | krona | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/biocontainers/krona:2.7.1--pl526_5 | Optional | PE, SE | @@ -145,7 +146,13 @@ When assessing the taxonomic identity of a single isolate's sequence, it is norm [Krona](https://github.com/marbl/Krona) produces an interactive report that allows hierarchical data, such as the one from Kraken2, to be explored with zooming, multi-layered pie charts. These pie charts are intuitive and highly responsive. -Krona will only output hierarchical results for bacterial organisms in its current implementation. +!!! warning + + Krona will only output hierarchical results for **bacterial organisms** in its current implementation. + +!!! 
warning + + Krona is only available for Kraken reports generated with **Illumina data**, paired-end or single-end. ??? toggle "Example Krona report" @@ -158,4 +165,11 @@ Krona will only output hierarchical results for bacterial organisms in its curre | --- | --- | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | - | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://link.springer.com/article/10.1186/s13059-019-1891-0) | \ No newline at end of file + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://link.springer.com/article/10.1186/s13059-019-1891-0) | + +!!! techdetails "Krona Technical Details" + | | Links | + | --- | --- | + | Software Source Code | [Krona on GitHub](https://github.com/marbl/Krona) | + | Software Documentation | | + | Original Publication(s) | [Interactive metagenomic visualization in a Web browser](https://doi.org/10.1186/1471-2105-12-385) | \ No newline at end of file diff --git a/docs/workflows/standalone/ncbi_scrub.md b/docs/workflows/standalone/ncbi_scrub.md index 0ae60c49b..e2bdaa130 100644 --- a/docs/workflows/standalone/ncbi_scrub.md +++ b/docs/workflows/standalone/ncbi_scrub.md @@ -77,10 +77,10 @@ This workflow is composed of two tasks, one to dehost the input reads and anothe | **Variable** | **Type** | **Description** | **Workflow** | |---|---|---|---| -| kraken_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | PE, SE | -| kraken_report_dehosted | File | Full Kraken report after host removal | PE, SE | -| kraken_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | PE, SE | -| kraken_version_dehosted | String | Version of Kraken2 software used | PE, SE | +| kraken2_human_dehosted | Float | Percent of human read data detected using the Kraken2 software after host removal | PE, SE | +| 
kraken2_report_dehosted | File | Full Kraken report after host removal | PE, SE | +| kraken2_sc2_dehosted | Float | Percent of SARS-CoV-2 read data detected using the Kraken2 software after host removal | PE, SE | +| kraken2_version_dehosted | String | Version of Kraken2 software used | PE, SE | | ncbi_scrub_docker | String | Docker image used to run HRRT | PE, SE | | ncbi_scrub_human_spots_removed | Int | Number of spots removed (or masked) | PE, SE | | ncbi_scrub_pe_analysis_date | String | Date of analysis | PE, SE | diff --git a/docs/workflows/standalone/theiavalidate.md b/docs/workflows/standalone/theiavalidate.md index a9c051f9c..429b7321d 100644 --- a/docs/workflows/standalone/theiavalidate.md +++ b/docs/workflows/standalone/theiavalidate.md @@ -21,7 +21,7 @@ In order to enable this workflow to function for different workflow series, we r !!! tool "Validation Starting Points" | Workflow Series | Validation Criteria TSV | Columns to Compare | |---|---|---| - | TheiaCoV Workflows | [TheiaCov Validation Criteria](../../assets/files/theiavalidate/theiacov-validation-criteria.txt) | abricate_flu_subtype,abricate_flu_type,assembly_length_unambiguous,assembly_mean_coverage,irma_subtype,irma_type,kraken_human,kraken_human_dehosted,kraken_sc2,kraken_sc2_dehosted,kraken_target_org,kraken_target_org_dehosted,nextclade_aa_dels,nextclade_aa_subs,nextclade_clade,nextclade_lineage,nextclade_tamiflu_resistance_aa_subs,num_reads_clean1,num_reads_clean2,number_N,pango_lineage,percent_reference_coverage,vadr_num_alerts | + | TheiaCoV Workflows | [TheiaCov Validation Criteria](../../assets/files/theiavalidate/theiacov-validation-criteria.txt) | 
abricate_flu_subtype,abricate_flu_type,assembly_length_unambiguous,assembly_mean_coverage,irma_subtype,irma_type,kraken2_human,kraken2_human_dehosted,kraken2_sc2,kraken2_sc2_dehosted,kraken2_target_org,kraken2_target_org_dehosted,nextclade_aa_dels,nextclade_aa_subs,nextclade_clade,nextclade_lineage,nextclade_tamiflu_resistance_aa_subs,num_reads_clean1,num_reads_clean2,number_N,pango_lineage,percent_reference_coverage,vadr_num_alerts | | TheiaEuk Workflows | [TheiaEuk Validation Criteria](../../assets/files/theiavalidate/theiaeuk-validation-criteria.txt) | assembly_length,busco_results,clade_type,est_coverage_clean,est_coverage_raw,gambit_predicted_taxon,n50_value,num_reads_clean1,num_reads_clean2,number_contigs,quast_gc_percent,theiaeuk_snippy_variants_hits | | TheiaProk Workflows | [TheiaProk Validation Criteria](../../assets/files/theiavalidate/theiaprok-validation-criteria.txt) | abricate_abaum_plasmid_type_genes,agrvate_agr_group,amrfinderplus_amr_core_genes,amrfinderplus_amr_plus_genes,amrfinderplus_stress_genes,amrfinderplus_virulence_genes,ani_highest_percent,ani_top_species_match,assembly_length,busco_results,ectyper_predicted_serotype,emmtypingtool_emm_type,est_coverage_clean,est_coverage_raw,gambit_predicted_taxon,genotyphi_final_genotype,hicap_genes,hicap_serotype,kaptive_k_type,kleborate_genomic_resistance_mutations,kleborate_key_resistance_genes,kleborate_mlst_sequence_type,legsta_predicted_sbt,lissero_serotype,meningotype_serogroup,midas_primary_genus,midas_secondary_genus,midas_secondary_genus_abundance,n50_value,ngmaster_ngmast_sequence_type,ngmaster_ngstar_sequence_type,num_reads_clean1,num_reads_clean2,number_contigs,pasty_serogroup,pbptyper_predicted_1A_2B_2X,plasmidfinder_plasmids,poppunk_gps_cluster,seqsero2_predicted_serotype,seroba_ariba_serotype,seroba_serotype,serotypefinder_serotype,shigatyper_ipaB_presence_absence,shigatyper_predicted_serotype,shigeifinder_cluster,shigeifinder_serotype,sistr_predicted_serotype,sonneityping_final_genotype,s
patyper_type,srst2_vibrio_serogroup,staphopiasccmec_types_and_mecA_presence,tbprofiler_main_lineage,tbprofiler_resistance_genes,ts_mlst_predicted_st,virulencefinder_hits | diff --git a/tasks/quality_control/comparisons/task_qc_check_phb.wdl b/tasks/quality_control/comparisons/task_qc_check_phb.wdl index d76b05687..9ae8bdb8b 100644 --- a/tasks/quality_control/comparisons/task_qc_check_phb.wdl +++ b/tasks/quality_control/comparisons/task_qc_check_phb.wdl @@ -37,12 +37,12 @@ task qc_check_phb { Float? ani_highest_percent Float? ani_highest_percent_bases_aligned # theiacov inputs - Float? kraken_human - Float? kraken_human_dehosted - Float? kraken_sc2 - Float? kraken_sc2_dehosted - Float? kraken_target_organism - Float? kraken_target_organism_dehosted + Float? kraken2_human + Float? kraken2_human_dehosted + Float? kraken2_sc2 + Float? kraken2_sc2_dehosted + Float? kraken2_target_organism + Float? kraken2_target_organism_dehosted String? meanbaseq_trim Float? assembly_mean_coverage Int? number_N @@ -315,35 +315,35 @@ task qc_check_phb { qc_note, qc_status = compare(qc_note, "num_reads_clean2", int(~{num_reads_clean2}), ">=", int(taxon_df["num_reads_clean2"][0])) qc_check_metrics.remove("num_reads_clean2") - if ("kraken_human" in qc_check_metrics): # if this var is in the qc_check_metrics, - if ("~{kraken_human}"): # if kraken_human variable exists, - qc_note, qc_status = compare(qc_note, "kraken_human", float(~{kraken_human}), "<=", float(taxon_df["kraken_human"][0])) - qc_check_metrics.remove("kraken_human") - - if ("kraken_human_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, - if ("~{kraken_human_dehosted}"): # if kraken_human_dehosted variable exists, - qc_note, qc_status = compare(qc_note, "kraken_human_dehosted", float(~{kraken_human_dehosted}), "<=", float(taxon_df["kraken_human_dehosted"][0])) - qc_check_metrics.remove("kraken_human_dehosted") - - if ("kraken_sc2" in qc_check_metrics): # if this var is in the qc_check_metrics, - if 
("~{kraken_sc2}"): # if kraken_sc2 variable exists, - qc_note, qc_status = compare(qc_note, "kraken_sc2", float(~{kraken_sc2}), ">=", float(taxon_df["kraken_sc2"][0])) - qc_check_metrics.remove("kraken_sc2") - - if ("kraken_sc2_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, - if ("~{kraken_sc2_dehosted}"): # if kraken_sc2_dehosted variable exists, - qc_note, qc_status = compare(qc_note, "kraken_sc2_dehosted", float(~{kraken_sc2_dehosted}), ">=", float(taxon_df["kraken_sc2_dehosted"][0])) - qc_check_metrics.remove("kraken_sc2_dehosted") - - if ("kraken_target_organism" in qc_check_metrics): # if this var is in the qc_check_metrics, - if ("~{kraken_target_organism}"): # if kraken_target_organism variable exists, - qc_note, qc_status = compare(qc_note, "kraken_target_organismanism", float(~{kraken_target_organism}), ">=", float(taxon_df["kraken_target_organism"][0])) - qc_check_metrics.remove("kraken_target_organism") - - if ("kraken_target_organism_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, - if ("~{kraken_target_organism_dehosted}"): # if kraken_target_organism_dehosted variable exists, - qc_note, qc_status = compare(qc_note, "kraken_target_organism_dehosted", float(~{kraken_target_organism_dehosted}), ">=", float(taxon_df["kraken_target_organism_dehosted"][0])) - qc_check_metrics.remove("kraken_target_organism_dehosted") + if ("kraken2_human" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_human}"): # if kraken2_human variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_human", float(~{kraken2_human}), "<=", float(taxon_df["kraken2_human"][0])) + qc_check_metrics.remove("kraken2_human") + + if ("kraken2_human_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_human_dehosted}"): # if kraken2_human_dehosted variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_human_dehosted", float(~{kraken2_human_dehosted}), 
"<=", float(taxon_df["kraken2_human_dehosted"][0])) + qc_check_metrics.remove("kraken2_human_dehosted") + + if ("kraken2_sc2" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_sc2}"): # if kraken2_sc2 variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_sc2", float(~{kraken2_sc2}), ">=", float(taxon_df["kraken2_sc2"][0])) + qc_check_metrics.remove("kraken2_sc2") + + if ("kraken2_sc2_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_sc2_dehosted}"): # if kraken2_sc2_dehosted variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_sc2_dehosted", float(~{kraken2_sc2_dehosted}), ">=", float(taxon_df["kraken2_sc2_dehosted"][0])) + qc_check_metrics.remove("kraken2_sc2_dehosted") + + if ("kraken2_target_organism" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_target_organism}"): # if kraken2_target_organism variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_target_organismanism", float(~{kraken2_target_organism}), ">=", float(taxon_df["kraken2_target_organism"][0])) + qc_check_metrics.remove("kraken2_target_organism") + + if ("kraken2_target_organism_dehosted" in qc_check_metrics): # if this var is in the qc_check_metrics, + if ("~{kraken2_target_organism_dehosted}"): # if kraken2_target_organism_dehosted variable exists, + qc_note, qc_status = compare(qc_note, "kraken2_target_organism_dehosted", float(~{kraken2_target_organism_dehosted}), ">=", float(taxon_df["kraken2_target_organism_dehosted"][0])) + qc_check_metrics.remove("kraken2_target_organism_dehosted") if ("meanbaseq_trim" in qc_check_metrics): # if this var is in the qc_check_metrics, if ("~{meanbaseq_trim}"): # if meanbaseq_trim variable exists, diff --git a/tasks/taxon_id/contamination/task_kraken2.wdl b/tasks/taxon_id/contamination/task_kraken2.wdl index fb1522c75..6d574c724 100644 --- a/tasks/taxon_id/contamination/task_kraken2.wdl +++ 
b/tasks/taxon_id/contamination/task_kraken2.wdl @@ -1,84 +1,12 @@ version 1.0 -task kraken2_theiacov { - input { - File read1 - File? read2 - String samplename - String kraken2_db = "/kraken2-db" - Int cpu = 4 - Int memory = 8 - String? target_organism - Int disk_size = 100 - String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv" - } - command <<< - # date and version control - date | tee DATE - kraken2 --version | head -n1 | tee VERSION - num_reads=$(ls *fastq.gz 2> /dev/nul | wc -l) - if ! [ -z ~{read2} ]; then - mode="--paired" - fi - echo $mode - kraken2 $mode \ - --threads ~{cpu} \ - --db ~{kraken2_db} \ - ~{read1} ~{read2} \ - --report ~{samplename}_kraken2_report.txt \ - --output ~{samplename}.classifiedreads.txt - - # Compress and cleanup - gzip ~{samplename}.classifiedreads.txt - - percentage_human=$(grep "Homo sapiens" ~{samplename}_kraken2_report.txt | cut -f 1) - # | tee PERCENT_HUMAN - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) - # | tee PERCENT_COV - if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi - echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 - # capture target org percentage - if [ ! 
-z "~{target_organism}" ]; then - echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi - else - percent_target_organism="" - fi - echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM - - >>> - output { - String date = read_string("DATE") - String version = read_string("VERSION") - File kraken_report = "~{samplename}_kraken2_report.txt" - Float percent_human = read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") - String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") - String? kraken_target_organism = target_organism - File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" - String docker = docker_image - String database = kraken2_db - } - runtime { - docker: docker_image - memory: "~{memory} GB" - cpu: cpu - disks: "local-disk " + disk_size + " SSD" - disk: disk_size + " GB" # TES - preemptible: 0 - maxRetries: 3 - } -} - # standalone version (no default database included) task kraken2_standalone { input { File read1 File? read2 File kraken2_db + String? target_organism String samplename String docker = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db" String kraken2_args = "" @@ -128,7 +56,27 @@ task kraken2_standalone { percentage_human=$(grep "Homo sapiens" ~{samplename}.report.txt | cut -f 1) if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - + + # capture target org percentage + if [ ! 
-z "~{target_organism}" ]; then + echo "Target org designated: ~{target_organism}" + # if target organisms is sc2, report it in a special legacy column called PERCENT_SC2 + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi + else + percent_target_organism="" + percentage_sc2="" + fi + echo $percentage_sc2 | tee PERCENT_SC2 + echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM + # rename classified and unclassified read files if SE if [ -e "~{samplename}.classified#.fastq.gz" ]; then mv "~{samplename}.classified#.fastq.gz" ~{samplename}.classified_1.fastq.gz @@ -141,13 +89,16 @@ task kraken2_standalone { output { String kraken2_version = read_string("VERSION") String kraken2_docker = docker - String analysis_date = read_string("DATE") + String kraken2_analysis_date = read_string("DATE") File kraken2_report = "~{samplename}.report.txt" File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz" File? kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz" File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz" Float kraken2_percent_human = read_float("PERCENT_HUMAN") + String kraken2_percent_sc2 = read_string("PERCENT_SC2") + String kraken2_percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") + String? kraken2_target_organism = target_organism File? 
kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz" String kraken2_database = kraken2_db } @@ -206,30 +157,35 @@ task kraken2_parse_classified { # theiacov parsing blocks - percent human, sc2 and target organism percentage_human=$(grep "Homo sapiens" ~{samplename}.report_parsed.txt | cut -f 1) - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) - if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi - if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN - echo $percentage_sc2 | tee PERCENT_SC2 # capture target org percentage if [ ! -z "~{target_organism}" ]; then echo "Target org designated: ~{target_organism}" - percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) - if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + if [[ "~{target_organism}" == "Severe acute respiratory syndrome coronavirus 2" ]]; then + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 ) + percent_target_organism="" + if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi + else + percentage_sc2="" + percent_target_organism=$(grep "~{target_organism}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 ) + if [ -z "$percent_target_organism" ] ; then percent_target_organism="0" ; fi + fi else percent_target_organism="" + percentage_sc2="" fi - echo $percent_target_organism | tee PERCENT_TARGET_ORG + echo $percentage_sc2 | tee PERCENT_SC2 + echo $percent_target_organism | tee PERCENT_TARGET_ORGANISM >>> output { - File kraken_report = "~{samplename}.report_parsed.txt" + File kraken2_parsed_report = "~{samplename}.report_parsed.txt" Float percent_human = read_float("PERCENT_HUMAN") - Float percent_sc2 = read_float("PERCENT_SC2") - String percent_target_organism = read_string("PERCENT_TARGET_ORG") - String? 
kraken_target_organism = target_organism + String percent_sc2 = read_string("PERCENT_SC2") + String percent_target_organism = read_string("PERCENT_TARGET_ORGANISM") + String? kraken2_target_organism = target_organism } runtime { docker: docker diff --git a/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz new file mode 100644 index 000000000..4dc2a5ec2 Binary files /dev/null and b/tests/data/theiacov/databases/github_kraken2_test_db.tar.gz differ diff --git a/tests/inputs/theiacov/wf_theiacov_clearlabs.json b/tests/inputs/theiacov/wf_theiacov_clearlabs.json index 10351330a..e41f48ad8 100644 --- a/tests/inputs/theiacov/wf_theiacov_clearlabs.json +++ b/tests/inputs/theiacov/wf_theiacov_clearlabs.json @@ -3,5 +3,6 @@ "theiacov_clearlabs.read1": "tests/data/theiacov/fastqs/clearlabs/clearlabs.fastq.gz", "theiacov_clearlabs.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_clearlabs.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed" + "theiacov_clearlabs.organism_parameters.gene_locations_bed_file": "tests/inputs/sc2_gene_locations.bed", + "theiacov_clearlabs.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json index 467bcf94d..1558d5a7e 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_pe.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_pe.json @@ -5,5 +5,6 @@ "theiacov_illumina_pe.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_pe.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_pe.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_pe.reference_gene_locations_bed": 
"tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_pe.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_pe.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_illumina_se.json b/tests/inputs/theiacov/wf_theiacov_illumina_se.json index b9b4381de..b331c8339 100644 --- a/tests/inputs/theiacov/wf_theiacov_illumina_se.json +++ b/tests/inputs/theiacov/wf_theiacov_illumina_se.json @@ -4,5 +4,6 @@ "theiacov_illumina_se.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_illumina_se.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", "theiacov_illumina_se.reference_gff": "tests/inputs/completely-empty-for-test.txt", - "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_illumina_se.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_illumina_se.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/inputs/theiacov/wf_theiacov_ont.json b/tests/inputs/theiacov/wf_theiacov_ont.json index 4c551d73b..d275ad546 100644 --- a/tests/inputs/theiacov/wf_theiacov_ont.json +++ b/tests/inputs/theiacov/wf_theiacov_ont.json @@ -3,5 +3,6 @@ "theiacov_ont.read1": "tests/data/theiacov/fastqs/ont/ont.fastq.gz", "theiacov_ont.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed", "theiacov_ont.reference_genome": "tests/data/theiacov/reference/MN908947.fasta", - "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed" + "theiacov_ont.reference_gene_locations_bed": "tests/inputs/sc2_gene_locations.bed", + "theiacov_ont.kraken2_db": "tests/data/theiacov/databases/github_kraken2_test_db.tar.gz" } diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index 599fec45e..2bdbf04c9 100644 --- 
a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -149,7 +149,7 @@ - path: miniwdl_run/call-fastq_scan_raw_reads/work/clearlabs_fastq-scan.json md5sum: 869dd2e934c600bba35f30f08e2da7c9 - path: miniwdl_run/call-kraken2_dehosted/command - md5sum: 0f9db3341b5f58fb8d145d6d94222827 + md5sum: 6a4f2f3d3d1376edbfc7f8510717c8e6 - path: miniwdl_run/call-kraken2_dehosted/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_dehosted/outputs.json @@ -161,18 +161,18 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_dehosted", "done"] - path: miniwdl_run/call-kraken2_dehosted/work/DATE - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: miniwdl_run/call-kraken2_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_dehosted/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 37b34738ddce9734ae7857ca4d8ee6d3 - path: miniwdl_run/call-kraken2_dehosted/work/_miniwdl_inputs/0/clearlabs_R1_dehosted.fastq.gz - - path: miniwdl_run/call-kraken2_dehosted/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + - path: miniwdl_run/call-kraken2_dehosted/work/clearlabs.report.txt + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-kraken2_raw/command - md5sum: a9dabf08bff8e183fd792901ce24fc57 + md5sum: c25b4822de0e75fc5d80e98bd42b0710 - path: miniwdl_run/call-kraken2_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-kraken2_raw/outputs.json @@ -184,16 +184,16 @@ contains: ["wdl", "theiacov_clearlabs", "kraken2_raw", "done"] - path: miniwdl_run/call-kraken2_raw/work/DATE - path: 
miniwdl_run/call-kraken2_raw/work/PERCENT_HUMAN - md5sum: 4fd4dcef994592f9865e9bc8807f32f4 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-kraken2_raw/work/PERCENT_SC2 - md5sum: 9fc4759d176a0e0d240c418dbaaafeb2 + md5sum: 86b6b8aa9ad17f169f04c02b0e2bf1b1 - path: miniwdl_run/call-kraken2_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - path: miniwdl_run/call-kraken2_raw/work/VERSION - md5sum: 379b99c23325315c502e74614c035e7d + md5sum: 37b34738ddce9734ae7857ca4d8ee6d3 - path: miniwdl_run/call-kraken2_raw/work/_miniwdl_inputs/0/clearlabs.fastq.gz - - path: miniwdl_run/call-kraken2_raw/work/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 + - path: miniwdl_run/call-kraken2_raw/work/clearlabs.report.txt + md5sum: b66dbcf8d229c1b6fcfff4dd786068bd - path: miniwdl_run/call-ncbi_scrub_se/command contains: ["read1", "scrubber", "gzip"] - path: miniwdl_run/call-ncbi_scrub_se/inputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index 4c7542334..1fa6222bf 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -84,7 +84,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 dehosted - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/command - md5sum: 2031501aaf268d2987b6dbc3b8b32dfa + md5sum: bf23be02912876a001a0c0936cd1ebbd - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/inputs.json contains: ["read1", "read2", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/outputs.json @@ -97,14 +97,14 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_HUMAN md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_SC2 - md5sum: 
494a4bf9ab740c0a0fab64f670549883 + md5sum: 6baf8bb11094b9011d8dc34e66743712 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/SRR13687078_kraken2_report.txt - md5sum: 2ccc036a9a93b3cf096a5c4dda49a579 + - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_dehosted/work/SRR13687078.report.txt + md5sum: 565954ac2bb6ef427754de3b43430728 # kraken2 raw - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: a16205bdb8cf133a112c4552e8f67f97 + md5sum: 66318d2ef05bfba05da6ca9b78d54ba5 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -115,13 +115,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_pe", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 414f4efa514540a2527a4f27124575f2 + md5sum: 897316929176464ebc9ad085f31e7284 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 2bf2d20f083d8fa09abf6c25f8970e2e + md5sum: cfefab882d84cf0f2a1bde9c19eec318 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/SRR13687078_kraken2_report.txt - md5sum: 3544d9ca35d45093c03cdead46677765 + - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/SRR13687078.report.txt + md5sum: 8ea92e13d401e1c955336edfdcd4f1ba # ncbi scrub - path: miniwdl_run/call-read_QC_trim/call-ncbi_scrub_pe/command md5sum: 8c7ca800fa98305009cfb9116a4b60b8 diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml 
b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 0742c19a9..2f4597958 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -74,7 +74,7 @@ - path: miniwdl_run/call-read_QC_trim/call-fastq_scan_raw/work/VERSION # kraken2 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/command - md5sum: ca22e45a62c5c26c4447cdafe75a26ab + md5sum: 3ec415a3aeb60e3dd82b3d60edba6616 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/inputs.json contains: ["read1", "samplename"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/outputs.json @@ -85,13 +85,13 @@ - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/task.log contains: ["wdl", "theiacov_illumina_se", "kraken2_theiacov_raw", "done"] - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_HUMAN - md5sum: 1576d5d341223ea9d44b0b8a213bb9da + md5sum: 4fd4dcef994592f9865e9bc8807f32f4 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_SC2 - md5sum: 7cc2eb659e21f15fa902b11812eae1f6 + md5sum: adbe14d7547234f3743f80907ed33179 - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/PERCENT_TARGET_ORGANISM md5sum: 68b329da9893e34099c7d8ad5cb9c940 - - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/ERR6319327_kraken2_report.txt - md5sum: 9a089b8920e55c9cc7bc8cd7d18f9a8e + - path: miniwdl_run/call-read_QC_trim/call-kraken2_theiacov_raw/work/ERR6319327.report.txt + md5sum: cb58af9eb139d109b55ce65d6d2344d6 # clean read screen - path: miniwdl_run/call-clean_check_reads/command md5sum: 80a361915a627e86743baacfc383b2b5 diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 71f5bd4a2..3b5d0f249 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ 
-625,13 +625,13 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 98cdffa73c61276319d4607c8f81d0fc - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl md5sum: 4d69a6539b68503af9f3f1c2787ff920 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl - md5sum: 3cb5c86b15e931b0c0b98ed784386438 + md5sum: e027d8861100ff03545fa0c996b78ab4 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 88584182b..c3a5fdd53 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -588,16 +588,16 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 98cdffa73c61276319d4607c8f81d0fc - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl md5sum: 4d69a6539b68503af9f3f1c2787ff920 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl - md5sum: fdb66b59ac886501a4ae90a25cefd633 + md5sum: 22895fbf47f03043d868374df3a4a1fb - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl md5sum: ea5cff6eff8c2c42046cf2eae6f16b6f - path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl - md5sum: 
d11bfe33fdd96eab28892be5a01c1c7d + md5sum: 301184671a2e40ccfc0d311f1864e514 - path: miniwdl_run/workflow.log contains: ["wdl", "theiaprok_illumina_se", "NOTICE", "done"] diff --git a/workflows/freyja/wf_freyja_fastq.wdl b/workflows/freyja/wf_freyja_fastq.wdl index 9a4446bf8..72f65a22a 100644 --- a/workflows/freyja/wf_freyja_fastq.wdl +++ b/workflows/freyja/wf_freyja_fastq.wdl @@ -22,6 +22,8 @@ workflow freyja_fastq { String samplename Int? depth_cutoff Boolean ont = false + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" + } if (defined(read2)) { call read_qc_pe.read_QC_trim_pe as read_QC_trim_pe { @@ -30,7 +32,9 @@ workflow freyja_fastq { read1 = read1, read2 = select_first([read2]), trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + kraken2_db = kraken2_db, + target_organism = "Severe acute respiratory syndrome coronavirus 2" } } if (! defined(read2) && ! ont) { @@ -39,7 +43,9 @@ workflow freyja_fastq { samplename = samplename, read1 = read1, trim_min_length = trimmomatic_min_length, - workflow_series = "theiacov" + workflow_series = "theiacov", + kraken2_db = kraken2_db, + target_organism = "Severe acute respiratory syndrome coronavirus 2" } } if (ont) { @@ -57,7 +63,9 @@ workflow freyja_fastq { input: samplename = samplename, read1 = read1, - workflow_series = "theiacov" + workflow_series = "theiacov", + kraken2_db = kraken2_db, + target_organism = "Severe acute respiratory syndrome coronavirus 2" } call nanoplot_task.nanoplot as nanoplot_clean { input: @@ -171,13 +179,13 @@ workflow freyja_fastq { File read1_dehosted = select_first([read_QC_trim_pe.read1_dehosted, read_QC_trim_se.read1_dehosted, read_QC_trim_ont.read1_dehosted]) File? 
read2_dehosted = read_QC_trim_pe.read2_dehosted # Read QC - kraken outputs - all - String kraken_version = select_first([read_QC_trim_pe.kraken_version, read_QC_trim_se.kraken_version, read_QC_trim_ont.kraken_version]) - Float kraken_human = select_first([read_QC_trim_pe.kraken_human, read_QC_trim_se.kraken_human, read_QC_trim_ont.kraken_human]) - Float kraken_sc2 = select_first([read_QC_trim_pe.kraken_sc2, read_QC_trim_se.kraken_sc2, read_QC_trim_ont.kraken_sc2]) - String kraken_report = select_first([read_QC_trim_pe.kraken_report, read_QC_trim_se.kraken_report, read_QC_trim_ont.kraken_report]) - Float kraken_human_dehosted = select_first([read_QC_trim_pe.kraken_human_dehosted, read_QC_trim_se.kraken_human_dehosted, read_QC_trim_ont.kraken_human_dehosted]) - Float kraken_sc2_dehosted = select_first([read_QC_trim_pe.kraken_sc2_dehosted, read_QC_trim_se.kraken_sc2_dehosted, read_QC_trim_ont.kraken_sc2_dehosted]) - File kraken_report_dehosted = select_first([read_QC_trim_pe.kraken_report_dehosted, read_QC_trim_se.kraken_report_dehosted, read_QC_trim_ont.kraken_report_dehosted]) + String kraken2_version = select_first([read_QC_trim_pe.kraken2_version, read_QC_trim_se.kraken2_version, read_QC_trim_ont.kraken2_version]) + Float kraken2_human = select_first([read_QC_trim_pe.kraken2_human, read_QC_trim_se.kraken2_human, read_QC_trim_ont.kraken2_human]) + String kraken2_sc2 = select_first([read_QC_trim_pe.kraken2_sc2, read_QC_trim_se.kraken2_sc2, read_QC_trim_ont.kraken2_sc2]) + String kraken2_report = select_first([read_QC_trim_pe.kraken2_report, read_QC_trim_se.kraken2_report, read_QC_trim_ont.kraken2_report]) + Float kraken2_human_dehosted = select_first([read_QC_trim_pe.kraken2_human_dehosted, read_QC_trim_se.kraken2_human_dehosted, read_QC_trim_ont.kraken2_human_dehosted]) + String kraken2_sc2_dehosted = select_first([read_QC_trim_pe.kraken2_sc2_dehosted, read_QC_trim_se.kraken2_sc2_dehosted, read_QC_trim_ont.kraken2_sc2_dehosted]) + File kraken2_report_dehosted = 
select_first([read_QC_trim_pe.kraken2_report_dehosted, read_QC_trim_se.kraken2_report_dehosted, read_QC_trim_ont.kraken2_report_dehosted]) # Read Alignment - bwa outputs String? bwa_version = bwa.bwa_version String? alignment_method = alignment_method_technology diff --git a/workflows/standalone_modules/wf_kraken2_ont.wdl b/workflows/standalone_modules/wf_kraken2_ont.wdl index 87387d88f..1111373e6 100644 --- a/workflows/standalone_modules/wf_kraken2_ont.wdl +++ b/workflows/standalone_modules/wf_kraken2_ont.wdl @@ -1,7 +1,7 @@ version 1.0 import "../../tasks/task_versioning.wdl" as versioning -import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2 +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2_task workflow kraken2_ont_wf { meta { @@ -12,31 +12,31 @@ workflow kraken2_ont_wf { File read1 File kraken2_db } - call kraken2.kraken2_standalone as kraken2_se { + call kraken2_task.kraken2_standalone as kraken2 { input: samplename = samplename, read1 = read1, kraken2_db = kraken2_db } - call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances { + call kraken2_task.kraken2_parse_classified as kraken2_recalculate_abundances { input: samplename = samplename, - kraken2_report = kraken2_se.kraken2_report, - kraken2_classified_report = kraken2_se.kraken2_classified_report + kraken2_report = kraken2.kraken2_report, + kraken2_classified_report = kraken2.kraken2_classified_report } call versioning.version_capture { input: } output { # PHB Version Captures - String kraken2_se_wf_version = version_capture.phb_version - String kraken2_se_wf_analysis_date = version_capture.date + String kraken2_ont_wf_version = version_capture.phb_version + String kraken2_ont_wf_analysis_date = version_capture.date # Kraken2 - String kraken2_version = kraken2_se.kraken2_version - String kraken2_docker = kraken2_se.kraken2_docker - File kraken2_classified_report = kraken2_se.kraken2_classified_report - File kraken2_report = 
kraken2_recalculate_abundances.kraken_report - File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1 - File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1 + String kraken2_version = kraken2.kraken2_version + String kraken2_docker = kraken2.kraken2_docker + File kraken2_classified_report = kraken2.kraken2_classified_report + File kraken2_report = kraken2_recalculate_abundances.kraken2_parsed_report + File kraken2_unclassified_read1 = kraken2.kraken2_unclassified_read1 + File kraken2_classified_read1 = kraken2.kraken2_classified_read1 } } diff --git a/workflows/standalone_modules/wf_kraken2_pe.wdl b/workflows/standalone_modules/wf_kraken2_pe.wdl index d81b25ddf..02e797992 100644 --- a/workflows/standalone_modules/wf_kraken2_pe.wdl +++ b/workflows/standalone_modules/wf_kraken2_pe.wdl @@ -1,7 +1,7 @@ version 1.0 import "../../tasks/task_versioning.wdl" as versioning -import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2 +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2_task import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task workflow kraken2_pe_wf { @@ -14,7 +14,7 @@ workflow kraken2_pe_wf { File read2 File kraken2_db } - call kraken2.kraken2_standalone as kraken2_pe { + call kraken2_task.kraken2_standalone as kraken2 { input: samplename = samplename, read1 = read1, @@ -23,7 +23,7 @@ workflow kraken2_pe_wf { } call krona_task.krona as krona { input: - kraken2_report = kraken2_pe.kraken2_report, + kraken2_report = kraken2.kraken2_report, samplename = samplename } call versioning.version_capture { @@ -34,14 +34,14 @@ workflow kraken2_pe_wf { String kraken2_pe_wf_version = version_capture.phb_version String kraken2_pe_wf_analysis_date = version_capture.date # Kraken2 - String kraken2_version = kraken2_pe.kraken2_version - String kraken2_docker = kraken2_pe.kraken2_docker - File kraken2_report = kraken2_pe.kraken2_report - File kraken2_classified_report = 
kraken2_pe.kraken2_classified_report - File kraken2_unclassified_read1 = kraken2_pe.kraken2_unclassified_read1 - File kraken2_unclassified_read2 = select_first([kraken2_pe.kraken2_unclassified_read2]) - File kraken2_classified_read1 = kraken2_pe.kraken2_classified_read1 - File kraken2_classified_read2 = select_first([kraken2_pe.kraken2_classified_read2]) + String kraken2_version = kraken2.kraken2_version + String kraken2_docker = kraken2.kraken2_docker + File kraken2_report = kraken2.kraken2_report + File kraken2_classified_report = kraken2.kraken2_classified_report + File kraken2_unclassified_read1 = kraken2.kraken2_unclassified_read1 + File kraken2_unclassified_read2 = select_first([kraken2.kraken2_unclassified_read2]) + File kraken2_classified_read1 = kraken2.kraken2_classified_read1 + File kraken2_classified_read2 = select_first([kraken2.kraken2_classified_read2]) # Krona outputs String krona_version = krona.krona_version String krona_docker = krona.krona_docker diff --git a/workflows/standalone_modules/wf_kraken2_se.wdl b/workflows/standalone_modules/wf_kraken2_se.wdl index 9a9ea35f2..d6c951a28 100644 --- a/workflows/standalone_modules/wf_kraken2_se.wdl +++ b/workflows/standalone_modules/wf_kraken2_se.wdl @@ -1,7 +1,7 @@ version 1.0 import "../../tasks/task_versioning.wdl" as versioning -import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2 +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2_task import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task workflow kraken2_se_wf { @@ -13,7 +13,7 @@ workflow kraken2_se_wf { File read1 File kraken2_db } - call kraken2.kraken2_standalone as kraken2_se { + call kraken2_task.kraken2_standalone as kraken2 { input: samplename = samplename, read1 = read1, @@ -21,7 +21,7 @@ workflow kraken2_se_wf { } call krona_task.krona as krona { input: - kraken2_report = kraken2_se.kraken2_report, + kraken2_report = kraken2.kraken2_report, samplename = samplename } call 
versioning.version_capture { @@ -32,12 +32,12 @@ workflow kraken2_se_wf { String kraken2_se_wf_version = version_capture.phb_version String kraken2_se_wf_analysis_date = version_capture.date # Kraken2 - String kraken2_version = kraken2_se.kraken2_version - String kraken2_docker = kraken2_se.kraken2_docker - File kraken2_report = kraken2_se.kraken2_report - File kraken2_classified_report = kraken2_se.kraken2_classified_report - File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1 - File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1 + String kraken2_version = kraken2.kraken2_version + String kraken2_docker = kraken2.kraken2_docker + File kraken2_report = kraken2.kraken2_report + File kraken2_classified_report = kraken2.kraken2_classified_report + File kraken2_unclassified_read1 = kraken2.kraken2_unclassified_read1 + File kraken2_classified_read1 = kraken2.kraken2_classified_read1 # Krona outputs String krona_version = krona.krona_version String krona_docker = krona.krona_docker diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl index 3cbedd30a..8065ae419 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_pe.wdl @@ -9,6 +9,8 @@ workflow dehost_pe { String samplename File read1 File read2 + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" + } call ncbi_scrub.ncbi_scrub_pe { input: @@ -16,11 +18,13 @@ workflow dehost_pe { read1 = read1, read2 = read2 } - call kraken.kraken2_theiacov as kraken2 { + call kraken.kraken2_standalone as kraken2 { input: samplename = samplename, read1 = ncbi_scrub_pe.read1_dehosted, - read2 = ncbi_scrub_pe.read2_dehosted + read2 = ncbi_scrub_pe.read2_dehosted, + kraken2_db = kraken2_db, + target_organism = "Severe acute respiratory syndrome coronavirus 2" } call versioning.version_capture { input: @@ -32,9 +36,9 @@ 
workflow dehost_pe { File read2_dehosted = ncbi_scrub_pe.read2_dehosted Int ncbi_scrub_human_spots_removed = ncbi_scrub_pe.human_spots_removed String ncbi_scrub_docker = ncbi_scrub_pe.ncbi_scrub_docker - Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 - File kraken_report_dehosted = kraken2.kraken_report - String kraken_version_dehosted = kraken2.version + Float kraken2_human_dehosted = kraken2.kraken2_percent_human + String kraken2_sc2_dehosted = kraken2.kraken2_percent_sc2 + File kraken2_report_dehosted = kraken2.kraken2_report + String kraken2_version_dehosted = kraken2.kraken2_version } } \ No newline at end of file diff --git a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl index 7e0a25d88..3d44cb633 100644 --- a/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl +++ b/workflows/theiacov/updates/wf_ncbi_scrub_se.wdl @@ -8,16 +8,20 @@ workflow dehost_se { input { String samplename File read1 + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" + } call ncbi_scrub.ncbi_scrub_se { input: samplename = samplename, read1 = read1 } - call kraken.kraken2_theiacov as kraken2 { + call kraken.kraken2_standalone as kraken2 { input: samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted + read1 = ncbi_scrub_se.read1_dehosted, + kraken2_db = kraken2_db, + target_organism = "Severe acute respiratory syndrome coronavirus 2" } call versioning.version_capture { input: @@ -28,9 +32,9 @@ workflow dehost_se { File read1_dehosted = ncbi_scrub_se.read1_dehosted String ncbi_scrub_docker = ncbi_scrub_se.ncbi_scrub_docker Int ncbi_scrub_human_spots_removed = ncbi_scrub_se.human_spots_removed - Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 - String kraken_version_dehosted = kraken2.version - File kraken_report_dehosted = kraken2.kraken_report + 
Float kraken2_human_dehosted = kraken2.kraken2_percent_human + String kraken2_sc2_dehosted = kraken2.kraken2_percent_sc2 + String kraken2_version_dehosted = kraken2.kraken2_version + File kraken2_report_dehosted = kraken2.kraken2_report } } \ No newline at end of file diff --git a/workflows/theiacov/updates/wf_pangolin_update.wdl b/workflows/theiacov/updates/wf_pangolin_update.wdl index b777325db..2028cfec9 100644 --- a/workflows/theiacov/updates/wf_pangolin_update.wdl +++ b/workflows/theiacov/updates/wf_pangolin_update.wdl @@ -34,7 +34,7 @@ workflow pangolin_update { vadr_options = "", primer_bed_file = "gs://theiagen-public-files/terra/theiacov-files/empty.bed", gene_locations_bed_file = "gs://theiagen-public-files/terra/theiacov-files/empty.bed", - kraken_target_organism_input = "", + kraken2_target_organism_input = "", hiv_primer_version = "" } call pangolin.pangolin4 { diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 8932c983f..2569bc5d3 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -37,6 +37,8 @@ workflow theiacov_clearlabs { String? target_organism # qc check parameters File? 
qc_check_table + # kraken inputs + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" } call set_organism_defaults.organism_parameters { input: @@ -44,7 +46,7 @@ workflow theiacov_clearlabs { reference_genome = reference_genome, nextclade_dataset_tag_input = nextclade_dataset_tag, nextclade_dataset_name_input = nextclade_dataset_name, - kraken_target_organism_input = target_organism + kraken2_target_organism_input = target_organism } call fastq_scan.fastq_scan_se as fastq_scan_raw_reads { input: @@ -59,17 +61,19 @@ workflow theiacov_clearlabs { input: read1 = ncbi_scrub_se.read1_dehosted } - call kraken2.kraken2_theiacov as kraken2_raw { + call kraken2.kraken2_standalone as kraken2_raw { input: samplename = samplename, read1 = read1, - target_organism = organism_parameters.kraken_target_organism + target_organism = organism_parameters.kraken2_target_organism, + kraken2_db = kraken2_db } - call kraken2.kraken2_theiacov as kraken2_dehosted { + call kraken2.kraken2_standalone as kraken2_dehosted { input: samplename = samplename, read1 = ncbi_scrub_se.read1_dehosted, - target_organism = organism_parameters.kraken_target_organism + target_organism = organism_parameters.kraken2_target_organism, + kraken2_db = kraken2_db } call artic_consensus.consensus { input: @@ -139,12 +143,12 @@ workflow theiacov_clearlabs { expected_taxon = organism_parameters.standardized_organism, num_reads_raw1 = fastq_scan_raw_reads.read1_seq, num_reads_clean1 = fastq_scan_clean_reads.read1_seq, - kraken_human = kraken2_raw.percent_human, - # kraken_sc2 = kraken2_raw.percent_sc2, - # kraken_target_organism = kraken2_raw.percent_target_organism, - kraken_human_dehosted = kraken2_dehosted.percent_human, - # kraken_sc2_dehosted = kraken2_dehosted.percent_sc2, - # kraken_target_organism_dehosted = kraken2_dehosted.percent_target_organism, + kraken2_human = kraken2_raw.kraken2_percent_human, + # kraken2_sc2 = 
kraken2_raw.percent_sc2, + # kraken2_target_organism = kraken2_raw.percent_target_organism, + kraken2_human_dehosted = kraken2_dehosted.kraken2_percent_human, + # kraken2_sc2_dehosted = kraken2_dehosted.percent_sc2, + # kraken2_target_organism_dehosted = kraken2_dehosted.percent_target_organism, meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq, assembly_mean_coverage = stats_n_coverage_primtrim.depth, number_N = consensus_qc.number_N, @@ -172,16 +176,17 @@ workflow theiacov_clearlabs { Int fastq_scan_num_reads_clean1 = fastq_scan_clean_reads.read1_seq String fastq_scan_version = fastq_scan_raw_reads.version # Read QC - kraken outputs - String kraken_version = kraken2_raw.version - Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 - String kraken_target_organism = kraken2_raw.percent_target_organism - String kraken_target_organism_name = organism_parameters.kraken_target_organism - File kraken_report = kraken2_raw.kraken_report - Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 - String kraken_target_organism_dehosted = kraken2_dehosted.percent_target_organism - File kraken_report_dehosted = kraken2_dehosted.kraken_report + String kraken2_version = kraken2_raw.kraken2_version + Float kraken2_human = kraken2_raw.kraken2_percent_human + String kraken2_sc2 = kraken2_raw.kraken2_percent_sc2 + String kraken2_target_organism = kraken2_raw.kraken2_percent_target_organism + String kraken2_target_organism_name = organism_parameters.kraken2_target_organism + File kraken2_report = kraken2_raw.kraken2_report + Float kraken2_human_dehosted = kraken2_dehosted.kraken2_percent_human + String kraken2_sc2_dehosted = kraken2_dehosted.kraken2_percent_sc2 + String kraken2_target_organism_dehosted = kraken2_dehosted.kraken2_percent_target_organism + File kraken2_report_dehosted = kraken2_dehosted.kraken2_report + String kraken2_database = kraken2_dehosted.kraken2_database 
# Read Alignment - Artic consensus outputs File aligned_bam = consensus.trim_sorted_bam File aligned_bai = consensus.trim_sorted_bai diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index ece3c201a..dbaf775ac 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -63,6 +63,7 @@ workflow theiacov_illumina_pe { String? pangolin_docker_image # kraken parameters String? target_organism + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" # qc check parameters File? qc_check_table } @@ -81,7 +82,7 @@ workflow theiacov_illumina_pe { vadr_mem = vadr_memory, primer_bed_file = primer_bed, pangolin_docker_image = pangolin_docker_image, - kraken_target_organism_input = target_organism + kraken2_target_organism_input = target_organism } call screen.check_reads as raw_check_reads { input: @@ -110,7 +111,9 @@ workflow theiacov_illumina_pe { trim_min_length = trim_min_length, trim_quality_min_score = trim_quality_min_score, trim_window_size = trim_window_size, - target_organism = organism_parameters.kraken_target_organism + target_organism = organism_parameters.kraken2_target_organism, + kraken2_db = kraken2_db, + call_kraken2 = true } call screen.check_reads as clean_check_reads { input: @@ -227,8 +230,8 @@ workflow theiacov_illumina_pe { num_reads_raw2 = read_QC_trim.fastq_scan_raw2, num_reads_clean1 = read_QC_trim.fastq_scan_clean1, num_reads_clean2 = read_QC_trim.fastq_scan_clean2, - kraken_human = read_QC_trim.kraken_human, - kraken_human_dehosted = read_QC_trim.kraken_human_dehosted, + kraken2_human = read_QC_trim.kraken2_human, + kraken2_human_dehosted = read_QC_trim.kraken2_human_dehosted, meanbaseq_trim = ivar_consensus.meanbaseq_trim, assembly_mean_coverage = ivar_consensus.assembly_mean_coverage, number_N = consensus_qc.number_N, @@ -287,16 +290,17 @@ workflow 
theiacov_illumina_pe { File? read1_dehosted = read_QC_trim.read1_dehosted File? read2_dehosted = read_QC_trim.read2_dehosted # Read QC - kraken outputs - String? kraken_version = read_QC_trim.kraken_version - Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 - String? kraken_target_organism = read_QC_trim.kraken_target_organism - String? kraken_target_organism_name = read_QC_trim.kraken_target_organism_name - File? kraken_report = read_QC_trim.kraken_report - Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted - String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted - File? kraken_report_dehosted = read_QC_trim.kraken_report_dehosted + String? kraken2_version = read_QC_trim.kraken2_version + Float? kraken2_human = read_QC_trim.kraken2_human + String? kraken2_sc2 = read_QC_trim.kraken2_sc2 + String? kraken2_target_organism = read_QC_trim.kraken2_target_organism + String? kraken2_target_organism_name = read_QC_trim.kraken2_target_organism_name + File? kraken2_report = read_QC_trim.kraken2_report + Float? kraken2_human_dehosted = read_QC_trim.kraken2_human_dehosted + String? kraken2_sc2_dehosted = read_QC_trim.kraken2_sc2_dehosted + String? kraken2_target_organism_dehosted = read_QC_trim.kraken2_target_organism_dehosted + File? kraken2_report_dehosted = read_QC_trim.kraken2_report_dehosted + String? kraken2_database = read_QC_trim.kraken2_database # Read Alignment - bwa outputs String? bwa_version = ivar_consensus.bwa_version String? samtools_version = ivar_consensus.samtools_version diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index fa1044c24..84e1f6a18 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -59,6 +59,9 @@ workflow theiacov_illumina_se { String? 
pangolin_docker_image # qc check parameters File? qc_check_table + # Kraken parameters + String? target_organism + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" } call set_organism_defaults.organism_parameters { input: @@ -74,7 +77,8 @@ workflow theiacov_illumina_se { vadr_options = vadr_options, vadr_mem = vadr_memory, primer_bed_file = primer_bed, - pangolin_docker_image = pangolin_docker_image + pangolin_docker_image = pangolin_docker_image, + kraken2_target_organism_input = target_organism } call screen.check_reads_se as raw_check_reads { input: @@ -101,7 +105,9 @@ workflow theiacov_illumina_se { adapters = adapters, phix = phix, workflow_series = "theiacov", - target_organism = organism_parameters.kraken_target_organism + target_organism = organism_parameters.kraken2_target_organism, + kraken2_db = kraken2_db, + call_kraken2 = true } call screen.check_reads_se as clean_check_reads { input: @@ -187,7 +193,7 @@ workflow theiacov_illumina_se { expected_taxon = organism_parameters.standardized_organism, num_reads_raw1 = read_QC_trim.fastq_scan_raw1, num_reads_clean1 = read_QC_trim.fastq_scan_clean1, - kraken_human = read_QC_trim.kraken_human, + kraken2_human = read_QC_trim.kraken2_human, meanbaseq_trim = ivar_consensus.meanbaseq_trim, assembly_mean_coverage = ivar_consensus.assembly_mean_coverage, number_N = consensus_qc.number_N, @@ -231,17 +237,18 @@ workflow theiacov_illumina_se { # Read QC - bbduk outputs File? read1_clean = read_QC_trim.read1_clean String? bbduk_docker = read_QC_trim.bbduk_docker - # Read QC - kraken outputs - String? kraken_version = read_QC_trim.kraken_version - Float? kraken_human = read_QC_trim.kraken_human - Float? kraken_sc2 = read_QC_trim.kraken_sc2 - String? kraken_target_organism = read_QC_trim.kraken_target_organism - String? kraken_target_organism_name = read_QC_trim.kraken_target_organism_name - File? 
kraken_report = read_QC_trim.kraken_report - Float? kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted - String? kraken_target_organism_dehosted = read_QC_trim.kraken_target_organism_dehosted - File? kraken_report_dehosted = read_QC_trim.kraken_report_dehosted + # Read QC - kraken2 outputs + Float? kraken2_human = read_QC_trim.kraken2_human + String? kraken2_sc2 = read_QC_trim.kraken2_sc2 + String? kraken2_target_organism = read_QC_trim.kraken2_target_organism + String? kraken2_target_organism_name = read_QC_trim.kraken2_target_organism_name + String? kraken2_version = read_QC_trim.kraken2_version + File? kraken2_report = read_QC_trim.kraken2_report + Float? kraken2_human_dehosted = read_QC_trim.kraken2_human_dehosted + String? kraken2_sc2_dehosted = read_QC_trim.kraken2_sc2_dehosted + String? kraken2_target_organism_dehosted = read_QC_trim.kraken2_target_organism_dehosted + File? kraken2_report_dehosted = read_QC_trim.kraken2_report_dehosted + String? kraken2_database = read_QC_trim.kraken2_database # Read Alignment - bwa outputs String? bwa_version = ivar_consensus.bwa_version String? samtools_version = ivar_consensus.samtools_version diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 984934062..f6ff06bd7 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -40,6 +40,7 @@ workflow theiacov_ont { Int? genome_length # kraken inputs String? 
target_organism + File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/kraken2_humanGRCh38_viralRefSeq_20240828.tar.gz" # read screen parameters Int min_reads = 57 # min basepairs / 300 (which is the longest available read length of an Illumina product) Int min_basepairs = 17000 # 10x coverage of hepatitis delta virus @@ -64,7 +65,7 @@ workflow theiacov_ont { reference_genome = reference_genome, gene_locations_bed_file = reference_gene_locations_bed, genome_length_input = genome_length, - kraken_target_organism_input = target_organism, + kraken2_target_organism_input = target_organism, nextclade_dataset_tag_input = nextclade_dataset_tag, nextclade_dataset_name_input = nextclade_dataset_name, vadr_max_length = vadr_max_length, @@ -100,8 +101,10 @@ workflow theiacov_ont { min_length = min_length, max_length = max_length, run_prefix = run_prefix, - target_organism = organism_parameters.kraken_target_organism, - workflow_series = "theiacov" + target_organism = organism_parameters.kraken2_target_organism, + workflow_series = "theiacov", + kraken2_db = kraken2_db, + call_kraken2 = true } call screen.check_reads_se as clean_check_reads { input: @@ -232,7 +235,7 @@ workflow theiacov_ont { expected_taxon = organism_parameters.standardized_organism, num_reads_raw1 = nanoplot_raw.num_reads, num_reads_clean1 = nanoplot_clean.num_reads, - kraken_human = read_qc_trim.kraken_human, + kraken2_human = read_qc_trim.kraken2_human, meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq, assembly_mean_coverage = stats_n_coverage_primtrim.depth, number_N = consensus_qc.number_N, @@ -283,19 +286,20 @@ workflow theiacov_ont { Float? nanoplot_r1_mean_q_clean = nanoplot_clean.mean_q Float? nanoplot_r1_median_q_clean = nanoplot_clean.median_q Float? nanoplot_r1_est_coverage_clean = nanoplot_clean.est_coverage - # Read QC - kraken outputs general - String? kraken_version = read_qc_trim.kraken_version - String? 
kraken_target_organism_name = read_qc_trim.kraken_target_organism_name - # Read QC - kraken outputs raw - Float? kraken_human = read_qc_trim.kraken_human - Float? kraken_sc2 = read_qc_trim.kraken_sc2 - String? kraken_target_organism = read_qc_trim.kraken_target_organism - File? kraken_report = read_qc_trim.kraken_report - # Read QC - kraken outputs dehosted - Float? kraken_human_dehosted = read_qc_trim.kraken_human_dehosted - Float? kraken_sc2_dehosted = read_qc_trim.kraken_sc2_dehosted - String? kraken_target_organism_dehosted = read_qc_trim.kraken_target_organism_dehosted - File? kraken_report_dehosted = read_qc_trim.kraken_report_dehosted + # Read QC - kraken2 outputs general + String? kraken2_version = read_qc_trim.kraken2_version + String? kraken2_target_organism_name = read_qc_trim.kraken2_target_organism_name + # Read QC - kraken2 outputs raw + Float? kraken2_human = read_qc_trim.kraken2_human + String? kraken2_sc2 = read_qc_trim.kraken2_sc2 + String? kraken2_target_organism = read_qc_trim.kraken2_target_organism + File? kraken2_report = read_qc_trim.kraken2_report + String? kraken2_database = read_qc_trim.kraken2_database + # Read QC - kraken2 outputs dehosted + Float? kraken2_human_dehosted = read_qc_trim.kraken2_human_dehosted + String? kraken2_sc2_dehosted = read_qc_trim.kraken2_sc2_dehosted + String? kraken2_target_organism_dehosted = read_qc_trim.kraken2_target_organism_dehosted + File? kraken2_report_dehosted = read_qc_trim.kraken2_report_dehosted # Read Alignment - Artic consensus outputs String assembly_fasta = select_first([consensus.consensus_seq, flu_track.irma_assembly_fasta, "Assembly could not be generated"]) File? 
aligned_bam = consensus.trim_sorted_bam diff --git a/workflows/theiameta/wf_theiameta_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_illumina_pe.wdl index 51f1a0054..c80f76a5e 100644 --- a/workflows/theiameta/wf_theiameta_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_illumina_pe.wdl @@ -5,7 +5,7 @@ import "../../tasks/alignment/task_minimap2.wdl" as minimap2_task import "../../tasks/assembly/task_semibin.wdl" as semibin_task import "../../tasks/quality_control/basic_statistics/task_quast.wdl" as quast_task import "../../tasks/task_versioning.wdl" as versioning -import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2_task import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task import "../../tasks/utilities/data_handling/task_parse_mapping.wdl" as parse_mapping_task import "../../tasks/assembly/task_metaspades.wdl" as metaspades_task @@ -24,7 +24,7 @@ workflow theiameta_illumina_pe { File kraken2_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/k2_standard_08gb_20230605.tar.gz" Boolean output_additional_files = false } - call kraken_task.kraken2_standalone as kraken2_raw { + call kraken2_task.kraken2_standalone as kraken2_raw { input: samplename = samplename, read1 = read1, @@ -45,12 +45,12 @@ workflow theiameta_illumina_pe { read1 = read1, read2 = read2, workflow_series = "theiameta", - kraken_db = kraken2_db, - call_kraken = false, - kraken_disk_size = 100, - kraken_memory = 8 + kraken2_db = kraken2_db, + call_kraken2 = false, + kraken2_disk_size = 100, + kraken2_memory = 8 } - call kraken_task.kraken2_standalone as kraken2_clean { + call kraken2_task.kraken2_standalone as kraken2_clean { input: samplename = samplename, read1 = read_QC_trim.read1_clean, diff --git a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl index d71c5e324..60d94aca0 100644 --- 
a/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl @@ -562,8 +562,8 @@ workflow theiaprok_illumina_pe { midas_secondary_genus = read_QC_trim.midas_secondary_genus, midas_secondary_genus_abundance = read_QC_trim.midas_secondary_genus_abundance, midas_secondary_genus_coverage = read_QC_trim.midas_secondary_genus_coverage, - kraken2_version = read_QC_trim.kraken_version, - kraken2_report = read_QC_trim.kraken_report, + kraken2_version = read_QC_trim.kraken2_version, + kraken2_report = read_QC_trim.kraken2_report, pasty_serogroup = merlin_magic.pasty_serogroup, pasty_serogroup_coverage = merlin_magic.pasty_serogroup_coverage, pasty_serogroup_fragments = merlin_magic.pasty_serogroup_fragments, @@ -658,11 +658,11 @@ workflow theiaprok_illumina_pe { String? midas_secondary_genus = read_QC_trim.midas_secondary_genus Float? midas_secondary_genus_abundance = read_QC_trim.midas_secondary_genus_abundance Float? midas_secondary_genus_coverage = read_QC_trim.midas_secondary_genus_coverage - # Read QC - kraken outputs - String? kraken2_version = read_QC_trim.kraken_version - String? kraken2_report = read_QC_trim.kraken_report - String? kraken2_database = read_QC_trim.kraken_database - String? kraken_docker = read_QC_trim.kraken_docker + # Read QC - kraken2 outputs + String? kraken2_version = read_QC_trim.kraken2_version + String? kraken2_report = read_QC_trim.kraken2_report + String? kraken2_database = read_QC_trim.kraken2_database + String? kraken2_docker = read_QC_trim.kraken2_docker # Assembly - shovill outputs File? assembly_fasta = shovill_pe.assembly_fasta File? 
contigs_gfa = shovill_pe.contigs_gfa diff --git a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl index 1c3eee081..0c936dad3 100644 --- a/workflows/theiaprok/wf_theiaprok_illumina_se.wdl +++ b/workflows/theiaprok/wf_theiaprok_illumina_se.wdl @@ -516,9 +516,9 @@ workflow theiaprok_illumina_se { midas_secondary_genus = read_QC_trim.midas_secondary_genus, midas_secondary_genus_abundance = read_QC_trim.midas_secondary_genus_abundance, midas_secondary_genus_coverage = read_QC_trim.midas_secondary_genus_coverage, - kraken2_version = read_QC_trim.kraken_version, - kraken2_docker = read_QC_trim.kraken_docker, - kraken2_report = read_QC_trim.kraken_report, + kraken2_version = read_QC_trim.kraken2_version, + kraken2_docker = read_QC_trim.kraken2_docker, + kraken2_report = read_QC_trim.kraken2_report, pasty_serogroup = merlin_magic.pasty_serogroup, pasty_serogroup_coverage = merlin_magic.pasty_serogroup_coverage, pasty_serogroup_fragments = merlin_magic.pasty_serogroup_fragments, @@ -600,10 +600,10 @@ workflow theiaprok_illumina_se { Float? midas_secondary_genus_abundance = read_QC_trim.midas_secondary_genus_abundance Float? midas_secondary_genus_coverage = read_QC_trim.midas_secondary_genus_coverage # Read QC - kraken outputs - String? kraken2_version = read_QC_trim.kraken_version - String? kraken2_report = read_QC_trim.kraken_report - String? kraken2_docker = read_QC_trim.kraken_docker - String? kraken2_database = read_QC_trim.kraken_database + String? kraken2_version = read_QC_trim.kraken2_version + String? kraken2_report = read_QC_trim.kraken2_report + String? kraken2_docker = read_QC_trim.kraken2_docker + String? kraken2_database = read_QC_trim.kraken2_database #Assembly - shovill outputs File? assembly_fasta = shovill_se.assembly_fasta File? 
contigs_gfa = shovill_se.contigs_gfa diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index a7eb9143e..644bc2b7a 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -581,10 +581,10 @@ workflow theiaprok_ont { String? nanoplot_version = nanoplot_raw.nanoplot_version String? nanoplot_docker = nanoplot_raw.nanoplot_docker # Read QC - kraken outputs - String? kraken2_version = read_qc_trim.kraken_version - String? kraken2_report = read_qc_trim.kraken_report - String? kraken2_database = read_qc_trim.kraken_database - String? kraken_docker = read_qc_trim.kraken_docker + String? kraken2_version = read_qc_trim.kraken2_version + String? kraken2_report = read_qc_trim.kraken2_report + String? kraken2_database = read_qc_trim.kraken2_database + String? kraken2_docker = read_qc_trim.kraken2_docker # Read QC - rasusa outputs String? rasusa_version = read_qc_trim.rasusa_version # Read QC - tiptoft outputs diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 3c88ccb0f..486aae111 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -36,7 +36,7 @@ workflow organism_parameters { String? pangolin_docker_image # kraken parameters - String? kraken_target_organism_input + String? kraken2_target_organism_input # augur parameters Int? 
min_num_unambig @@ -54,6 +54,7 @@ workflow organism_parameters { String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" String sc2_nextclade_ds_tag = "2024-07-17--12-57-03Z" String sc2_nextclade_ds_name = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" + String sc2_kraken2_target_organism = "Severe acute respiratory syndrome coronavirus 2" String sc2_pangolin_docker = "us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29" Int sc2_genome_len = 29903 Int sc2_vadr_max_length = 30000 @@ -67,7 +68,7 @@ workflow organism_parameters { String mpox_gene_locations_bed = "gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed" String mpox_nextclade_ds_tag = "2024-04-19--07-50-39Z" String mpox_nextclade_ds_name = "nextstrain/mpox/lineage-b.1" - String mpox_kraken_target_organism = "Monkeypox virus" + String mpox_kraken2_target_organism = "Monkeypox virus" String mpox_primer_bed_file = "gs://theiagen-public-files/terra/mpxv-files/MPXV.primer.bed" String mpox_reference_gff_file = "gs://theiagen-public-files/terra/mpxv-files/Mpox-MT903345.1.reference.gff3" String mpox_vadr_options = "--glsearch -s -r --nomisc --mkey mpxv --r_lowsimok --r_lowsimxd 100 --r_lowsimxl 2000 --alt_pass discontn,dupregin --out_allfasta --minimap2 --s_overhang 150" @@ -91,7 +92,7 @@ workflow organism_parameters { if (organism == "WNV" || organism == "wnv" || organism == "West Nile virus") { String wnv_org_name = "WNV" String wnv_reference_genome = "gs://theiagen-public-files/terra/theiacov-files/WNV/NC_009942.1_wnv_L1.fasta" - String wnv_kraken_target_organism = "West Nile virus" + String wnv_kraken2_target_organism = "West Nile virus" String wnv_primer_bed_file = "gs://theiagen-public-files/terra/theiacov-files/WNV/WNV-L1_primer.bed" Int wnv_genome_len = 11000 String wnv_vadr_options = "--mkey flavi --mdir /opt/vadr/vadr-models-flavi/ --nomisc --noprotid --out_allfasta" @@ -200,7 +201,7 @@ workflow organism_parameters { String 
rsv_a_nextclade_ds_tag = "2024-08-01--22-31-31Z" String rsv_a_nextclade_ds_name = "nextstrain/rsv/a/EPI_ISL_412866" Int rsv_a_genome_len = 15500 - String rsv_a_kraken_target_organism = "Respiratory syncytial virus" + String rsv_a_kraken2_target_organism = "Human respiratory syncytial virus A" String rsv_a_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_a_vadr_max_length = 15500 Int rsv_a_vadr_skip_length = 5000 @@ -224,7 +225,7 @@ workflow organism_parameters { String rsv_b_nextclade_ds_tag = "2024-08-01--22-31-31Z" String rsv_b_nextclade_ds_name = "nextstrain/rsv/b/EPI_ISL_1653999" Int rsv_b_genome_len = 15500 - String rsv_b_kraken_target_organism = "Human orthopneumovirus" + String rsv_b_kraken2_target_organism = "human respiratory syncytial virus" String rsv_b_vadr_options = "-r --mkey rsv --xnocomp" Int rsv_b_vadr_max_length = 15500 Int rsv_b_vadr_skip_length = 5000 @@ -279,7 +280,7 @@ workflow organism_parameters { Int vadr_memory = select_first([vadr_mem, sc2_vadr_memory, mpox_vadr_memory, wnv_vadr_memory, flu_vadr_memory, rsv_a_vadr_memory, rsv_b_vadr_memory, 0]) Int vadr_skiplength = select_first([vadr_skip_length, sc2_vadr_skip_length, mpox_vadr_skip_length, wnv_vadr_skip_length, flu_vadr_skip_length, rsv_a_vadr_skip_length, rsv_b_vadr_skip_length, 0]) # kraken options - String kraken_target_organism = select_first([kraken_target_organism_input, mpox_kraken_target_organism, wnv_kraken_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken_target_organism, rsv_b_kraken_target_organism, ""]) + String kraken2_target_organism = select_first([kraken2_target_organism_input, sc2_kraken2_target_organism, mpox_kraken2_target_organism, wnv_kraken2_target_organism, hiv_v1_target_organism, hiv_v2_target_organism, rsv_a_kraken2_target_organism, rsv_b_kraken2_target_organism, ""]) # augur options Int augur_min_num_unambig = select_first([min_num_unambig, mpox_min_num_unambig, flu_min_num_unambig, rsv_a_min_num_unambig, rsv_b_min_num_unambig, 0]) 
File augur_clades_tsv = select_first([clades_tsv, h1n1_ha_clades_tsv, h3n2_ha_clades_tsv, vic_ha_clades_tsv, yam_ha_clades_tsv, h5n1_ha_clades_tsv, rsv_a_clades_tsv, rsv_b_clades_tsv, mpox_clades_tsv, "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-clades.tsv"]) diff --git a/workflows/utilities/wf_read_QC_trim_ont.wdl b/workflows/utilities/wf_read_QC_trim_ont.wdl index c03141251..957b18206 100644 --- a/workflows/utilities/wf_read_QC_trim_ont.wdl +++ b/workflows/utilities/wf_read_QC_trim_ont.wdl @@ -29,11 +29,11 @@ workflow read_QC_trim_ont { # kraken inputs String? target_organism - Boolean call_kraken = false - Int? kraken_disk_size - Int? kraken_memory - Int? kraken_cpu - File? kraken_db + Boolean call_kraken2 = false + Int? kraken2_disk_size + Int? kraken2_memory + Int? kraken2_cpu + File? kraken2_db # rasusa downsampling Float downsampling_coverage = 150 @@ -52,52 +52,54 @@ workflow read_QC_trim_ont { max_length = max_length, run_prefix = run_prefix } - call kraken2.kraken2_theiacov as kraken2_raw { + call kraken2.kraken2_standalone as kraken2_theiacov_raw { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_raw { input: samplename = samplename, - kraken2_report = kraken2_raw.kraken_report, - kraken2_classified_report = kraken2_raw.kraken2_classified_report, + kraken2_report = kraken2_theiacov_raw.kraken2_report, + kraken2_classified_report = kraken2_theiacov_raw.kraken2_classified_report, target_organism = target_organism } - call kraken2.kraken2_theiacov as kraken2_dehosted { + call kraken2.kraken2_standalone as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = ncbi_scrub_se.read1_dehosted, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } call kraken2.kraken2_parse_classified as 
kraken2_recalculate_abundances_dehosted { input: samplename = samplename, - kraken2_report = kraken2_dehosted.kraken_report, - kraken2_classified_report = kraken2_dehosted.kraken2_classified_report, + kraken2_report = kraken2_theiacov_dehosted.kraken2_report, + kraken2_classified_report = kraken2_theiacov_dehosted.kraken2_classified_report, target_organism = target_organism } } if ("~{workflow_series}" == "theiaprok") { - if ((call_kraken) && defined(kraken_db)) { - call kraken2.kraken2_standalone as kraken2_se { + if ((call_kraken2) && defined(kraken2_db)) { + call kraken2.kraken2_standalone as kraken2_theiaprok { input: samplename = samplename, read1 = read1, - kraken2_db = select_first([kraken_db]), - disk_size = kraken_disk_size, - memory = kraken_memory, - cpu = kraken_cpu + kraken2_db = select_first([kraken2_db]), + disk_size = kraken2_disk_size, + memory = kraken2_memory, + cpu = kraken2_cpu } call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances { input: samplename = samplename, - kraken2_report = kraken2_se.kraken2_report, - kraken2_classified_report = kraken2_se.kraken2_classified_report + kraken2_report = kraken2_theiaprok.kraken2_report, + kraken2_classified_report = kraken2_theiaprok.kraken2_classified_report } - } if ((call_kraken) && ! defined(kraken_db)) { - String kraken_db_warning = "Kraken database not defined" + } if ((call_kraken2) && ! defined(kraken2_db)) { + String kraken2_db_warning = "Kraken2 database not defined" } # rasusa for random downsampling @@ -127,18 +129,18 @@ workflow read_QC_trim_ont { File? read1_dehosted = ncbi_scrub_se.read1_dehosted # kraken2 - theiacov and theiapro - String kraken_version = select_first([kraken2_raw.version, kraken2_se.kraken2_version, ""]) - String kraken_docker = select_first([kraken2_raw.docker, kraken2_se.kraken2_docker, ""]) - Float? kraken_human = kraken2_recalculate_abundances_raw.percent_human - Float? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 - String? 
kraken_target_organism = kraken2_recalculate_abundances_raw.percent_target_organism - String? kraken_target_organism_name = kraken2_raw.kraken_target_organism - String kraken_report = select_first([kraken2_recalculate_abundances_raw.kraken_report, kraken2_recalculate_abundances.kraken_report, ""]) - Float? kraken_human_dehosted = kraken2_recalculate_abundances_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 - String? kraken_target_organism_dehosted = kraken2_recalculate_abundances_dehosted.percent_target_organism - File? kraken_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken_report - String kraken_database = select_first([kraken2_raw.database, kraken2_se.kraken2_database, kraken_db_warning, ""]) + String kraken2_version = select_first([kraken2_theiacov_raw.kraken2_version, kraken2_theiaprok.kraken2_version, ""]) + String kraken2_docker = select_first([kraken2_theiacov_raw.kraken2_docker, kraken2_theiaprok.kraken2_docker, ""]) + Float? kraken2_human = kraken2_recalculate_abundances_raw.percent_human + String? kraken2_sc2 = kraken2_recalculate_abundances_raw.percent_sc2 + String? kraken2_target_organism = kraken2_recalculate_abundances_raw.percent_target_organism + String? kraken2_target_organism_name = kraken2_theiacov_raw.kraken2_target_organism + String kraken2_report = select_first([kraken2_recalculate_abundances_raw.kraken2_parsed_report, kraken2_recalculate_abundances.kraken2_parsed_report, ""]) + Float? kraken2_human_dehosted = kraken2_recalculate_abundances_dehosted.percent_human + String? kraken2_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2 + String? kraken2_target_organism_dehosted = kraken2_recalculate_abundances_dehosted.percent_target_organism + File? 
kraken2_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken2_parsed_report + String kraken2_database = select_first([kraken2_theiacov_raw.kraken2_database, kraken2_theiaprok.kraken2_database, kraken2_db_warning, ""]) # estimated genome length -- by default for TheiaProk this is 5Mb Int est_genome_length = genome_length diff --git a/workflows/utilities/wf_read_QC_trim_pe.wdl b/workflows/utilities/wf_read_QC_trim_pe.wdl index bb79c260d..57ea3fe36 100644 --- a/workflows/utilities/wf_read_QC_trim_pe.wdl +++ b/workflows/utilities/wf_read_QC_trim_pe.wdl @@ -24,11 +24,11 @@ workflow read_QC_trim_pe { Int bbduk_memory = 8 Boolean call_midas = false File? midas_db - Boolean call_kraken = false - Int? kraken_disk_size - Int? kraken_memory - Int? kraken_cpu - File? kraken_db + Boolean call_kraken2 = false + Int? kraken2_disk_size + Int? kraken2_memory + Int? kraken2_cpu + File? kraken2_db String? target_organism File? adapters File? phix @@ -47,19 +47,21 @@ workflow read_QC_trim_pe { } } if ("~{workflow_series}" == "theiacov") { - call kraken.kraken2_theiacov as kraken2_theiacov_raw { + call kraken.kraken2_standalone as kraken2_theiacov_raw { input: samplename = samplename, read1 = read1, read2 = read2, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } - call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { + call kraken.kraken2_standalone as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_pe.read1_dehosted]), read2 = ncbi_scrub_pe.read2_dehosted, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } } if (read_processing == "trimmomatic") { @@ -131,19 +133,19 @@ workflow read_QC_trim_pe { } } if ("~{workflow_series}" == "theiaprok") { - if ((call_kraken) && defined(kraken_db)) { - call kraken.kraken2_standalone { + if ((call_kraken2) && defined(kraken2_db)) { + call 
kraken.kraken2_standalone as kraken2_theiaprok { input: samplename = samplename, read1 = read1, read2 = read2, - kraken2_db = select_first([kraken_db]), - disk_size = kraken_disk_size, - memory = kraken_memory, - cpu = kraken_cpu + kraken2_db = select_first([kraken2_db]), + disk_size = kraken2_disk_size, + memory = kraken2_memory, + cpu = kraken2_cpu } - } if ((call_kraken) && ! defined(kraken_db)) { - String kraken_db_warning = "Kraken database not defined" + } if ((call_kraken2) && ! defined(kraken2_db)) { + String kraken2_db_warning = "Kraken2 database not defined" } } if ("~{workflow_series}" == "theiameta") { @@ -190,18 +192,18 @@ workflow read_QC_trim_pe { File? fastqc_clean2_html = fastqc_clean.read2_fastqc_html # kraken2 - theiacov and theiaprok - String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) - Float? kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 - String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism - String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) - Float? kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 - String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism - String? kraken_target_organism_name = target_organism - File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report - String kraken_docker = select_first([kraken2_theiacov_raw.docker, kraken2_standalone.kraken2_docker, ""]) - String kraken_database = select_first([kraken2_theiacov_raw.database, kraken2_standalone.kraken2_database, kraken_db_warning, ""]) + String kraken2_version = select_first([kraken2_theiacov_raw.kraken2_version, kraken2_theiaprok.kraken2_version, ""]) + Float? kraken2_human = kraken2_theiacov_raw.kraken2_percent_human + String? 
kraken2_sc2 = kraken2_theiacov_raw.kraken2_percent_sc2 + String? kraken2_target_organism = kraken2_theiacov_raw.kraken2_percent_target_organism + String kraken2_report = select_first([kraken2_theiacov_raw.kraken2_report, kraken2_theiaprok.kraken2_report, ""]) + Float? kraken2_human_dehosted = kraken2_theiacov_dehosted.kraken2_percent_human + String? kraken2_sc2_dehosted = kraken2_theiacov_dehosted.kraken2_percent_sc2 + String? kraken2_target_organism_dehosted = kraken2_theiacov_dehosted.kraken2_percent_target_organism + String? kraken2_target_organism_name = target_organism + File? kraken2_report_dehosted = kraken2_theiacov_dehosted.kraken2_report + String kraken2_docker = select_first([kraken2_theiacov_raw.kraken2_docker, kraken2_theiaprok.kraken2_docker, ""]) + String kraken2_database = select_first([kraken2_theiacov_raw.kraken2_database, kraken2_theiaprok.kraken2_database, kraken2_db_warning, ""]) # trimming versioning String? trimmomatic_version = trimmomatic_pe.version diff --git a/workflows/utilities/wf_read_QC_trim_se.wdl b/workflows/utilities/wf_read_QC_trim_se.wdl index d652014ce..dae1f254f 100644 --- a/workflows/utilities/wf_read_QC_trim_se.wdl +++ b/workflows/utilities/wf_read_QC_trim_se.wdl @@ -27,11 +27,11 @@ workflow read_QC_trim_se { String? trimmomatic_args Boolean call_midas = false File? midas_db - Boolean call_kraken = false - File? kraken_db - Int? kraken_disk_size - Int? kraken_memory - Int? kraken_cpu + Boolean call_kraken2 = false + File? kraken2_db + Int? kraken2_disk_size + Int? kraken2_memory + Int? 
kraken2_cpu String read_processing = "trimmomatic" # options: trimmomatic, fastp String read_qc = "fastq_scan" # options: fastq_scan, fastqc String fastp_args = "-g -5 20 -3 20" @@ -96,17 +96,19 @@ workflow read_QC_trim_se { } } if ("~{workflow_series}" == "theiacov") { - call kraken.kraken2_theiacov as kraken2_theiacov_raw { + call kraken.kraken2_standalone as kraken2_theiacov_raw { input: samplename = samplename, read1 = read1, - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } - call kraken.kraken2_theiacov as kraken2_theiacov_dehosted { + call kraken.kraken2_standalone as kraken2_theiacov_dehosted { input: samplename = samplename, read1 = select_first([ncbi_scrub_se.read1_dehosted]), - target_organism = target_organism + target_organism = target_organism, + kraken2_db = select_first([kraken2_db]) } } if ("~{workflow_series}" == "theiaprok") { @@ -120,18 +122,18 @@ workflow read_QC_trim_se { } } if ("~{workflow_series}" == "theiaprok") { - if ((call_kraken) && defined(kraken_db)) { - call kraken.kraken2_standalone { + if ((call_kraken2) && defined(kraken2_db)) { + call kraken.kraken2_standalone as kraken2_theiaprok { input: samplename = samplename, read1 = read1, - kraken2_db = select_first([kraken_db]), - disk_size = kraken_disk_size, - memory = kraken_memory, - cpu = kraken_cpu + kraken2_db = select_first([kraken2_db]), + disk_size = kraken2_disk_size, + memory = kraken2_memory, + cpu = kraken2_cpu } - } if ((call_kraken) && ! defined(kraken_db)) { - String kraken_db_warning = "Kraken database not defined" + } if ((call_kraken2) && ! defined(kraken2_db)) { + String kraken2_db_warning = "Kraken database not defined" } } output { @@ -159,18 +161,18 @@ workflow read_QC_trim_se { File? fastqc_clean1_html = fastqc_clean.read1_fastqc_html # kraken2 - raw and dehosted - String kraken_version = select_first([kraken2_theiacov_raw.version, kraken2_standalone.kraken2_version, ""]) - Float? 
kraken_human = kraken2_theiacov_raw.percent_human - Float? kraken_sc2 = kraken2_theiacov_raw.percent_sc2 - String? kraken_target_organism = kraken2_theiacov_raw.percent_target_organism - String kraken_report = select_first([kraken2_theiacov_raw.kraken_report, kraken2_standalone.kraken2_report, ""]) - Float? kraken_human_dehosted = kraken2_theiacov_dehosted.percent_human - Float? kraken_sc2_dehosted = kraken2_theiacov_dehosted.percent_sc2 - String? kraken_target_organism_dehosted = kraken2_theiacov_dehosted.percent_target_organism - String? kraken_target_organism_name = target_organism - File? kraken_report_dehosted = kraken2_theiacov_dehosted.kraken_report - String kraken_docker = select_first([kraken2_theiacov_raw.docker, kraken2_standalone.kraken2_docker, ""]) - String kraken_database = select_first([kraken2_theiacov_raw.database, kraken2_standalone.kraken2_database, kraken_db_warning, ""]) + String kraken2_version = select_first([kraken2_theiacov_raw.kraken2_version, kraken2_theiaprok.kraken2_version, ""]) + Float? kraken2_human = kraken2_theiacov_raw.kraken2_percent_human + String? kraken2_sc2 = kraken2_theiacov_raw.kraken2_percent_sc2 + String? kraken2_target_organism = kraken2_theiacov_raw.kraken2_percent_target_organism + String kraken2_report = select_first([kraken2_theiacov_raw.kraken2_report, kraken2_theiaprok.kraken2_report, ""]) + Float? kraken2_human_dehosted = kraken2_theiacov_dehosted.kraken2_percent_human + String? kraken2_sc2_dehosted = kraken2_theiacov_dehosted.kraken2_percent_sc2 + String? kraken2_target_organism_dehosted = kraken2_theiacov_dehosted.kraken2_percent_target_organism + String? kraken2_target_organism_name = target_organism + File? 
kraken2_report_dehosted = kraken2_theiacov_dehosted.kraken2_report + String kraken2_docker = select_first([kraken2_theiacov_raw.kraken2_docker, kraken2_theiaprok.kraken2_docker, ""]) + String kraken2_database = select_first([kraken2_theiacov_raw.kraken2_database, kraken2_theiaprok.kraken2_database, kraken2_db_warning, ""]) # trimming versioning String? trimmomatic_version = trimmomatic_se.version