From ac7ba44f5fdbd4476fd24254e00cdbe01d19a216 Mon Sep 17 00:00:00 2001 From: James Richard Otieno Date: Mon, 20 Nov 2023 22:11:01 -0500 Subject: [PATCH 1/3] TheiaCoV_ONT_PHB Influenza Track (#233) * adding flu ont task * update input JSON "read1" instead of "demultiplexed_reads" * updated irma docker image to v1.1.3 * added seq_method as input to irma task --- tasks/assembly/task_irma.wdl | 27 ++- tests/inputs/theiacov/wf_theiacov_ont.json | 2 +- .../theiacov/wf_theiacov_illumina_pe.wdl | 1 + workflows/theiacov/wf_theiacov_ont.wdl | 188 ++++++++++++------ 4 files changed, 151 insertions(+), 67 deletions(-) diff --git a/tasks/assembly/task_irma.wdl b/tasks/assembly/task_irma.wdl index 21dee0978..9b894902e 100644 --- a/tasks/assembly/task_irma.wdl +++ b/tasks/assembly/task_irma.wdl @@ -3,12 +3,12 @@ version 1.0 task irma { input { File read1 - File read2 + File? read2 + String seq_method String samplename Boolean keep_ref_deletions = true - String irma_module = "FLU" String read_basename = basename(read1) - String docker = "us-docker.pkg.dev/general-theiagen/staphb/irma:1.0.3" + String docker = "cdcgov/irma:v1.1.3" Int memory = 8 Int cpu = 4 Int disk_size = 100 @@ -17,7 +17,9 @@ task irma { date | tee DATE #capture reads as bash variables read1=~{read1} - read2=~{read2} + if [[ "~{read2}" ]]; then + read2=~{read2} + fi # set cat command based on compression if [[ "~{read1}" == *".gz" ]] ; then cat_reads="zcat" @@ -38,15 +40,20 @@ task irma { echo "Read headers may lead to IRMA failure; reformatting to meet IRMA input requirements" sra_id=$(echo "~{read_basename}" | awk -F "_" '{ print $1 }') eval "${cat_reads} ~{read1}" | awk '{print (NR%4 == 1) ? "@'${sra_id}'-" ++i " 1:1" : $0}' | gzip -c > "${sra_id}-irmafix_R1.fastq.gz" - eval "${cat_reads} ~{read2}" | awk '{print (NR%4 == 1) ? "@'${sra_id}'-" ++i " 2:2" : $0}' | gzip -c > "${sra_id}-irmafix_R2.fastq.gz" - #modify read variables read1="${sra_id}-irmafix_R1.fastq.gz" - read2="${sra_id}-irmafix_R2.fastq.gz" + if [[ "~{read2}" ]]; then + eval "${cat_reads} ~{read2}" | awk '{print (NR%4 == 1) ? "@'${sra_id}'-" ++i " 2:2" : $0}' | gzip -c > "${sra_id}-irmafix_R2.fastq.gz" + read2="${sra_id}-irmafix_R2.fastq.gz" + fi else echo "Read headers match IRMA formatting requirements" fi - # run IRMA - IRMA "~{irma_module}" "${read1}" "${read2}" ~{samplename} + # set IRMA module depending on sequencing technology + if [[ ~{seq_method} == "OXFORD_NANOPORE" ]]; then + IRMA "FLU-minion" "${read1}" ~{samplename} + else + IRMA "FLU" "${read1}" "${read2}" ~{samplename} + fi # capture IRMA type if compgen -G "~{samplename}/*fasta"; then echo "Type_"$(basename "$(echo "$(find ~{samplename}/*.fasta | head -n1)")" | cut -d_ -f1) > IRMA_TYPE @@ -102,4 +109,4 @@ task irma { maxRetries: 3 preemptible: 0 } -} \ No newline at end of file +} diff --git a/tests/inputs/theiacov/wf_theiacov_ont.json b/tests/inputs/theiacov/wf_theiacov_ont.json index 8182f4d35..b537a5e52 100644 --- a/tests/inputs/theiacov/wf_theiacov_ont.json +++ b/tests/inputs/theiacov/wf_theiacov_ont.json @@ -1,5 +1,5 @@ { "theiacov_ont.samplename": "ont", - "theiacov_ont.demultiplexed_reads": "tests/data/theiacov/fastqs/ont/ont.fastq.gz", + "theiacov_ont.read1": "tests/data/theiacov/fastqs/ont/ont.fastq.gz", "theiacov_ont.primer_bed": "tests/data/theiacov/primers/artic-v3.primers.bed" } diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index 8dff1ee42..5a2860ef3 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -133,6 +133,7 @@ workflow theiacov_illumina_pe { read1 = read_QC_trim.read1_clean, read2 = read_QC_trim.read2_clean, samplename = samplename, + seq_method = seq_method } if (defined(irma.irma_assemblies)) { call abricate.abricate_flu { diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 74b1759c8..31656e6b7 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -1,6 +1,7 @@ version 1.0 import "../../tasks/assembly/task_artic_consensus.wdl" as artic_consensus +import "../../tasks/assembly/task_irma.wdl" as irma_task import "../../tasks/quality_control/task_assembly_metrics.wdl" as assembly_metrics import "../../tasks/quality_control/task_vadr.wdl" as vadr_task import "../../tasks/quality_control/task_consensus_qc.wdl" as consensus_qc_task @@ -10,6 +11,7 @@ import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task import "../../tasks/species_typing/task_pangolin.wdl" as pangolin import "../../tasks/species_typing/task_quasitools.wdl" as quasitools import "../../tasks/gene_typing/task_sc2_gene_coverage.wdl" as sc2_calculation +import "../../tasks/gene_typing/task_abricate.wdl" as abricate import "../../tasks/quality_control/task_qc_check_phb.wdl" as qc_check import "../../tasks/task_versioning.wdl" as versioning import "../utilities/wf_read_QC_trim_ont.wdl" as read_qc_trim_workflow @@ -20,19 +22,29 @@ workflow theiacov_ont { } input { String samplename - File demultiplexed_reads - String organism = "sars-cov-2" + File read1 + String organism = "sars-cov-2" # options: "sars-cov-2", "HIV", "flu" # sequencing values String seq_method = "OXFORD_NANOPORE" - File primer_bed + File? primer_bed # assembly parameters Int normalise = 200 Int max_length = 700 Int min_length = 400 + Int min_depth = 20 # nextclade inputs + String nextclade_docker_image = "nextstrain/nextclade:2.14.0" String nextclade_dataset_reference = "MN908947" String nextclade_dataset_tag = "2023-09-21T12:00:00Z" String? nextclade_dataset_name + # nextclade flu inputs + String nextclade_flu_h1n1_ha_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_h1n1_na_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_h3n2_ha_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_h3n2_na_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_vic_ha_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_vic_na_tag = "2023-04-02T12:00:00Z" + String nextclade_flu_yam_tag = "2022-07-27T12:00:00Z" # reference values File? reference_genome Int? genome_length @@ -41,24 +53,27 @@ workflow theiacov_ont { # read screen parameters Int min_reads = 113 # min basepairs / 300 (which is the longest available read length of an Illumina product) Int min_basepairs = 34000 # 20x coverage of hepatitis delta virus - Int min_genome_size = 1700 # size of hepatitis delta virus - Int max_genome_size = 2673870 # size of Pandoravirus salinus + 200 kb + Int min_genome_length = 1700 # size of hepatitis delta virus + Int max_genome_length = 2673870 # size of Pandoravirus salinus + 200 kb Int min_coverage = 10 Boolean skip_screen = false Boolean skip_mash = false # qc check parameters File? qc_check_table } + call versioning.version_capture{ + input: + } if (organism == "HIV") { # set HIV specific artic version String run_prefix = "artic_hiv" } call screen.check_reads_se as raw_check_reads { input: - read1 = demultiplexed_reads, + read1 = read1, min_reads = min_reads, min_basepairs = min_basepairs, - min_genome_size = min_genome_size, - max_genome_size = max_genome_size, + min_genome_size = min_genome_length, + max_genome_size = max_genome_length, min_coverage = min_coverage, skip_screen = skip_screen, skip_mash = skip_mash, @@ -69,7 +84,7 @@ workflow theiacov_ont { if (raw_check_reads.read_screen == "PASS") { call read_qc_trim_workflow.read_QC_trim_ont as read_qc_trim { input: - read1 = demultiplexed_reads, + read1 = read1, samplename = samplename, genome_size = genome_length, min_length = min_length, @@ -83,8 +98,8 @@ workflow theiacov_ont { read1 = read_qc_trim.read1_clean, min_reads = min_reads, min_basepairs = min_basepairs, - min_genome_size = min_genome_size, - max_genome_size = max_genome_size, + min_genome_size = min_genome_length, + max_genome_size = max_genome_length, min_coverage = min_coverage, skip_screen = skip_screen, skip_mash = skip_mash, @@ -93,24 +108,62 @@ workflow theiacov_ont { expected_genome_size = genome_length } if (clean_check_reads.read_screen == "PASS") { - call artic_consensus.consensus { - input: - samplename = samplename, - organism = organism, - filtered_reads = read_qc_trim.read1_clean, - primer_bed = primer_bed, - normalise = normalise, - reference_genome = reference_genome + # assembly via artic_consensus for sars-cov-2 and HIV + if (organism != "flu") { + call artic_consensus.consensus { + input: + samplename = samplename, + organism = organism, + filtered_reads = read_qc_trim.read1_clean, + primer_bed = select_first([primer_bed]), + normalise = normalise, + reference_genome = reference_genome + } + call assembly_metrics.stats_n_coverage { + input: + samplename = samplename, + bamfile = consensus.sorted_bam + } + call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { + input: + samplename = samplename, + bamfile = consensus.trim_sorted_bam + } } + # assembly via irma for flu organisms + if (organism == "flu") { + call irma_task.irma { + input: + read1 = read_qc_trim.read1_clean, + samplename = samplename, + seq_method = seq_method + } + if (defined(irma.irma_assemblies)) { + call abricate.abricate_flu { + input: + assembly = select_first([irma.irma_assembly_fasta]), + samplename = samplename, + nextclade_flu_h1n1_ha_tag = nextclade_flu_h1n1_ha_tag, + nextclade_flu_h1n1_na_tag = nextclade_flu_h1n1_na_tag, + nextclade_flu_h3n2_ha_tag = nextclade_flu_h3n2_ha_tag, + nextclade_flu_h3n2_na_tag = nextclade_flu_h3n2_na_tag, + nextclade_flu_vic_ha_tag = nextclade_flu_vic_ha_tag, + nextclade_flu_vic_na_tag = nextclade_flu_vic_na_tag, + nextclade_flu_yam_tag = nextclade_flu_yam_tag + } + } + } + # consensus QC check call consensus_qc_task.consensus_qc { input: - assembly_fasta = consensus.consensus_seq, - reference_genome = reference_genome + assembly_fasta = select_first([irma.irma_assembly_fasta, consensus.consensus_seq]), + reference_genome = reference_genome, + genome_length = genome_length } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { input: - read1 = demultiplexed_reads, + read1 = read1, samplename = samplename, est_genome_size = select_first([genome_length, consensus_qc.number_Total]) } @@ -120,28 +173,18 @@ workflow theiacov_ont { samplename = samplename, est_genome_size = select_first([genome_length, consensus_qc.number_Total]) } - call assembly_metrics.stats_n_coverage { - input: - samplename = samplename, - bamfile = consensus.sorted_bam - } - call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { - input: - samplename = samplename, - bamfile = consensus.trim_sorted_bam - } if (organism == "sars-cov-2") { # sars-cov-2 specific tasks call pangolin.pangolin4 { input: samplename = samplename, - fasta = consensus.consensus_seq + fasta = select_first([consensus.consensus_seq]) } call sc2_calculation.sc2_gene_coverage { input: samplename = samplename, - bamfile = consensus.trim_sorted_bam, - min_depth = 20 + bamfile = select_first([consensus.trim_sorted_bam]), + min_depth = min_depth } } if (organism == "MPXV") { @@ -150,14 +193,16 @@ workflow theiacov_ont { if (organism == "WNV") { # WNV specific tasks (none yet, just adding as placeholder for future) } - if (organism == "MPXV" || organism == "sars-cov-2"){ + # run organism-specific typing + if (organism == "MPXV" || organism == "sars-cov-2" || organism == "flu" && select_first([abricate_flu.run_nextclade])){ # tasks specific to either MPXV or sars-cov-2 call nextclade_task.nextclade { input: - genome_fasta = consensus.consensus_seq, - dataset_name = select_first([nextclade_dataset_name, organism,]), - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag + docker = nextclade_docker_image, + genome_fasta = select_first([consensus.consensus_seq, irma.seg_ha_assembly]), + dataset_name = select_first([abricate_flu.nextclade_name_ha, nextclade_dataset_name, organism]), + dataset_reference = select_first([abricate_flu.nextclade_ref_ha, nextclade_dataset_reference]), + dataset_tag = select_first([abricate_flu.nextclade_ds_tag_ha, nextclade_dataset_tag]) } call nextclade_task.nextclade_output_parser { input: @@ -165,11 +210,32 @@ workflow theiacov_ont { organism = organism } } + if (organism == "flu" && select_first([abricate_flu.run_nextclade]) && defined(irma.seg_na_assembly)) { + # tasks specific to flu NA - run nextclade a second time + call nextclade_task.nextclade as nextclade_flu_na { + input: + docker = nextclade_docker_image, + genome_fasta = select_first([irma.seg_na_assembly]), + dataset_name = select_first([abricate_flu.nextclade_name_na, nextclade_dataset_name, organism]), + dataset_reference = select_first([abricate_flu.nextclade_ref_na, nextclade_dataset_reference]), + dataset_tag = select_first([abricate_flu.nextclade_ds_tag_na, nextclade_dataset_tag]) + } + call nextclade_task.nextclade_output_parser as nextclade_output_parser_flu_na { + input: + nextclade_tsv = nextclade_flu_na.nextclade_tsv, + organism = organism, + NA_segment = true + } + # concatenate tag, aa subs and aa dels for HA and NA segments + String ha_na_nextclade_ds_tag= "~{abricate_flu.nextclade_ds_tag_ha + ',' + abricate_flu.nextclade_ds_tag_na}" + String ha_na_nextclade_aa_subs= "~{nextclade_output_parser.nextclade_aa_subs + ',' + nextclade_output_parser_flu_na.nextclade_aa_subs}" + String ha_na_nextclade_aa_dels= "~{nextclade_output_parser.nextclade_aa_dels + ',' + nextclade_output_parser_flu_na.nextclade_aa_dels}" + } if (organism == "MPXV" || organism == "sars-cov-2" || organism == "WNV"){ # tasks specific to MPXV, sars-cov-2, and WNV call vadr_task.vadr { input: - genome_fasta = consensus.consensus_seq, + genome_fasta = select_first([consensus.consensus_seq]), assembly_length_unambiguous = consensus_qc.number_ATCG } } @@ -206,9 +272,6 @@ workflow theiacov_ont { } } } - call versioning.version_capture{ - input: - } output { # Version Capture String theiacov_ont_version = version_capture.phb_version @@ -249,10 +312,10 @@ workflow theiacov_ont { String? kraken_target_org_dehosted = read_qc_trim.kraken_target_org_dehosted File? kraken_report_dehosted = read_qc_trim.kraken_report_dehosted # Read Alignment - Artic consensus outputs + File? assembly_fasta = select_first([consensus.consensus_seq, irma.irma_assembly_fasta, ""]) File? aligned_bam = consensus.trim_sorted_bam File? aligned_bai = consensus.trim_sorted_bai - File? variants_from_ref_vcf = consensus.medaka_pass_vcf - File? assembly_fasta = consensus.consensus_seq + File? medaka_vcf = consensus.medaka_pass_vcf File? read1_aligned = consensus.reads_aligned File? read1_trimmed = consensus.trim_fastq # Read Alignment - Artic consensus versioning outputs @@ -260,7 +323,7 @@ workflow theiacov_ont { String? artic_docker = consensus.artic_pipeline_docker String? medaka_reference = consensus.medaka_reference String? primer_bed_name = consensus.primer_bed_name - String? assembly_method = "TheiaCoV (~{version_capture.phb_version}): ~{consensus.artic_pipeline_version}" + String? assembly_method = "TheiaCoV (~{version_capture.phb_version}): " + select_first([consensus.artic_pipeline_version, irma.irma_version, ""]) # Assembly QC - consensus assembly qc outputs File? consensus_stats = stats_n_coverage.stats File? consensus_flagstat = stats_n_coverage.flagstat @@ -291,16 +354,18 @@ workflow theiacov_ont { String? pangolin_docker = pangolin4.pangolin_docker String? pangolin_versions = pangolin4.pangolin_versions # Nextclade outputs - File? nextclade_json = nextclade.nextclade_json - File? auspice_json = nextclade.auspice_json - File? nextclade_tsv = nextclade.nextclade_tsv - String? nextclade_version = nextclade.nextclade_version - String? nextclade_docker = nextclade.nextclade_docker - String nextclade_ds_tag = nextclade_dataset_tag - String? nextclade_aa_subs = nextclade_output_parser.nextclade_aa_subs - String? nextclade_aa_dels = nextclade_output_parser.nextclade_aa_dels - String? nextclade_clade = nextclade_output_parser.nextclade_clade + String? nextclade_json = select_first([nextclade.nextclade_json, ""]) + String? auspice_json = select_first([ nextclade.auspice_json, ""]) + String? nextclade_tsv = select_first([nextclade.nextclade_tsv, ""]) + String? nextclade_version = select_first([nextclade.nextclade_version, ""]) + String? nextclade_docker = select_first([nextclade.nextclade_docker, ""]) + String? nextclade_ds_tag = select_first([ha_na_nextclade_ds_tag, abricate_flu.nextclade_ds_tag_ha, nextclade_dataset_tag, ""]) + String? nextclade_aa_subs = select_first([ha_na_nextclade_aa_subs, nextclade_output_parser.nextclade_aa_subs, ""]) + String? nextclade_aa_dels = select_first([ha_na_nextclade_aa_dels, nextclade_output_parser.nextclade_aa_dels, ""]) + String? nextclade_clade = select_first([nextclade_output_parser.nextclade_clade, ""]) String? nextclade_lineage = nextclade_output_parser.nextclade_lineage + # Nextclade Flu outputs - NA specific columns - tamiflu mutation + String? nextclade_tamiflu_resistance_aa_subs = nextclade_output_parser_flu_na.nextclade_tamiflu_aa_subs # VADR Annotation QC File? vadr_alerts_list = vadr.alerts_list String? vadr_num_alerts = vadr.num_alerts @@ -316,5 +381,16 @@ workflow theiacov_ont { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Flu Outputs + String? irma_version = irma.irma_version + String? irma_type = irma.irma_type + String? irma_subtype = irma.irma_subtype + File? irma_ha_segment = irma.seg_ha_assembly + File? irma_na_segment = irma.seg_na_assembly + String? abricate_flu_type = abricate_flu.abricate_flu_type + String? abricate_flu_subtype = abricate_flu.abricate_flu_subtype + File? abricate_flu_results = abricate_flu.abricate_flu_results + String? abricate_flu_database = abricate_flu.abricate_flu_database + String? abricate_flu_version = abricate_flu.abricate_flu_version } } \ No newline at end of file From 358941a58b82a4a5f894aa69fa54047242c038aa Mon Sep 17 00:00:00 2001 From: Sage Wright <40403716+sage-wright@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:44:03 -0500 Subject: [PATCH 2/3] TheiaCoV_FASTA_Batch: TheiaCoV_FASTA, for many samples at once (#238) * skellieton * continuation of skeleton workflow * very alpha iteration of the theiacov fasta wrangling and upload task * progress * newline issue * fix inputs * follow methods through * solve newline issue * to-dos * to-dos * fix organism logic * fix syntax * revert to wdl 1.0 * split pango and nextclade files * add to dockstore * enable table upload * fix old 1.1 syntax * not sure what went wrong with disk size before * disk size change for real this time * add slash to path * remove excess outputs * add check for nextclade json and pango lineage report existance before splitting and upload to google bucket * fix dastardly typo --------- Co-authored-by: cimendes --- .dockstore.yml | 5 + tasks/utilities/task_theiacov_fasta_batch.wdl | 225 ++++++++++++++++++ .../theiacov/wf_theiacov_fasta_batch.wdl | 88 +++++++ 3 files changed, 318 insertions(+) create mode 100644 tasks/utilities/task_theiacov_fasta_batch.wdl create mode 100644 workflows/theiacov/wf_theiacov_fasta_batch.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 86bb712e5..78a9464da 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -50,6 +50,11 @@ workflows: primaryDescriptorPath: /workflows/theiacov/wf_theiacov_fasta.wdl testParameterFiles: - empty.json + - name: TheiaCoV_FASTA_Batch_PHB + subclass: WDL + primaryDescriptorPath: /workflows/theiacov/wf_theiacov_fasta_batch.wdl + testParameterFiles: + - empty.json - name: Mercury_Prep_N_Batch_PHB subclass: WDL primaryDescriptorPath: /workflows/submission/wf_mercury_prep_n_batch.wdl diff --git a/tasks/utilities/task_theiacov_fasta_batch.wdl b/tasks/utilities/task_theiacov_fasta_batch.wdl new file mode 100644 index 000000000..3ec438d5f --- /dev/null +++ b/tasks/utilities/task_theiacov_fasta_batch.wdl @@ -0,0 +1,225 @@ +version 1.0 + +task sm_theiacov_fasta_wrangling { # the sm stands for supermassive + input { + String table_name + String workspace_name + String project_name + String bucket_name + + Array[Pair[String, File]] sample_to_fasta + String organism = "sars-cov-2" + + File? nextclade_tsv + File? nextclade_json + String? nextclade_docker + String? nextclade_version + String? nextclade_ds_tag + + File? pango_lineage_report + String? pangolin_docker + + String seq_platform + String assembly_method + String theiacov_fasta_analysis_date + String theiacov_fasta_version + + Int disk_size = 100 + } + command <<< + # convert the map into a JSON file for use in the python block + # example map: {ERR4439752.test: /mnt/miniwdl_task_container/work/_miniwdl_inputs/0/ERR4439752.ivar.consensus.fasta} + cp -v ~{write_json(sample_to_fasta)} sample_to_fasta.json + + # check if nextclade json file exists + if [ -f ~{nextclade_json} ]; then + # this line splits into individual json files + jq -c '.results = (.results[] | [.]) ' ~{nextclade_json} | awk '{ print > "out" NR ".json"}' + + # rename each individual json file with the sample name + for file in out*.json; do + samplename=$(jq -r '.results[].seqName' ${file}) + cp -v ${file} ${samplename}.nextclade.json + done + + # transfer all the json files to the bucket for access in Terra -- not sure if this works on Terra + gcloud storage cp -v *.nextclade.json gs://~{bucket_name}/theiacov_fasta_batch-~{theiacov_fasta_analysis_date}/nextclade_json/ + fi + + # check if pangolin lineage report file exists + if [ -f ~{pango_lineage_report} ]; then + # split the pangolin lineage report into individual csv files named by the taxon + awk 'BEGIN {FS=","} NR==1 {header=$0; next} {print header > $1".pangolin_report.csv" ; print $0 >> $1".pangolin_report.csv"}' ~{pango_lineage_report} + + # transfer all pangolin lineage report files to the bucket for access in Terra + gcloud storage cp -v *pangolin_report.csv gs://~{bucket_name}/theiacov_fasta_batch-~{theiacov_fasta_analysis_date}/pangolin_report/ + fi + + echo "DEBUG: Now entering Python block to perform parsing of data for populating sample-level table" + + python3 <>> + output { + File terra_table = "terra-table-to-upload.tsv" + } + runtime { + docker: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-28-v4" + memory: "8 GB" + cpu: 4 + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + preemptible: 0 + } +} \ No newline at end of file diff --git a/workflows/theiacov/wf_theiacov_fasta_batch.wdl b/workflows/theiacov/wf_theiacov_fasta_batch.wdl new file mode 100644 index 000000000..5934c93cc --- /dev/null +++ b/workflows/theiacov/wf_theiacov_fasta_batch.wdl @@ -0,0 +1,88 @@ +version 1.0 + +import "../../tasks/species_typing/task_pangolin.wdl" as pangolin_task +import "../../tasks/task_versioning.wdl" as versioning +import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task +import "../../tasks/utilities/task_file_handling.wdl" as concatenate +import "../../tasks/utilities/task_theiacov_fasta_batch.wdl" as theiacov_fasta_wrangling_task + +workflow theiacov_fasta_batch { + meta { + description: "TheiaCoV_FASTA for multiple samples" + } + input { + Array[String] samplenames + Array[File] assembly_fastas + String organism = "sars-cov-2" + # sequencing values + String seq_method + String input_assembly_method + # nextclade inputs + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2023-09-21T12:00:00Z" + String? nextclade_dataset_name + # workspace values + String table_name + String workspace_name + String project_name + String bucket_name + } + call versioning.version_capture{ + input: + } + call concatenate.cat_files { + input: + files_to_cat = assembly_fastas, + concatenated_file_name = "concatenated_assemblies.fasta" + } + if (organism == "sars-cov-2") { + # sars-cov-2 specific tasks + call pangolin_task.pangolin4 { + input: + samplename = "concatenated_assemblies", + fasta = cat_files.concatenated_files + } + } + if (organism == "MPXV" || organism == "sars-cov-2"){ + # tasks specific to either MPXV or sars-cov-2 + call nextclade_task.nextclade { + input: + genome_fasta = cat_files.concatenated_files, + dataset_name = select_first([nextclade_dataset_name, organism]), + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + } + call theiacov_fasta_wrangling_task.sm_theiacov_fasta_wrangling { + input: + table_name = table_name, + workspace_name = workspace_name, + project_name = project_name, + bucket_name = bucket_name, + sample_to_fasta = zip(samplenames, assembly_fastas), + organism = organism, + nextclade_tsv = nextclade.nextclade_tsv, + nextclade_docker = nextclade.nextclade_docker, + nextclade_version = nextclade.nextclade_version, + nextclade_ds_tag = nextclade_dataset_tag, + nextclade_json = nextclade.nextclade_json, + pango_lineage_report = pangolin4.pango_lineage_report, + pangolin_docker = pangolin4.pangolin_docker, + seq_platform = seq_method, + assembly_method = input_assembly_method, + theiacov_fasta_analysis_date = version_capture.date, + theiacov_fasta_version = version_capture.phb_version + } + output { + # Version Capture + String theiacov_fasta_batch_version = version_capture.phb_version + String theiacov_fasta_batch_analysis_date = version_capture.date + # Pangolin outputs + File? pango_lineage_report = pangolin4.pango_lineage_report + # Nextclade outputs + File? nextclade_json = nextclade.nextclade_json + File? nextclade_tsv = nextclade.nextclade_tsv + # Wrangling outputs + File datatable = sm_theiacov_fasta_wrangling.terra_table + } +} \ No newline at end of file From 203ec3b580ed42bf0dd55566a639e8d0f7d63a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?In=C3=AAs=20Mendes?= Date: Mon, 27 Nov 2023 18:01:26 +0000 Subject: [PATCH 3/3] Add krona task to TheiaMeta_Illumina_PE (#213) * create krona task; add krona to raw and clean keaken reports in theiameta_illumina_pe * fix bug --------- Co-authored-by: Sage Wright --- tasks/taxon_id/task_krona.wdl | 33 +++++++++++++++++++ .../metagenomics/wf_theiameta_illumina_pe.wdl | 16 +++++++++ 2 files changed, 49 insertions(+) create mode 100644 tasks/taxon_id/task_krona.wdl diff --git a/tasks/taxon_id/task_krona.wdl b/tasks/taxon_id/task_krona.wdl new file mode 100644 index 000000000..7924a57d4 --- /dev/null +++ b/tasks/taxon_id/task_krona.wdl @@ -0,0 +1,33 @@ +version 1.0 + +task krona { + input { + File kraken2_report + String samplename + String docker = "us-docker.pkg.dev/general-theiagen/biocontainers/krona:2.7.1--pl526_5" + Int memory = 8 + Int cpu = 4 + } + command <<< + # Get VERSION + ktImportTaxonomy 2>&1 | sed -n '/KronaTools /p' | sed 's/^.*KronaTools //; s/ - ktImportTaxonomy.*//' | tee VERSION + + # Get taxonomy file + ktUpdateTaxonomy.sh taxonomy + + # Run krona with taxonomy on krakren report + ktImportTaxonomy -o ~{samplename}_krona.html ~{kraken2_report} -tax taxonomy + >>> + output { + String krona_version = read_string("VERSION") + String krona_docker = docker + File krona_html = "~{samplename}_krona.html" + } + runtime { + docker: "~{docker}" + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + } +} \ No newline at end of file diff --git a/workflows/metagenomics/wf_theiameta_illumina_pe.wdl b/workflows/metagenomics/wf_theiameta_illumina_pe.wdl index 184a105fd..8391038c7 100644 --- a/workflows/metagenomics/wf_theiameta_illumina_pe.wdl +++ b/workflows/metagenomics/wf_theiameta_illumina_pe.wdl @@ -3,6 +3,7 @@ version 1.0 import "../utilities/wf_read_QC_trim_pe.wdl" as read_qc_wf import "../utilities/wf_metaspades_assembly.wdl" as metaspades_assembly_wf import "../../tasks/taxon_id/task_kraken2.wdl" as kraken_task +import "../../tasks/taxon_id/task_krona.wdl" as krona_task import "../../tasks/alignment/task_minimap2.wdl" as minimap2_task import "../../tasks/utilities/task_parse_mapping.wdl" as parse_mapping_task import "../../tasks/quality_control/task_quast.wdl" as quast_task @@ -30,6 +31,11 @@ workflow theiameta_illumina_pe { classified_out = "classified#.fastq", unclassified_out = "unclassified#.fastq" } + call krona_task.krona as krona_raw { + input: + kraken2_report = kraken2_raw.kraken2_report, + samplename = samplename + } call read_qc_wf.read_QC_trim_pe as read_QC_trim { input: samplename = samplename, @@ -47,6 +53,11 @@ workflow theiameta_illumina_pe { classified_out = "classified#.fastq", unclassified_out = "unclassified#.fastq" } + call krona_task.krona as krona_clean { + input: + kraken2_report = kraken2_clean.kraken2_report, + samplename = samplename + } call metaspades_assembly_wf.metaspades_assembly_pe as metaspades { input: read1 = read_QC_trim.read1_clean, @@ -134,6 +145,11 @@ workflow theiameta_illumina_pe { Float kraken2_percent_human_raw = kraken2_raw.kraken2_percent_human File kraken2_report_clean = kraken2_clean.kraken2_report Float kraken2_percent_human_clean = kraken2_clean.kraken2_percent_human + # Krona outputs + String krona_version = krona_raw.krona_version + String krona_docker = krona_raw.krona_docker + File krona_html_raw = krona_raw.krona_html + File krona_html_clean = krona_clean.krona_html # Read QC - dehosting outputs File? read1_dehosted = read_QC_trim.read1_dehosted File? read2_dehosted = read_QC_trim.read2_dehosted