[Kraken2] Add module to recalculate abundances based on fragment length - Kraken2_ont wf and TheiaCoV_ONT wf #240

Merged: 18 commits, Feb 20, 2024
Changes from 16 commits
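This change recalculates Kraken2 abundances from total base pairs rather than read counts, since ONT read lengths vary widely and a handful of very long reads can account for far more of the sequenced material than their read count suggests. Below is a minimal standalone sketch of that calculation, assuming a per-read Kraken2 output named `sample.classifiedreads.txt`; it mirrors the pandas logic in the new `kraken2_parse_classified` task but is only an illustration, not the shipped code.

```python
import pandas as pd

# Kraken2 per-read output: classified flag, read id, taxon id, read length, LCA mapping
reads = pd.read_csv(
    "sample.classifiedreads.txt",  # hypothetical input path
    sep="\t", header=None,
    names=["classified_unclassified", "read_id", "taxon_id", "read_len", "other"],
)

# Total sequenced base pairs across all reads
total_bp = reads["read_len"].sum()

# Each taxon's abundance as its share of total base pairs, not of read count
percent_bp = (
    reads.groupby("taxon_id")["read_len"].sum() / total_bp * 100
).sort_values(ascending=False)

print(percent_bp.head())
```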
5 changes: 5 additions & 0 deletions .dockstore.yml
@@ -170,6 +170,11 @@ workflows:
primaryDescriptorPath: /workflows/standalone_modules/wf_kraken2_se.wdl
testParameterFiles:
- /tests/inputs/empty.json
- name: Kraken2_ONT_PHB
subclass: WDL
primaryDescriptorPath: /workflows/standalone_modules/wf_kraken2_ont.wdl
testParameterFiles:
- /tests/inputs/empty.json
- name: RASUSA_PHB
subclass: WDL
primaryDescriptorPath: /workflows/standalone_modules/wf_rasusa.wdl
89 changes: 87 additions & 2 deletions tasks/taxon_id/task_kraken2.wdl
@@ -23,7 +23,11 @@ task kraken2_theiacov {
--threads ~{cpu} \
--db ~{kraken2_db} \
~{read1} ~{read2} \
--report ~{samplename}_kraken2_report.txt >/dev/null
--report ~{samplename}_kraken2_report.txt \
--output ~{samplename}.classifiedreads.txt

# Compress and cleanup
gzip ~{samplename}.classifiedreads.txt

percentage_human=$(grep "Homo sapiens" ~{samplename}_kraken2_report.txt | cut -f 1)
# | tee PERCENT_HUMAN
@@ -37,7 +41,7 @@
if [ ! -z "~{target_org}" ]; then
echo "Target org designated: ~{target_org}"
percent_target_org=$(grep "~{target_org}" ~{samplename}_kraken2_report.txt | cut -f1 | head -n1 )
if [-z "$percent_target_org" ] ; then percent_target_org="0" ; fi
if [ -z "$percent_target_org" ] ; then percent_target_org="0" ; fi
else
percent_target_org=""
fi
@@ -52,6 +56,7 @@
Float percent_sc2 = read_float("PERCENT_SC2")
String percent_target_org = read_string("PERCENT_TARGET_ORG")
String? kraken_target_org = target_org
File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz"
String docker = "us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.0.8-beta_hv"
}
runtime {
@@ -142,4 +147,84 @@ task kraken2_standalone {
disks: "local-disk " + disk_size + " SSD"
preemptible: 0
}
}

task kraken2_parse_classified {
input {
File kraken2_classified_report
File kraken2_report
String samplename
String? target_org
Int cpu = 4
Int disk_size = 100
}
command <<<

gunzip -c ~{kraken2_classified_report} > ~{samplename}.classifiedreads.txt

python3 <<CODE
import pandas as pd

# load files into dataframe for parsing
reads_table = pd.read_csv("~{samplename}.classifiedreads.txt", names=["classified_unclassified","read_id","taxon_id","read_len","other"], header=None, delimiter="\t")
report_table = pd.read_csv("~{kraken2_report}", names=["percent","num_reads","num_reads_with_taxon","rank","taxon_id","name"], header=None, delimiter="\t")

# create dataframe to store results
results_table = pd.DataFrame(columns=["percent","num_basepairs","rank","taxon_id","name"])

# calculate total basepairs
total_basepairs = reads_table["read_len"].sum()

# for each taxon_id in the report table, check if exists in the classified reads output and if so get the percent, num basepairs, rank, and name
# write results to dataframe
for taxon_id in report_table["taxon_id"].unique():
if taxon_id in reads_table["taxon_id"].unique():

taxon_name = report_table.loc[report_table["taxon_id"] == taxon_id, "name"].values[0].strip()
rank = report_table.loc[report_table["taxon_id"] == taxon_id, "rank"].values[0].strip()
taxon_basepairs = reads_table.loc[reads_table["taxon_id"] == taxon_id, "read_len"].sum()
taxon_percent = taxon_basepairs / total_basepairs * 100

results_table = pd.concat([results_table, pd.DataFrame({"percent":taxon_percent, "num_basepairs":taxon_basepairs, "rank":rank, "taxon_id":taxon_id, "name":taxon_name}, index=[0])], ignore_index=True)

# write results to file
results_table.to_csv("~{samplename}.report_parsed.txt", sep="\t", index=False, header=False)
CODE

# theiacov parsing blocks - percent human, sc2 and target organism
percentage_human=$(grep "Homo sapiens" ~{samplename}.report_parsed.txt | cut -f 1)
percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}.report_parsed.txt | cut -f1 )

if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi
if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi
echo $percentage_human | tee PERCENT_HUMAN
echo $percentage_sc2 | tee PERCENT_SC2

# capture target org percentage
if [ ! -z "~{target_org}" ]; then
echo "Target org designated: ~{target_org}"
percent_target_org=$(grep "~{target_org}" ~{samplename}.report_parsed.txt | cut -f1 | head -n1 )
if [ -z "$percent_target_org" ] ; then percent_target_org="0" ; fi
else
percent_target_org=""
fi
echo $percent_target_org | tee PERCENT_TARGET_ORG

>>>
output {
File kraken_report = "~{samplename}.report_parsed.txt"
Float percent_human = read_float("PERCENT_HUMAN")
Float percent_sc2 = read_float("PERCENT_SC2")
String percent_target_org = read_string("PERCENT_TARGET_ORG")
String? kraken_target_org = target_org
}
runtime {
docker: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-28-v4"
memory: "8 GB"
cpu: cpu
disks: "local-disk " + disk_size + " SSD"
disk: disk_size + " GB" # TES
preemptible: 0
maxRetries: 0
}
}
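The parsed report written by kraken2_parse_classified is headerless and tab-separated, with the columns percent, num_basepairs, rank, taxon_id, and name (the same order used when building results_table above). A hedged sketch of reading it back for downstream checks, assuming a file named `sample.report_parsed.txt`:

```python
import pandas as pd

# Column order matches the results_table written by kraken2_parse_classified (no header row)
parsed = pd.read_csv(
    "sample.report_parsed.txt",  # hypothetical path
    sep="\t", header=None,
    names=["percent", "num_basepairs", "rank", "taxon_id", "name"],
)

# Example: species-level taxa ("S" rank in Kraken2 reports) above 1% of total base pairs
species_hits = parsed[(parsed["rank"] == "S") & (parsed["percent"] > 1.0)]
print(species_hits[["name", "percent"]].to_string(index=False))
```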
4 changes: 2 additions & 2 deletions tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
@@ -149,7 +149,7 @@
- path: miniwdl_run/call-fastq_scan_raw_reads/work/clearlabs_fastq-scan.json
md5sum: 869dd2e934c600bba35f30f08e2da7c9
- path: miniwdl_run/call-kraken2_dehosted/command
md5sum: 70955302d0fd9f693606ec964978367b
md5sum: 9eab22466a4bb80de52ac457b5a00885
- path: miniwdl_run/call-kraken2_dehosted/inputs.json
contains: ["read1", "samplename"]
- path: miniwdl_run/call-kraken2_dehosted/outputs.json
@@ -172,7 +172,7 @@
- path: miniwdl_run/call-kraken2_dehosted/work/clearlabs_kraken2_report.txt
md5sum: 35841fa2d77ec202c275b1de548b8d98
- path: miniwdl_run/call-kraken2_raw/command
md5sum: a0ea125003ce38efdfd3905c3c38030b
md5sum: f590826e868e7a6a27135582c44d8adc
- path: miniwdl_run/call-kraken2_raw/inputs.json
contains: ["read1", "samplename"]
- path: miniwdl_run/call-kraken2_raw/outputs.json
@@ -627,7 +627,7 @@
- path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl
md5sum: 4e1d4f6b8085a209f9721748a0c0fef0
- path: miniwdl_run/wdl/tasks/taxon_id/task_kraken2.wdl
md5sum: 37a1bccc53b5ff8e1968f07a85bb328f
md5sum: 8bd6ffb6d89dca981a0a9272d25a10f2
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: faacd87946ee3fbdf70f3a15b79ce547
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
@@ -592,7 +592,7 @@
- path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl
md5sum: 4e1d4f6b8085a209f9721748a0c0fef0
- path: miniwdl_run/wdl/tasks/taxon_id/task_kraken2.wdl
md5sum: 37a1bccc53b5ff8e1968f07a85bb328f
md5sum: 8bd6ffb6d89dca981a0a9272d25a10f2
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: faacd87946ee3fbdf70f3a15b79ce547
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
42 changes: 42 additions & 0 deletions workflows/standalone_modules/wf_kraken2_ont.wdl
@@ -0,0 +1,42 @@
version 1.0

import "../../tasks/taxon_id/task_kraken2.wdl" as kraken2
import "../../tasks/task_versioning.wdl" as versioning

workflow kraken2_ont_wf {
meta {
description: "Classify ONT single-end reads using Kraken2 by recalculating abundances based on read length"
}
input {
String samplename
File read1
File kraken2_db
}
call kraken2.kraken2_standalone as kraken2_se {
input:
samplename = samplename,
read1 = read1,
kraken2_db = kraken2_db
}
call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances {
input:
samplename = samplename,
kraken2_report = kraken2_se.kraken2_report,
kraken2_classified_report = kraken2_se.kraken2_classified_report
}
call versioning.version_capture{
input:
}
output {
# PHB Version Captures
String kraken2_se_wf_version = version_capture.phb_version
String kraken2_se_wf_analysis_date = version_capture.date
# Kraken2
String kraken2_version = kraken2_se.kraken2_version
String kraken2_docker = kraken2_se.kraken2_docker
File kraken2_classified_report = kraken2_se.kraken2_classified_report
File kraken2_report = kraken2_recalculate_abundances.kraken_report
File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1
File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1
}
}
28 changes: 20 additions & 8 deletions workflows/utilities/wf_read_QC_trim_ont.wdl
@@ -46,13 +46,25 @@ workflow read_QC_trim_ont {
samplename = samplename,
read1 = read1,
target_org = target_org
}
call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_raw {
input:
samplename = samplename,
kraken2_report = kraken2_raw.kraken_report,
kraken2_classified_report = kraken2_raw.kraken2_classified_report
}
call kraken2.kraken2_theiacov as kraken2_dehosted {
input:
samplename = samplename,
read1 = ncbi_scrub_se.read1_dehosted,
target_org = target_org
}
call kraken2.kraken2_parse_classified as kraken2_recalculate_abundances_dehosted {
input:
samplename = samplename,
kraken2_report = kraken2_dehosted.kraken_report,
kraken2_classified_report = kraken2_dehosted.kraken2_classified_report
}
}
if ("~{workflow_series}" == "theiaprok") {
# kmc for genome size estimation
@@ -92,16 +104,16 @@ workflow read_QC_trim_ont {
String? kraken_target_org_name = kraken2_raw.kraken_target_org

# kraken outputs raw
Float? kraken_human = kraken2_raw.percent_human
Float? kraken_sc2 = kraken2_raw.percent_sc2
String? kraken_target_org = kraken2_raw.percent_target_org
File? kraken_report = kraken2_raw.kraken_report
Float? kraken_human = kraken2_recalculate_abundances_raw.percent_human
Float? kraken_sc2 = kraken2_recalculate_abundances_raw.percent_sc2
String? kraken_target_org = kraken2_recalculate_abundances_raw.percent_target_org
File? kraken_report = kraken2_recalculate_abundances_raw.kraken_report

# kraken outputs dehosted
Float? kraken_human_dehosted = kraken2_dehosted.percent_human
Float? kraken_sc2_dehosted = kraken2_dehosted.percent_sc2
String? kraken_target_org_dehosted = kraken2_dehosted.percent_target_org
File? kraken_report_dehosted = kraken2_dehosted.kraken_report
Float? kraken_human_dehosted = kraken2_recalculate_abundances_dehosted.percent_human
Float? kraken_sc2_dehosted = kraken2_recalculate_abundances_dehosted.percent_sc2
String? kraken_target_org_dehosted = kraken2_recalculate_abundances_dehosted.percent_target_org
File? kraken_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken_report

# theiaprok outputs
# kmc outputs