Skip to content

Commit

Permalink
Adds:
Browse files Browse the repository at this point in the history
    - bin summary script and module
    - Add parameter for GTDBTk mash db
    - remove AA bins as they didn't meaningfully speed up downstream
      processes
    - Fix linting
    - Add descriptions in json schema for all parameters and groups
    - switch DASTOOL fastatocontig2bin to local module (faster)
    - patch checkm2/predict module to rename output tsv file
    - refactor bin refinement subwf to remove duplicated steps
  • Loading branch information
Jim Downie committed Dec 13, 2024
1 parent 674b4ad commit 4cac07b
Show file tree
Hide file tree
Showing 33 changed files with 486 additions and 588 deletions.
141 changes: 141 additions & 0 deletions bin/bin_summary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env Rscript

library(optparse)
library(tidyverse)

# Command-line interface ----
#
# The three input options (--stats, --checkm2, --taxonomy) each take a
# comma-separated list of TSV paths and default to NULL; the rest of the
# script checks for their presence by name in the parsed list.
parser <- OptionParser() |>
  add_option(
    opt_str = c("-s", "--stats"),
    type = "character",
    action = "store",
    default = NULL,
    help = "Comma-separated list of TSV files output by seqkit stats",
    metavar = "filename"
  ) |>
  add_option(
    opt_str = c("-c", "--checkm2"),
    type = "character",
    action = "store",
    default = NULL,
    help = "Comma-separated list of TSV files output by checkm2 predict",
    metavar = "filename"
  ) |>
  add_option(
    opt_str = c("-t", "--taxonomy"),
    type = "character",
    action = "store",
    default = NULL,
    help = "Comma-separated list of TSV files output by GTDB-Tk",
    metavar = "filename"
  ) |>
  add_option(
    opt_str = c("-o", "--prefix"),
    type = "character",
    action = "store",
    default = "output",
    help = "Output file prefix",
    metavar = "filename"
  ) |>
  # NOTE(review): the two score options below are parsed but not read
  # anywhere else in this script. Their help text previously said
  # "Output file prefix" (copy-paste from --prefix); it now describes a
  # numeric value instead.
  add_option(
    opt_str = c("-x", "--completeness_score"),
    type = "numeric",
    action = "store",
    default = 5,
    help = "Completeness score (numeric) [default %default]",
    metavar = "number"
  ) |>
  add_option(
    opt_str = c("-y", "--contam_score"),
    type = "numeric",
    action = "store",
    default = 1,
    help = "Contamination score (numeric) [default %default]",
    metavar = "number"
  )

input <- parse_args(parser)

# Read one seqkit-stats TSV and reshape it into per-bin rows.
#
# file: path to a TSV that has (at least) the columns `file`, `num_seqs`,
#       `sum_len`, `min_len`, `max_len`, `N50`, `N50_num` and `GC(%)`.
#
# Returns a tibble with columns bin, assembler, binner, num_seqs, sum_len,
# min_len, max_len, N50, L50 and GC. The bin name is the `file` value with
# everything from the last ".fa" onwards removed; assembler and binner are
# the 2nd and 3rd pieces of that name after splitting on ".", "|" or "_".
read_stats <- function(file) {
  stats_tbl <- read_tsv(file)

  # Inside mutate(), `file` resolves to the TSV's own `file` column (data
  # masking), so the later expressions see the already-trimmed bin name.
  stats_tbl <- mutate(
    stats_tbl,
    file = str_extract(file, "(.*)\\.fa", group = 1),
    assembler = str_split(file, "[\\.|_]", simplify = TRUE)[, 2],
    binner = str_split(file, "[\\.|_]", simplify = TRUE)[, 3]
  )

  select(
    stats_tbl,
    bin = file,
    assembler,
    binner,
    num_seqs,
    sum_len,
    min_len,
    max_len,
    N50,
    L50 = N50_num,
    GC = `GC(%)`
  )
}

# Read one checkm2-predict TSV and keep the quality columns.
#
# file: path to a TSV with the columns `Name`, `Completeness`,
#       `Contamination` and `Completeness_Model_Used`.
#
# Returns a tibble with columns bin, completeness, contamination and
# checkm2_model (renamed from the columns above, in that order).
read_checkm <- function(file) {
  checkm_tbl <- read_tsv(file)

  select(
    checkm_tbl,
    bin = Name,
    completeness = Completeness,
    contamination = Contamination,
    checkm2_model = Completeness_Model_Used
  )
}

# Read one taxonomy TSV and keep the classification columns.
#
# file: path to a TSV with the columns `Genome ID`, `GTDB classification`
#       and `Majority vote NCBI classification`, and optionally `taxid`.
#
# Returns a tibble with columns bin, gtdb_classification and
# ncbi_classification, plus taxid when the input provides it.
#
# The previous implementation branched on ncol(df) > 3 but had the branches
# swapped: inputs with extra columns lost `taxid`, while inputs with three
# or fewer columns tried to select `taxid` and failed. any_of() keeps the
# column when present and silently skips it otherwise.
read_taxonomy <- function(file) {
  read_tsv(file) |>
    select(
      bin = `Genome ID`,
      gtdb_classification = `GTDB classification`,
      ncbi_classification = `Majority vote NCBI classification`,
      any_of("taxid")
    )
}

# Main ----

# Split a comma-separated option value into paths, read every path with
# `reader`, and stack the resulting tables row-wise.
read_all <- function(paths, reader) {
  paths |>
    str_split(",") |>
    unlist() |>
    map(reader) |>
    list_rbind()
}

# The stats table is mandatory: it is the left-most table of the join below.
if (!rlang::has_name(input, "stats")) {
  stop("Error: no stats file provided!")
}
tables <- list(read_all(input$stats, read_stats))

# checkm2 and taxonomy tables are optional and appended only when supplied.
if (rlang::has_name(input, "checkm2")) {
  tables <- append(tables, list(read_all(input$checkm2, read_checkm)))
}

if (rlang::has_name(input, "taxonomy")) {
  tables <- append(tables, list(read_all(input$taxonomy, read_taxonomy)))
}

# Left-join everything onto the stats table by bin name, in the order loaded.
bin_summary <- reduce(tables, \(x, y) left_join(x, y, by = "bin"))

write_tsv(bin_summary, glue::glue("{input$prefix}.bin_summary.tsv"))

# Record the R and tidyverse versions used for this run.
r_version <- paste0(R.Version()[c("major", "minor")], collapse = ".")
version_lines <- c(
  "BIN_SUMMARY:",
  paste0(" R: ", r_version),
  paste0(" tidyverse: ", packageVersion("tidyverse"))
)
writeLines(version_lines, "versions.yml")
Empty file modified bin/gtdb_to_ncbi_majority_vote.py
100644 → 100755
Empty file.
65 changes: 32 additions & 33 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ process {
]
}

withName: 'BIN_SUMMARY' {
ext.prefix = { "${meta.id}" }
publishDir = [
path: { "${params.outdir}/summary/" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'BWAMEM2_INDEX' {
tag = { "${meta.id}_${meta.assembler}" }
}
Expand All @@ -46,7 +55,7 @@ process {
}

withName: 'CHECKM2_PREDICT' {
ext.args = { "--genes --extension faa" }
ext.args = { "--extension fa" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
publishDir = [
Expand All @@ -56,33 +65,27 @@ process {
]
}

withName: 'COVERM_CONTIG' {
ext.args = { "-m metabat" }
ext.prefix = { "${meta.id}_${meta.assembler}_depth" }
tag = { "${meta.id}_${meta.assembler}" }
withName: 'CONTIG2BINTOFASTA' {
ext.prefix = { "${meta.id}_${meta.assembler}_${binner}" }
tag = { "${meta.id}_${meta.assembler}_${binner}" }
publishDir = [
path: { "${params.outdir}/mapping/pacbio/" },
path: { "${params.outdir}/bins_refined/${meta.binner}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'DASTOOL_BINS' {
ext.prefix = { "${meta.id}_${meta.assembler}_dastool" }
tag = { "${meta.id}_${meta.assembler}" }
withName: 'COVERM_CONTIG' {
ext.args = { "-m metabat" }
ext.prefix = { "${meta.id}_${meta.assembler}_depth" }
tag = { "${meta.id}_${meta.assembler}" }
publishDir = [
path: { "${params.outdir}/bins_refined/dastool" },
path: { "${params.outdir}/mapping/pacbio/" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'DASTOOL_FASTATOCONTIG2BIN' {
ext.args = { "" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
}

withName: 'DASTOOL_DASTOOL' {
ext.args = { "--write_bin_evals" }
ext.prefix = { "${meta.id}_${meta.assembler}_dastool" }
Expand All @@ -94,6 +97,12 @@ process {
]
}

withName: 'FASTATOCONTIG2BIN' {
ext.args = { "" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
}

withName: 'GAWK_MAGSCOT_PROCESS_CONTIG2BIN' {
ext.args = { "-v FS='\\t'" }
ext.args2 = { "'{OFS = FS} {print \$2,\$1,\"${meta.binner}\"}'" }
Expand All @@ -117,15 +126,15 @@ process {
tag = { "${meta.id}_${meta.assembler}" }
}

withName: 'GAWK_RENAME_DASTOOL_BINS' {
ext.prefix = { "${meta.id}_${meta.assembler}_dastool_contig2bin" }
withName: 'GAWK_RENAME_BINS' {
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
ext.args = { "" }
ext.args2 = { "'{if(\$2 != prev) { count++ }; print \$1,\"${meta.id}_${meta.assembler}_dastool_\"count; prev = \$2}'" }
tag = { "${meta.id}_${meta.assembler}_dastool" }
ext.args2 = { "'{if(\$2 != prev) { count++ }; print \$1,\"${meta.id}_${meta.assembler}_${meta.binner}_\"count; prev = \$2}'" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
}

withName: 'GTDBTK_CLASSIFYWF' {
ext.args = "--extension faa --genes"
ext.args = "--extension fa"
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
publishDir = [
Expand All @@ -151,16 +160,6 @@ process {
tag = { "${meta.id}_${meta.assembler}" }
}

withName: 'MAGSCOT_BINS' {
ext.prefix = { "${meta.id}_${meta.assembler}_magscot" }
tag = { "${meta.id}_${meta.assembler}" }
publishDir = [
path: { "${params.outdir}/bins_refined/magscot" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'MAXBIN2' {
ext.args = { "" }
ext.prefix = { "${meta.id}_${meta.assembler}_maxbin2" }
Expand All @@ -185,7 +184,7 @@ process {

withName: 'METATOR_PIPELINE' {
ext.args = { "--start fastq --aligner bwa" }
ext.prefix = { "${meta.id}_${meta.assembler}_metator" }
ext.prefix = { "${meta.id}_${meta.assembler}" } // metator already appends "_metator" to files
tag = { "${meta.id}_${meta.assembler}" }
publishDir = [
path: { "${params.outdir}/bins_raw/metator" },
Expand Down Expand Up @@ -219,7 +218,7 @@ process {

withName: 'SEQKIT_STATS' {
ext.args = { "-b -a" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_stats" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
publishDir = [
path: { "${params.outdir}/qc/stats/" },
Expand Down
13 changes: 6 additions & 7 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { LONGREADMAG } from './workflows/longreadmag'
include { LONGREADMAG } from './workflows/longreadmag'
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_longreadmag_pipeline'
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_longreadmag_pipeline'
/*
Expand Down Expand Up @@ -77,12 +77,11 @@ workflow {
//
// SUBWORKFLOW: Run completion tasks
//

// PIPELINE_COMPLETION (
// params.outdir,
// params.monochrome_logs,
// SANGERTOL_LONGREADMAG.out.multiqc_report
// )
PIPELINE_COMPLETION (
params.outdir,
params.monochrome_logs,
[]
)

}

Expand Down
14 changes: 5 additions & 9 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
"checkm2/predict": {
"branch": "master",
"git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d",
"installed_by": ["modules"]
"installed_by": ["modules"],
"patch": "modules/nf-core/checkm2/predict/checkm2-predict.diff"
},
"coverm/contig": {
"branch": "master",
Expand All @@ -36,11 +37,6 @@
"git_sha": "6d736588f4af4bc9b615bd7189442bd4272ae26e",
"installed_by": ["modules"]
},
"dastool/fastatocontig2bin": {
"branch": "master",
"git_sha": "6d736588f4af4bc9b615bd7189442bd4272ae26e",
"installed_by": ["modules"]
},
"gawk": {
"branch": "master",
"git_sha": "caab1314ca62679b629da4c79afa9a4cab2bb8ee",
Expand Down Expand Up @@ -123,17 +119,17 @@
"nf-core": {
"utils_nextflow_pipeline": {
"branch": "master",
"git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
"git_sha": "c2b22d85f30a706a3073387f30380704fcae013b",
"installed_by": ["subworkflows"]
},
"utils_nfcore_pipeline": {
"branch": "master",
"git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
"git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a",
"installed_by": ["subworkflows"]
},
"utils_nfschema_plugin": {
"branch": "master",
"git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
"git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
"installed_by": ["subworkflows"]
}
}
Expand Down
1 change: 1 addition & 0 deletions modules/local/bin_summary/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ channels:
dependencies:
- conda-forge::r-base=4.4
- conda-forge::r-tidyverse=2.0.0
- conda-forge::r-optparse=1.7.5
18 changes: 15 additions & 3 deletions modules/local/bin_summary/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,28 @@ process BIN_SUMMARY {
label "process_low"

conda "${moduleDir}/environment.yml"
container "docker.io/rocker/tidyverse:4.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/r-base_r-tidyverse_r-optparse:d348292153ed2a3e':
'community.wave.seqera.io/library/r-base_r-tidyverse_r-optparse:fb0e94661e2bf4e0' }"

input:
tuple val(meta), path(stats), path(checkm2), path(taxonomy)

output:
tuple val(meta), path("bin_summary.tsv")
tuple val(meta), path("*.bin_summary.tsv"), emit: summary
path("versions.yml") , emit: versions

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def stats_input = stats ? "--stats ${stats.join(",")}" : ""
def checkm_input = checkm2 ? "--checkm ${checkm2.join(",")}" : ""
def tax_input = taxonomy ? "--taxonomy ${taxonomy.join(",")}" : ""
"""
echo test > bin_summary.tsv
bin_summary.R \\
${stats_input} \\
${checkm_input} \\
${tax_input} \\
${args}
"""
}
Loading

0 comments on commit 4cac07b

Please sign in to comment.