From 4cac07b20bcca06a6aae8fc33cc40eef2947fe57 Mon Sep 17 00:00:00 2001
From: Jim Downie
Date: Fri, 13 Dec 2024 21:06:32 +0000
Subject: [PATCH] Adds:

- bin summary script and module
- Add parameter for GTDBTk mash db
- remove AA bins as they didn't meaningfully speed up downstream processes
- Fix linting
- Add descriptions in json schema for all parameters and groups
- switch DASTOOL fastatocontig2bin to local module (faster)
- patch checkm2/predict module to rename output tsv file
- refactor bin refinement subwf to remove duplicated steps
---
 bin/bin_summary.R                             | 141 ++++++++++++++++++
 bin/gtdb_to_ncbi_majority_vote.py             |   0
 conf/modules.config                           |  65 ++++----
 main.nf                                       |  13 +-
 modules.json                                  |  14 +-
 modules/local/bin_summary/environment.yml     |   1 +
 modules/local/bin_summary/main.nf             |  18 ++-
 .../environment.yml                           |   0
 .../main.nf                                   |  10 +-
 .../fastatocontig2bin/environment.yml         |   2 +-
 modules/local/fastatocontig2bin/main.nf       |  49 ++++++
 modules/local/metator/pipeline/main.nf        |   3 +-
 .../checkm2/predict/checkm2-predict.diff      |  33 ++++
 modules/nf-core/checkm2/predict/main.nf       |   8 +-
 .../nf-core/dastool/fastatocontig2bin/main.nf |  52 -------
 .../dastool/fastatocontig2bin/meta.yml        |  61 --------
 .../fastatocontig2bin/tests/main.nf.test      |  83 -----------
 .../fastatocontig2bin/tests/main.nf.test.snap |  72 ---------
 modules/nf-core/gtdbtk/classifywf/main.nf     |   2 +-
 nextflow.config                               |   9 +-
 nextflow_schema.json                          | 101 +++++++++----
 subworkflows/local/bin_qc.nf                  |   3 +-
 subworkflows/local/bin_refinement.nf          |  78 +++++-----
 subworkflows/local/bin_taxonomy.nf            |  14 +-
 subworkflows/local/binning.nf                 |   8 +-
 .../utils_nfcore_longreadmag_pipeline/main.nf |   2 +-
 .../nf-core/utils_nextflow_pipeline/main.nf   |   2 +
 .../tests/main.workflow.nf.test               |  10 +-
 .../nf-core/utils_nfcore_pipeline/main.nf     |  89 +++--------
 .../tests/main.function.nf.test               |  46 +++---
 .../tests/main.function.nf.test.snap          |  30 ----
 .../utils_nfschema_plugin/tests/main.nf.test  |   4 +-
 workflows/longreadmag.nf                      |  51 ++-----
 33 files changed, 486 insertions(+), 588 deletions(-)
 create mode 100755 bin/bin_summary.R
 mode change 100644 => 100755 bin/gtdb_to_ncbi_majority_vote.py
 rename modules/local/{contig2bin2fasta => contig2bintofasta}/environment.yml (100%)
 rename modules/local/{contig2bin2fasta => contig2bintofasta}/main.nf (67%)
 rename modules/{nf-core/dastool => local}/fastatocontig2bin/environment.yml (64%)
 create mode 100644 modules/local/fastatocontig2bin/main.nf
 create mode 100644 modules/nf-core/checkm2/predict/checkm2-predict.diff
 delete mode 100644 modules/nf-core/dastool/fastatocontig2bin/main.nf
 delete mode 100644 modules/nf-core/dastool/fastatocontig2bin/meta.yml
 delete mode 100644 modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test
 delete mode 100644 modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test.snap

diff --git a/bin/bin_summary.R b/bin/bin_summary.R
new file mode 100755
index 0000000..9da3386
--- /dev/null
+++ b/bin/bin_summary.R
@@ -0,0 +1,141 @@
+#!/usr/bin/env Rscript
+
+library(optparse)
+library(tidyverse)
+
+parser <- OptionParser()
+parser <- add_option(
+    object = parser,
+    opt_str = c("-s", "--stats"),
+    type = "character",
+    action = "store",
+    default = NULL,
+    help = "Comma-separated list of TSV files output by seqkit stats",
+    metavar="filename"
+)
+
+parser <- add_option(
+    object = parser,
+    opt_str = c("-c", "--checkm2"),
+    type = "character",
+    action = "store",
+    default = NULL,
+    help = "Comma-separated list of TSV files output by checkm2 predict",
+    metavar="filename"
+)
+
+parser <- add_option(
+    object = parser,
opt_str = c("-t", "--taxonomy"), + type = "character", + action = "store", + default = NULL, + help = "Comma-separated list of TSV files output by GTDB-Tk", + metavar="filename" +) + +parser <- add_option( + object = parser, + opt_str = c("-o", "--prefix"), + type = "character", + action = "store", + default = "output", + help = "Output file prefix", + metavar="filename" +) + +parser <- add_option( + object = parser, + opt_str = c("-x", "--completeness_score"), + type = "numeric", + action = "store", + default = 5, + help = "Output file prefix", + metavar="filename" +) + +parser <- add_option( + object = parser, + opt_str = c("-y", "--contam_score"), + type = "numeric", + action = "store", + default = 1, + help = "Output file prefix", + metavar="filename" +) + +input <- parse_args(parser) + +read_stats <- function(file) { + df <- read_tsv(file) |> + mutate( + file = str_extract(file, "(.*)\\.fa", group = 1), + assembler = str_split(file, "[\\.|_]", simplify = TRUE)[,2], + binner = str_split(file, "[\\.|_]", simplify = TRUE)[,3] + ) |> + select(bin = file, assembler, binner, num_seqs, sum_len, min_len, max_len, N50, L50 = N50_num, GC = `GC(%)`) + + return(df) +} + +read_checkm <- function(file) { + df <- read_tsv(file) |> + select(bin = Name, + completeness = Completeness, + contamination = Contamination, + checkm2_model = Completeness_Model_Used + ) + + return(df) +} + +read_taxonomy <- function(file) { + df <- read_tsv(file) + if(ncol(df) > 3) { + df <- select(df, + bin = `Genome ID`, + gtdb_classification = `GTDB classification`, + ncbi_classification = `Majority vote NCBI classification`) + } else { + df <- select(df, + bin = `Genome ID`, + gtdb_classification = `GTDB classification`, + ncbi_classification = `Majority vote NCBI classification`, + taxid) + } + + return(df) +} + +data <- list() +if(rlang::has_name(input, "stats")) { + stats_files <- unlist(str_split(input$stats, ",")) + stats_df <- map(stats_files, read_stats) |> list_rbind() + data <- c(data, list(stats_df)) +} else { + stop("Error: no stats file provided!") +} + +if(rlang::has_name(input, "checkm2")) { + checkm_files <- unlist(str_split(input$checkm2, ",")) + checkm_df <- map(checkm_files, read_checkm) |> list_rbind() + data <- c(data, list(checkm_df)) +} + +if(rlang::has_name(input, "taxonomy")) { + tax_files <- unlist(str_split(input$taxonomy, ",")) + tax_df <- map(tax_files, read_taxonomy) |> list_rbind() + data <- c(data, list(tax_df)) +} + +summary <- reduce(data, \(x, y) left_join(x, y, by = "bin")) + +write_tsv(summary, glue::glue("{input$prefix}.bin_summary.tsv")) + +writeLines( + c("BIN_SUMMARY:", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" tidyverse: ", packageVersion("tidyverse")) + ), + "versions.yml" +) diff --git a/bin/gtdb_to_ncbi_majority_vote.py b/bin/gtdb_to_ncbi_majority_vote.py old mode 100644 new mode 100755 diff --git a/conf/modules.config b/conf/modules.config index 55d3fad..9daeec9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,15 @@ process { ] } + withName: 'BIN_SUMMARY' { + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/summary/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
+        ]
+    }
+
     withName: 'BWAMEM2_INDEX' {
         tag = { "${meta.id}_${meta.assembler}" }
     }
@@ -46,7 +55,7 @@ process {
     }
 
     withName: 'CHECKM2_PREDICT' {
-        ext.args = { "--genes --extension faa" }
+        ext.args = { "--extension fa" }
         ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         publishDir = [
@@ -56,33 +65,27 @@ process {
         ]
     }
 
-    withName: 'COVERM_CONTIG' {
-        ext.args = { "-m metabat" }
-        ext.prefix = { "${meta.id}_${meta.assembler}_depth" }
-        tag = { "${meta.id}_${meta.assembler}" }
+    withName: 'CONTIG2BINTOFASTA' {
+        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
+        tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         publishDir = [
-            path: { "${params.outdir}/mapping/pacbio/" },
+            path: { "${params.outdir}/bins_refined/${meta.binner}" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
 
-    withName: 'DASTOOL_BINS' {
-        ext.prefix = { "${meta.id}_${meta.assembler}_dastool" }
-        tag = { "${meta.id}_${meta.assembler}" }
+    withName: 'COVERM_CONTIG' {
+        ext.args = { "-m metabat" }
+        ext.prefix = { "${meta.id}_${meta.assembler}_depth" }
+        tag = { "${meta.id}_${meta.assembler}" }
         publishDir = [
-            path: { "${params.outdir}/bins_refined/dastool" },
+            path: { "${params.outdir}/mapping/pacbio/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
 
-    withName: 'DASTOOL_FASTATOCONTIG2BIN' {
-        ext.args = { "" }
-        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
-        tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
-    }
-
     withName: 'DASTOOL_DASTOOL' {
         ext.args = { "--write_bin_evals" }
         ext.prefix = { "${meta.id}_${meta.assembler}_dastool" }
@@ -94,6 +97,12 @@ process {
         ]
     }
 
+    withName: 'FASTATOCONTIG2BIN' {
+        ext.args = { "" }
+        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
+        tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
+    }
+
     withName: 'GAWK_MAGSCOT_PROCESS_CONTIG2BIN' {
         ext.args = { "-v FS='\\t'" }
         ext.args2 = { "'{OFS = FS} {print \$2,\$1,\"${meta.binner}\"}'" }
@@ -117,15 +126,15 @@ process {
         tag = { "${meta.id}_${meta.assembler}" }
     }
 
-    withName: 'GAWK_RENAME_DASTOOL_BINS' {
-        ext.prefix = { "${meta.id}_${meta.assembler}_dastool_contig2bin" }
+    withName: 'GAWK_RENAME_BINS' {
+        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
         ext.args = { "" }
-        ext.args2 = { "'{if(\$2 != prev) { count++ }; print \$1,\"${meta.id}_${meta.assembler}_dastool_\"count; prev = \$2}'" }
-        tag = { "${meta.id}_${meta.assembler}_dastool" }
+        ext.args2 = { "'{if(\$2 != prev) { count++ }; print \$1,\"${meta.id}_${meta.assembler}_${meta.binner}_\"count; prev = \$2}'" }
+        tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
     }
 
     withName: 'GTDBTK_CLASSIFYWF' {
-        ext.args = "--extension faa --genes"
+        ext.args = "--extension fa"
         ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         publishDir = [
@@ -151,16 +160,6 @@ process {
         tag = { "${meta.id}_${meta.assembler}" }
     }
 
-    withName: 'MAGSCOT_BINS' {
-        ext.prefix = { "${meta.id}_${meta.assembler}_magscot" }
-        tag = { "${meta.id}_${meta.assembler}" }
-        publishDir = [
-            path: { "${params.outdir}/bins_refined/magscot" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: 'MAXBIN2' {
         ext.args = { "" }
         ext.prefix = { "${meta.id}_${meta.assembler}_maxbin2" }
@@ -185,7 +184,7 @@ process {
 
     withName: 'METATOR_PIPELINE' {
         ext.args = { "--start fastq --aligner bwa" }
-        ext.prefix = { "${meta.id}_${meta.assembler}_metator" }
+        ext.prefix = { "${meta.id}_${meta.assembler}" } // metator already appends "_metator" to files
         tag = { "${meta.id}_${meta.assembler}" }
         publishDir = [
             path: { "${params.outdir}/bins_raw/metator" },
@@ -219,7 +218,7 @@ process {
 
     withName: 'SEQKIT_STATS' {
         ext.args = { "-b -a" }
-        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
+        ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_stats" }
         tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
         publishDir = [
             path: { "${params.outdir}/qc/stats/" },
diff --git a/main.nf b/main.nf
index be0f108..3136128 100644
--- a/main.nf
+++ b/main.nf
@@ -13,7 +13,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { LONGREADMAG } from './workflows/longreadmag'
+include { LONGREADMAG             } from './workflows/longreadmag'
 include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_longreadmag_pipeline'
 include { PIPELINE_COMPLETION     } from './subworkflows/local/utils_nfcore_longreadmag_pipeline'
 /*
@@ -77,12 +77,11 @@ workflow {
     //
     // SUBWORKFLOW: Run completion tasks
     //
-
-    // PIPELINE_COMPLETION (
-    //     params.outdir,
-    //     params.monochrome_logs,
-    //     SANGERTOL_LONGREADMAG.out.multiqc_report
-    // )
+    PIPELINE_COMPLETION (
+        params.outdir,
+        params.monochrome_logs,
+        []
+    )
 }
diff --git a/modules.json b/modules.json
index 9552ca6..41d97ce 100644
--- a/modules.json
+++ b/modules.json
@@ -24,7 +24,8 @@
             "checkm2/predict": {
                 "branch": "master",
                 "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d",
-                "installed_by": ["modules"]
+                "installed_by": ["modules"],
+                "patch": "modules/nf-core/checkm2/predict/checkm2-predict.diff"
             },
             "coverm/contig": {
                 "branch": "master",
@@ -36,11 +37,6 @@
                 "git_sha": "6d736588f4af4bc9b615bd7189442bd4272ae26e",
                 "installed_by": ["modules"]
             },
-            "dastool/fastatocontig2bin": {
-                "branch": "master",
-                "git_sha": "6d736588f4af4bc9b615bd7189442bd4272ae26e",
-                "installed_by": ["modules"]
-            },
             "gawk": {
                 "branch": "master",
                 "git_sha": "caab1314ca62679b629da4c79afa9a4cab2bb8ee",
@@ -123,17 +119,17 @@
             "nf-core": {
                 "utils_nextflow_pipeline": {
                     "branch": "master",
-                    "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
+                    "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b",
                     "installed_by": ["subworkflows"]
                 },
                 "utils_nfcore_pipeline": {
                     "branch": "master",
-                    "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
+                    "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a",
                     "installed_by": ["subworkflows"]
                 },
                 "utils_nfschema_plugin": {
                     "branch": "master",
-                    "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
+                    "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
                     "installed_by": ["subworkflows"]
                 }
             }
diff --git a/modules/local/bin_summary/environment.yml b/modules/local/bin_summary/environment.yml
index d4d242d..3b7415f 100644
--- a/modules/local/bin_summary/environment.yml
+++ b/modules/local/bin_summary/environment.yml
@@ -4,3 +4,4 @@
 dependencies:
   - conda-forge::r-base=4.4
   - conda-forge::r-tidyverse=2.0.0
+  - conda-forge::r-optparse=1.7.5
diff --git a/modules/local/bin_summary/main.nf b/modules/local/bin_summary/main.nf
index af8351d..b016234 100644
--- a/modules/local/bin_summary/main.nf
+++ b/modules/local/bin_summary/main.nf
@@ -2,16 +2,28 @@ process BIN_SUMMARY {
label "process_low" conda "${moduleDir}/environment.yml" - container "docker.io/rocker/tidyverse:4.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/r-base_r-tidyverse_r-optparse:d348292153ed2a3e': + 'community.wave.seqera.io/library/r-base_r-tidyverse_r-optparse:fb0e94661e2bf4e0' }" input: tuple val(meta), path(stats), path(checkm2), path(taxonomy) output: - tuple val(meta), path("bin_summary.tsv") + tuple val(meta), path("*.bin_summary.tsv"), emit: summary + path("versions.yml") , emit: versions script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def stats_input = stats ? "--stats ${stats.join(",")}" : "" + def checkm_input = checkm2 ? "--checkm ${checkm2.join(",")}" : "" + def tax_input = taxonomy ? "--taxonomy ${taxonomy.join(",")}" : "" """ - echo test > bin_summary.tsv + bin_summary.R \\ + ${stats_input} \\ + ${checkm_input} \\ + ${tax_input} \\ + ${args} """ } diff --git a/modules/local/contig2bin2fasta/environment.yml b/modules/local/contig2bintofasta/environment.yml similarity index 100% rename from modules/local/contig2bin2fasta/environment.yml rename to modules/local/contig2bintofasta/environment.yml diff --git a/modules/local/contig2bin2fasta/main.nf b/modules/local/contig2bintofasta/main.nf similarity index 67% rename from modules/local/contig2bin2fasta/main.nf rename to modules/local/contig2bintofasta/main.nf index 5230b18..0f6564c 100644 --- a/modules/local/contig2bin2fasta/main.nf +++ b/modules/local/contig2bintofasta/main.nf @@ -1,4 +1,4 @@ -process CONTIG2BIN2FASTA { +process CONTIG2BINTOFASTA { tag "${meta.id}" label "process_low" @@ -9,7 +9,6 @@ process CONTIG2BIN2FASTA { input: tuple val(meta), path(contigs), path(contig2bin) - val input_is_prodigal_aa output: tuple val(meta), path("*.fa*"), emit: bins @@ -18,14 +17,11 @@ process CONTIG2BIN2FASTA { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def extension = input_is_prodigal_aa ? "faa" : "fa" - def input_aa = input_is_prodigal_aa ? "_.*" : "" - def seqkit_mode = input_is_prodigal_aa ? "-rf" : "-f" """ awk '{print \$2}' ${contig2bin} | sort -u | while read bin do - grep -w \${bin} ${contig2bin} | awk '{print \$1\"${input_aa}\"}' > \${bin}.ctglst - seqkit grep ${seqkit_mode} \${bin}.ctglst ${contigs} > \${bin}.${extension} + grep -w \${bin} ${contig2bin} | awk '{ print \$1 }' > \${bin}.ctglst + seqkit grep -f \${bin}.ctglst ${contigs} > \${bin}.fa done cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/dastool/fastatocontig2bin/environment.yml b/modules/local/fastatocontig2bin/environment.yml similarity index 64% rename from modules/nf-core/dastool/fastatocontig2bin/environment.yml rename to modules/local/fastatocontig2bin/environment.yml index 48ee9e0..315f6dc 100644 --- a/modules/nf-core/dastool/fastatocontig2bin/environment.yml +++ b/modules/local/fastatocontig2bin/environment.yml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - bioconda::das_tool=1.1.7 + - conda-forge::gawk=5.3.0 diff --git a/modules/local/fastatocontig2bin/main.nf b/modules/local/fastatocontig2bin/main.nf new file mode 100644 index 0000000..0fa29c1 --- /dev/null +++ b/modules/local/fastatocontig2bin/main.nf @@ -0,0 +1,49 @@ +process FASTATOCONTIG2BIN { + tag "${meta.id}" + label "process_low" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/gawk:5.3.0' :
+        'biocontainers/gawk:5.3.0' }"
+
+    input:
+    tuple val(meta), path(bins)
+    val(extension)
+
+    output:
+    tuple val(meta), path("*.tsv"), emit: contig2bin
+    path("versions.yml")          , emit: versions
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    awk \\
+        'BEGIN { OFS = "\t" }
+        BEGINFILE {
+            cmd=sprintf("basename %s .%s", FILENAME, "${extension}")
+            cmd | getline bin
+        }
+        /^>/ {
+            sub(/>/, "", \$1)
+            print \$1,bin
+        }' ${bins} > ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/metator/pipeline/main.nf b/modules/local/metator/pipeline/main.nf
index 2a3cd9d..e73bc82 100644
--- a/modules/local/metator/pipeline/main.nf
+++ b/modules/local/metator/pipeline/main.nf
@@ -60,7 +60,8 @@ process METATOR_PIPELINE {
     """
     touch bin_summary.txt
     touch binning.txt
-    mkdir final_bin
+    mkdir bins
+    touch bins/${prefix}_metator_1.fa
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/checkm2/predict/checkm2-predict.diff b/modules/nf-core/checkm2/predict/checkm2-predict.diff
new file mode 100644
index 0000000..9d73086
--- /dev/null
+++ b/modules/nf-core/checkm2/predict/checkm2-predict.diff
@@ -0,0 +1,33 @@
+Changes in module 'nf-core/checkm2/predict'
+'modules/nf-core/checkm2/predict/environment.yml' is unchanged
+'modules/nf-core/checkm2/predict/meta.yml' is unchanged
+Changes in 'checkm2/predict/main.nf':
+--- modules/nf-core/checkm2/predict/main.nf
++++ modules/nf-core/checkm2/predict/main.nf
+@@ -12,9 +12,9 @@
+     tuple val(dbmeta), path(db)
+ 
+     output:
+-    tuple val(meta), path("${prefix}")                   , emit: checkm2_output
+-    tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv
+-    path("versions.yml")                                 , emit: versions
++    tuple val(meta), path("${prefix}")                     , emit: checkm2_output
++    tuple val(meta), path("${prefix}/*.quality_report.tsv"), emit: checkm2_tsv
++    path("versions.yml")                                   , emit: versions
+ 
+     when:
+     task.ext.when == null || task.ext.when
+@@ -30,6 +30,8 @@
+         --threads ${task.cpus} \\
+         --database_path ${db} \\
+         ${args}
++
++    mv ${prefix}/quality_report.tsv ${prefix}/${prefix}.quality_report.tsv
+ 
+     cat <<-END_VERSIONS > versions.yml
+     "${task.process}":
+
+'modules/nf-core/checkm2/predict/tests/tags.yml' is unchanged
+'modules/nf-core/checkm2/predict/tests/main.nf.test' is unchanged
+'modules/nf-core/checkm2/predict/tests/main.nf.test.snap' is unchanged
+************************************************************
diff --git a/modules/nf-core/checkm2/predict/main.nf b/modules/nf-core/checkm2/predict/main.nf
index 25271ba..7a8f94f 100644
--- a/modules/nf-core/checkm2/predict/main.nf
+++ b/modules/nf-core/checkm2/predict/main.nf
@@ -12,9 +12,9 @@ process CHECKM2_PREDICT {
     tuple val(dbmeta), path(db)
 
     output:
-    tuple val(meta), path("${prefix}")                   , emit: checkm2_output
-    tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv
-    path("versions.yml")                                 , emit: versions
+    tuple val(meta), path("${prefix}")                     , emit: checkm2_output
+    tuple val(meta), path("${prefix}/*.quality_report.tsv"), emit: checkm2_tsv
+    path("versions.yml")                                   , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -31,6 +31,8 @@ process CHECKM2_PREDICT {
         --database_path ${db} \\
         ${args}
 
+    mv ${prefix}/quality_report.tsv ${prefix}/${prefix}.quality_report.tsv
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         checkm2: \$(checkm2 --version)
diff --git a/modules/nf-core/dastool/fastatocontig2bin/main.nf b/modules/nf-core/dastool/fastatocontig2bin/main.nf
deleted file mode 100644
index 371188c..0000000
--- a/modules/nf-core/dastool/fastatocontig2bin/main.nf
+++ /dev/null
@@ -1,52 +0,0 @@
-process DASTOOL_FASTATOCONTIG2BIN {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/das_tool:1.1.7--r43hdfd78af_0' :
-        'biocontainers/das_tool:1.1.7--r43hdfd78af_0' }"
-
-    input:
-    tuple val(meta), path(fasta)
-    val(extension)
-
-    output:
-    tuple val(meta), path("*.tsv"), emit: fastatocontig2bin
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def file_extension = extension ? extension : "fasta"
-    def clean_fasta = fasta.toString() - ".gz"
-    def decompress_fasta = fasta.toString() == clean_fasta ? "" : "gunzip -q -f $fasta"
-    """
-    $decompress_fasta
-
-    Fasta_to_Contig2Bin.sh \\
-        $args \\
-        -i . \\
-        -e $file_extension \\
-        > ${prefix}.tsv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' )
-    END_VERSIONS
-    """
-
-    stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    touch ${prefix}.tsv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' )
-    END_VERSIONS
-    """
-}
diff --git a/modules/nf-core/dastool/fastatocontig2bin/meta.yml b/modules/nf-core/dastool/fastatocontig2bin/meta.yml
deleted file mode 100644
index 97b140b..0000000
--- a/modules/nf-core/dastool/fastatocontig2bin/meta.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: dastool_fastatocontig2bin
-description: Helper script to convert a set of bins in fasta format to tabular scaffolds2bin
-  format
-keywords:
-  - binning
-  - das tool
-  - table
-  - de novo
-  - bins
-  - contigs
-  - assembly
-  - das_tool
-tools:
-  - dastool:
-      description: |
-        DAS Tool is an automated method that integrates the results
-        of a flexible number of binning algorithms to calculate an optimized, non-redundant
-        set of bins from a single assembly.
-      homepage: https://github.com/cmks/DAS_Tool
-      documentation: https://github.com/cmks/DAS_Tool
-      tool_dev_url: https://github.com/cmks/DAS_Tool
-      doi: "10.1038/s41564-018-0171-1"
-      licence: ["BSD"]
-      identifier: biotools:dastool
-input:
-  - - meta:
-        type: map
-        description: |
-          Groovy Map containing sample information
-          e.g. [ id:'test', single_end:false ]
-    - fasta:
-        type: file
-        description: Fasta of list of fasta files recommended to be gathered via with
-          .collect() of bins
-        pattern: "*.{fa,fa.gz,fas,fas.gz,fna,fna.gz,fasta,fasta.gz}"
-  - - extension:
-        type: string
-        description: Fasta file extension (fa | fas | fasta | ...), without .gz suffix,
-          if gzipped input.
-output:
-  - fastatocontig2bin:
-      - meta:
-          type: map
-          description: |
-            Groovy Map containing sample information
-            e.g. [ id:'test', single_end:false ]
-      - "*.tsv":
-          type: file
-          description: tabular contig2bin file for DAS tool input
-          pattern: "*.tsv"
-  - versions:
-      - versions.yml:
-          type: file
-          description: File containing software versions
-          pattern: "versions.yml"
-authors:
-  - "@maxibor"
-  - "@jfy133"
-maintainers:
-  - "@maxibor"
-  - "@jfy133"
diff --git a/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test b/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test
deleted file mode 100644
index f2901cb..0000000
--- a/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test
+++ /dev/null
@@ -1,83 +0,0 @@
-nextflow_process {
-
-    name "Test Process DASTOOL_FASTATOCONTIG2BIN"
-    script "../main.nf"
-    process "DASTOOL_FASTATOCONTIG2BIN"
-
-    tag "modules"
-    tag "modules_nfcore"
-    tag "dastool"
-    tag "dastool/fastatocontig2bin"
-    tag "metabat2/metabat2"
-    tag "metabat2/jgisummarizebamcontigdepths"
-
-    setup {
-        run("METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS") {
-            script "../../../metabat2/jgisummarizebamcontigdepths/main.nf"
-            process {
-                """
-                input[0] = [
-                    [ id:'test', single_end:false ], // meta map
-                    file(params.modules_testdata_base_path + "genomics/prokaryotes/bacteroides_fragilis/illumina/bam/test1.sorted.bam", checkIfExists: true),
-                    file(params.modules_testdata_base_path + "genomics/prokaryotes/bacteroides_fragilis/illumina/bam/test1.sorted.bam.bai", checkIfExists: true),
-                ]
-                """
-            }
-        }
-        run("METABAT2_METABAT2") {
-            script "../../../metabat2/metabat2/main.nf"
-            process {
-                """
-                input[0] = Channel.of([
-                    [ id:'test', single_end:false ],
-                    file(params.modules_testdata_base_path + "genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz", checkIfExists: true)
-                ]).join(METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth)
-                """
-            }
-        }
-    }
-
-    test("dastool fastatocontig2bin - bacteroides fragilis") {
-
-        when {
-            process {
-                """
-                input[0] = METABAT2_METABAT2.out.fasta.collect()
-                input[1] = 'fa'
-                """
-            }
-        }
-
-        then {
-            assertAll(
-                { assert process.success },
-                { assert snapshot(process.out).match() }
-            )
-        }
-
-    }
-
-    test("dastool fastatocontig2bin - bacteroides fragilis - stub") {
-
-        options "-stub"
-
-        when {
-            process {
-                """
-                input[0] = METABAT2_METABAT2.out.fasta.collect()
-                input[1] = 'fa'
-                """
-            }
-        }
-
-        then {
-            assertAll(
-                { assert process.success },
-                { assert snapshot(process.out).match() }
-            )
-        }
-
-    }
-
-}
-
diff --git a/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test.snap b/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test.snap
deleted file mode 100644
index 32a05ba..0000000
--- a/modules/nf-core/dastool/fastatocontig2bin/tests/main.nf.test.snap
+++ /dev/null
@@ -1,72 +0,0 @@
-{
-    "dastool fastatocontig2bin - bacteroides fragilis - stub": {
-        "content": [
-            {
-                "0": [
-                    [
-                        {
-                            "id": "test",
-                            "single_end": false
-                        },
-                        "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
-                    ]
-                ],
-                "1": [
-                    "versions.yml:md5,902d9b18eee9c90141d379e80c1297b9"
-                ],
-                "fastatocontig2bin": [
-                    [
-                        {
-                            "id": "test",
-                            "single_end": false
-                        },
-                        "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
-                    ]
-                ],
-                "versions": [
-                    "versions.yml:md5,902d9b18eee9c90141d379e80c1297b9"
-                ]
-            }
-        ],
-        "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "24.10.2"
-        },
-        "timestamp": "2024-12-10T09:12:48.89685928"
-    },
-    "dastool fastatocontig2bin - bacteroides fragilis": {
-        "content": [
-            {
-                "0": [
-                    [
-                        {
-                            "id": "test",
-                            "single_end": false
-                        },
-                        "test.tsv:md5,6e46c0be14dded7cb13af38f54feea47"
-                    ]
-                ],
-                "1": [
-                    "versions.yml:md5,902d9b18eee9c90141d379e80c1297b9"
-                ],
"fastatocontig2bin": [ - [ - { - "id": "test", - "single_end": false - }, - "test.tsv:md5,6e46c0be14dded7cb13af38f54feea47" - ] - ], - "versions": [ - "versions.yml:md5,902d9b18eee9c90141d379e80c1297b9" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.2" - }, - "timestamp": "2024-12-05T13:56:41.454584667" - } -} \ No newline at end of file diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 22a1087..edeea4d 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -56,7 +56,7 @@ process GTDBTK_CLASSIFYWF { --gtdbtk_output_dir . \\ ${bac_md} \\ ${ar_md} \\ - --gtdbtk_prefix ${prefix} \\ + --gtdbtk_prefix gtdbtk.${prefix} \\ --output_file gtdbtk.${prefix}_ncbi.tsv fi diff --git a/nextflow.config b/nextflow.config index cca9e0f..bfe0155 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,12 +43,16 @@ params { enable_taxonomy = true enable_gtdbtk = true gtdbtk_db = null - gtdb_ar53_metadata = null - gtdb_bac120_metadata = null + gtdbtk_mash_db = null + gtdb_ar53_metadata = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz" + gtdb_bac120_metadata = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz" gtdbtk_min_completeness = 50 gtdbtk_max_contamination = 10 ncbi_taxonomy_dir = null + // Summary options + enable_summary = true + // MultiQC options multiqc_config = null multiqc_title = null @@ -194,7 +198,6 @@ profiles { includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" // Load sanger-tol/longreadmag custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs // includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/longreadmag.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index db8e13f..699c561 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,7 +18,7 @@ "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.(yaml|yml)$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", + "description": "Path to YAML file describing the input data. TODO: define yaml", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, @@ -38,138 +38,177 @@ "assembly_options": { "title": "Assembly options", "type": "object", - "description": "", + "description": "Options relating to metagenomic de-novo assembly.", "default": "", "properties": { "enable_assembly": { "type": "boolean", - "default": true + "default": true, + "description": "Enable de-novo assembly of HiFi reads." }, "enable_metamdbg": { "type": "boolean", - "default": true + "default": true, + "description": "Enable assembly with metaMDBG." 
                 }
             }
         },
         "binning_options": {
             "title": "Binning options",
             "type": "object",
-            "description": "",
+            "description": "Options relating to binning of de-novo metagenome assemblies.",
             "default": "",
             "properties": {
                 "enable_binning": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable binning of metagenomes into genome bins."
                 },
                 "enable_metabat2": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable binning with MetaBAT2."
                 },
                 "enable_maxbin2": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable binning with MaxBin2."
                 },
                 "enable_bin3c": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable binning with bin3C."
                 },
                 "enable_metator": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable binning with MetaTOR."
                 }
             }
         },
         "bin_refinement_options": {
             "title": "Bin refinement options",
             "type": "object",
-            "description": "",
+            "description": "Options relating to the refinement of genome bins.",
             "default": "",
             "properties": {
                 "enable_bin_refinement": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable refinement of genome bins."
                 },
                 "enable_dastool": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable bin refinement using DAS_Tool."
                 },
                 "enable_magscot": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable bin refinement using MAGScoT."
                 },
                 "hmm_gtdb_pfam": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "Path to the GTDB r207 Pfam HMM database."
                 },
                 "hmm_gtdb_tigrfam": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "Path to the GTDB r207 TIGRFAM HMM database."
                 }
             }
         },
         "pipeline_options": {
             "title": "Pipeline options",
             "type": "object",
-            "description": "",
+            "description": "General options relating to pipeline operation.",
             "default": "",
             "properties": {
+                "enable_summary": {
+                    "type": "boolean",
+                    "default": true,
+                    "description": "Produce a final summary file describing each genome bin."
+                },
                 "collate_bins": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "For post-binning steps, collate all bins together so they operate as a single group. May save time when working with small numbers of bins."
                 }
             }
         },
         "bin_qc_options": {
             "title": "Bin QC options",
             "type": "object",
-            "description": "",
+            "description": "Options relating to the quality checking of genome bins.",
             "default": "",
             "properties": {
                 "enable_binqc": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable quality checks of genome bins."
                 },
                 "enable_checkm2": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable QC using CheckM2."
                 },
                 "checkm2_db_version": {
                     "type": "string",
-                    "default": 5571251
+                    "default": "5571251",
+                    "description": "Zenodo record ID of the CheckM2 database to download when no local database is provided."
                 },
                 "checkm2_local_db": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "Path to a local DIAMOND database file for CheckM2."
                 }
             }
         },
         "bin_taxonomy_options": {
             "title": "Bin taxonomy options",
             "type": "object",
-            "description": "",
+            "description": "Options relating to the taxonomic classification of genome bins.",
             "default": "",
             "properties": {
                 "enable_taxonomy": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable taxonomic classification of genome bins."
                 },
                 "enable_gtdbtk": {
                     "type": "boolean",
-                    "default": true
+                    "default": true,
+                    "description": "Enable taxonomic classification with GTDB-Tk."
                 },
                 "gtdbtk_db": {
-                    "type": "string"
+                    "type": "string",
+                    "description": "Path to a directory containing the GTDB-Tk database."
                 },
                 "gtdb_bac120_metadata": {
-                    "type": "string"
+                    "type": "string",
+                    "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz",
+                    "description": "Path to the GTDB bac120 metadata file."
                 },
                 "gtdb_ar53_metadata": {
-                    "type": "string"
+                    "type": "string",
+                    "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz",
+                    "description": "Path to the GTDB ar53 metadata file."
+                },
+                "gtdbtk_mash_db": {
+                    "type": "string",
+                    "description": "Path to a Mash database built from the GTDB genomes."
                 },
                 "gtdbtk_min_completeness": {
                     "type": "integer",
-                    "default": 50
+                    "default": 50,
+                    "description": "Minimum CheckM2 completeness required for a bin to be classified by GTDB-Tk."
                 },
                 "gtdbtk_max_contamination": {
                     "type": "integer",
-                    "default": 10
+                    "default": 10,
+                    "description": "Maximum CheckM2 contamination allowed for a bin to be classified by GTDB-Tk."
+                },
+                "ncbi_taxonomy_dir": {
+                    "type": "string",
+                    "description": "Path to directory containing the names.dmp, nodes.dmp, delnodes.dmp, and merged.dmp files from the NCBI taxdump archive (ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz)"
                 }
             }
         },
diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf
index 7acdbb2..28dd461 100644
--- a/subworkflows/local/bin_qc.nf
+++ b/subworkflows/local/bin_qc.nf
@@ -5,7 +5,6 @@ include { SEQKIT_STATS } from '../../modules/nf-core/seqkit/stats/ma
 
 workflow BIN_QC {
     take:
     bins
-    aa_bins
 
     main:
     ch_versions = Channel.empty()
@@ -20,7 +19,7 @@ workflow BIN_QC {
         )
     }
 
-    CHECKM2_PREDICT(aa_bins, ch_checkm2_db)
+    CHECKM2_PREDICT(bins, ch_checkm2_db)
 
     ch_versions = ch_versions
         | mix(
diff --git a/subworkflows/local/bin_refinement.nf b/subworkflows/local/bin_refinement.nf
index ffc3c5a..04f0a51 100644
--- a/subworkflows/local/bin_refinement.nf
+++ b/subworkflows/local/bin_refinement.nf
@@ -1,10 +1,9 @@
 include { DASTOOL_DASTOOL                          } from '../../modules/nf-core/dastool/dastool/main'
 include { GAWK as GAWK_PROCESS_HMM_TBLOUT          } from '../../modules/nf-core/gawk/main'
 include { GAWK as GAWK_MAGSCOT_PROCESS_CONTIG2BIN  } from '../../modules/nf-core/gawk/main'
-include { GAWK as GAWK_RENAME_DASTOOL_BINS         } from '../../modules/nf-core/gawk/main'
+include { GAWK as GAWK_RENAME_BINS                 } from '../../modules/nf-core/gawk/main'
 include { HMMER_HMMSEARCH                          } from '../../modules/nf-core/hmmer/hmmsearch/main'
-include { CONTIG2BIN2FASTA as MAGSCOT_BINS         } from '../../modules/local/contig2bin2fasta/main'
-include { CONTIG2BIN2FASTA as DASTOOL_BINS         } from '../../modules/local/contig2bin2fasta/main'
+include { CONTIG2BINTOFASTA                        } from '../../modules/local/contig2bintofasta/main'
 include { MAGSCOT_MAGSCOT                          } from '../../modules/local/magscot/magscot/main'
 
 workflow BIN_REFINEMENT {
@@ -14,9 +13,9 @@ workflow BIN_REFINEMENT {
     contig2bin
 
     main:
-    ch_versions           = Channel.empty()
-    ch_refined_bins       = Channel.empty()
-    ch_refined_contig2bin = Channel.empty()
+    ch_versions               = Channel.empty()
+    ch_refined_bins           = Channel.empty()
+    ch_refined_contig2bin_raw = Channel.empty()
 
     if(params.enable_dastool) {
         ch_contig2bins_to_merge = contig2bin
@@ -25,34 +24,15 @@ workflow BIN_REFINEMENT {
 
         ch_dastool_input = assemblies
             | combine(ch_contig2bins_to_merge, by: 0)
-            | combine(proteins)
+            | combine(proteins, by: 0)
 
         DASTOOL_DASTOOL(ch_dastool_input, [])
         ch_versions = ch_versions.mix(DASTOOL_DASTOOL.out.versions)
 
-        //
-        // LOGIC: DAS_Tool does not rename the bins it outputs if the contigs
-        //        in them do not change - this causes issues with file collisions
-        //        downstream. Rename the bins inside the contig2bin script and
-        //        write to fasta separately
-        //
-        GAWK_RENAME_DASTOOL_BINS(DASTOOL_DASTOOL.out.contig2bin, [])
-        ch_versions = ch_versions.mix(GAWK_RENAME_DASTOOL_BINS.out.versions)
-
-        ch_dastool_bin_input = assemblies
-            | combine(GAWK_RENAME_DASTOOL_BINS.out.output, by: 0)
-
-        ch_refined_contig2bin = ch_refined_contig2bin
-            | mix(GAWK_RENAME_DASTOOL_BINS.out.output)
-
-        // emit dastool bins as fasta
-        DASTOOL_BINS(ch_dastool_bin_input, false)
-        ch_versions = ch_versions.mix(DASTOOL_BINS.out.versions)
+        ch_dastool_c2b = DASTOOL_DASTOOL.out.contig2bin
+            | map { meta, c2b -> [ meta + [binner: "dastool"], c2b ]}
 
-        ch_dastool_bins = DASTOOL_BINS.out.bins
-            | map { meta, fasta -> [ meta + [binner: "DASTool"], fasta ]}
-
-        ch_refined_bins = ch_refined_bins.mix(ch_dastool_bins)
+        ch_refined_contig2bin_raw = ch_refined_contig2bin_raw.mix(ch_dastool_c2b)
     }
 
     if(params.enable_magscot) {
@@ -82,8 +62,8 @@ workflow BIN_REFINEMENT {
         ch_versions = ch_versions.mix(GAWK_PROCESS_HMM_TBLOUT.out.versions)
 
         //
-        // LOGIC: the contig2bin files taken by MagScoT are in bin\tcontig format
-        //        rather than contig\tbin format
+        // LOGIC: the contig2bin files taken by MAGScoT are in bin\tcontig\tbinner
+        //        format rather than contig\tbin format
         //
         GAWK_MAGSCOT_PROCESS_CONTIG2BIN(
             contig2bin,
@@ -92,9 +72,9 @@ workflow BIN_REFINEMENT {
         ch_versions = ch_versions.mix(GAWK_MAGSCOT_PROCESS_CONTIG2BIN.out.versions)
 
         //
-        // LOGIC: Run MagScoT and write the bins to FASTA
+        // LOGIC: Run MAGScoT
         //
-        ch_magscot_contig2bin = MAGSCOT_PROCESS_CONTIG2BIN.out.output
+        ch_magscot_contig2bin = GAWK_MAGSCOT_PROCESS_CONTIG2BIN.out.output
             | map { meta, c2b -> [ meta - meta.subMap(['binner']), c2b ] }
             | groupTuple(by: 0)
 
@@ -104,19 +84,33 @@ workflow BIN_REFINEMENT {
         MAGSCOT_MAGSCOT(ch_magscot_input)
         ch_versions = ch_versions.mix(MAGSCOT_MAGSCOT.out.versions)
 
-        ch_refined_contig2bin = ch_refined_contig2bin
-            | mix(MAGSCOT_MAGSCOT.out.contig2bin)
+        ch_magscot_c2b = MAGSCOT_MAGSCOT.out.contig2bin
+            | map { meta, c2b -> [ meta + [binner: "magscot"], c2b ]}
+
+        ch_refined_contig2bin_raw = ch_refined_contig2bin_raw.mix(ch_magscot_c2b)
+    }
+
+    //
+    // LOGIC: DAS_Tool and MAGScoT do not give control over the names of the bins
+    //        they output - this causes issues with file collisions and expected name conventions
+    //        downstream. Rename the bins inside the contig2bin file and write to fasta separately
+    //
+    if(params.enable_dastool || params.enable_magscot) {
+        GAWK_RENAME_BINS(ch_refined_contig2bin_raw, [])
+        ch_versions = ch_versions.mix(GAWK_RENAME_BINS.out.versions)
+        ch_refined_contig2bin = GAWK_RENAME_BINS.out.output
 
-        ch_magscot_contig2bin2fasta_input = assemblies
-            | combine(MAGSCOT_MAGSCOT.out.contig2bin, by: 0)
+        ch_c2b_to_combine = GAWK_RENAME_BINS.out.output
+            | map { meta, c2b -> [ meta - meta.subMap("binner"), meta, c2b ]}
 
-        MAGSCOT_BINS(ch_magscot_contig2bin2fasta_input, false)
-        ch_versions = ch_versions.mix(MAGSCOT_BINS.out.versions)
+        ch_contig2bintofasta_input = assemblies
+            | combine(ch_c2b_to_combine, by: 0)
+            | map { meta, contigs, meta_c2b, c2b -> [ meta_c2b, contigs, c2b ]}
 
-        ch_magscot_bins = MAGSCOT_BINS.out.bins
-            | map { meta, rbins -> [ meta + [binner: "magscot"], rbins] }
+        CONTIG2BINTOFASTA(ch_contig2bintofasta_input)
+        ch_versions = ch_versions.mix(CONTIG2BINTOFASTA.out.versions)
 
-        ch_refined_bins = ch_refined_bins.mix(ch_magscot_bins)
+        ch_refined_bins = CONTIG2BINTOFASTA.out.bins
     }
 
     emit:
diff --git a/subworkflows/local/bin_taxonomy.nf b/subworkflows/local/bin_taxonomy.nf
index 0ca3802..6262965 100644
--- a/subworkflows/local/bin_taxonomy.nf
+++ b/subworkflows/local/bin_taxonomy.nf
@@ -25,12 +25,12 @@ workflow BIN_TAXONOMY {
         | map { meta, row ->
             def completeness = Double.parseDouble(row.'Completeness')
             def contamination = Double.parseDouble(row.'Contamination')
-            [row.'Name' + ".fa", completeness, contamination]
+            [row.'Name', completeness, contamination]
         }
 
     ch_filtered_bins = bins
         | transpose()
-        | map { meta, bin -> [bin.getName(), bin, meta]}
+        | map { meta, bin -> [bin.getBaseName(), bin, meta]}
         | join(ch_bin_scores, failOnDuplicate: true)
         | filter { // it[3] = completeness, it[4] = contamination
             it[3] >= params.gtdbtk_min_completeness && it[4] <= params.gtdbtk_max_contamination
@@ -44,16 +44,16 @@ workflow BIN_TAXONOMY {
     if(params.enable_gtdbtk && params.gtdbtk_db) {
         ch_gtdbtk_db = Channel.of(file(params.gtdbtk_db, checkIfExists: true).listFiles())
             | collect
             | map { ["gtdb", it] }
-        ch_gtdb_bac120_metadata = params.gtdb_bac120_metadata ? Channel.of(file(params.gtdb_bac120_metadata)) : []
-        ch_gtdb_ar53_metadata   = params.gtdb_ar53_metadata ? Channel.of(file(params.gtdb_ar53_metadata)) : []
+
+        ch_gtdbtk_mash_db = params.gtdbtk_mash_db ? file(params.gtdbtk_mash_db) : []
 
         GTDBTK_CLASSIFYWF(
             ch_filtered_bins,
             ch_gtdbtk_db,
             false,
-            [],
-            ch_gtdb_bac120_metadata,
-            ch_gtdb_ar53_metadata
+            ch_gtdbtk_mash_db,
+            file(params.gtdb_bac120_metadata),
+            file(params.gtdb_ar53_metadata)
         )
         ch_versions = ch_versions.mix(GTDBTK_CLASSIFYWF.out.versions)
         ch_gtdb_summary = ch_gtdb_summary.mix(GTDBTK_CLASSIFYWF.out.summary)
diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf
index 39bab3d..2a7e911 100644
--- a/subworkflows/local/binning.nf
+++ b/subworkflows/local/binning.nf
@@ -1,6 +1,6 @@
 include { BIN3C_MKMAP                     } from '../../modules/local/bin3c/mkmap/main.nf'
 include { BIN3C_CLUSTER                   } from '../../modules/local/bin3c/cluster/main.nf'
-include { DASTOOL_FASTATOCONTIG2BIN       } from '../../modules/nf-core/dastool/fastatocontig2bin/main.nf'
+include { FASTATOCONTIG2BIN               } from '../../modules/local/fastatocontig2bin/main.nf'
 include { MAXBIN2                         } from '../../modules/nf-core/maxbin2/main'
 include { GAWK as GAWK_MAXBIN2_DEPTHS     } from '../../modules/nf-core/gawk/main'
 include { METABAT2_METABAT2               } from '../../modules/nf-core/metabat2/metabat2/main'
@@ -94,9 +94,9 @@ workflow BINNING {
     //
     // LOGIC: Process all outputs into contig2bin format
     //
-    DASTOOL_FASTATOCONTIG2BIN(ch_bins, 'fa')
-    ch_contig2bin = ch_contig2bin.mix(DASTOOL_FASTATOCONTIG2BIN.out.fastatocontig2bin)
-    ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN.out.versions)
+    FASTATOCONTIG2BIN(ch_bins, 'fa')
+    ch_contig2bin = ch_contig2bin.mix(FASTATOCONTIG2BIN.out.contig2bin)
+    ch_versions = ch_versions.mix(FASTATOCONTIG2BIN.out.versions)
 
     emit:
     bins = ch_bins
diff --git a/subworkflows/local/utils_nfcore_longreadmag_pipeline/main.nf b/subworkflows/local/utils_nfcore_longreadmag_pipeline/main.nf
index 0c8c323..db12dc3 100644
--- a/subworkflows/local/utils_nfcore_longreadmag_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_longreadmag_pipeline/main.nf
@@ -10,6 +10,7 @@
 include { UTILS_NFSCHEMA_PLUGIN   } from '../../nf-core/utils_nfschema_plugin'
 include { completionSummary       } from '../../nf-core/utils_nfcore_pipeline'
+include { paramsSummaryMap        } from 'plugin/nf-schema'
 include { UTILS_NFCORE_PIPELINE   } from '../../nf-core/utils_nfcore_pipeline'
 include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline'
 include { YAML_INPUT              } from '../yaml_input.nf'
@@ -84,7 +85,6 @@ workflow PIPELINE_COMPLETION {
     take:
     outdir            // path: Path to output directory where results will be published
     monochrome_logs   // boolean: Disable ANSI colour codes in log output
-    multiqc_report    // string: Path to MultiQC report
 
     main:
diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
index 0fcbf7b..d6e593e 100644
--- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf
@@ -92,10 +92,12 @@ def checkCondaChannels() {
         channels = config.channels
     }
     catch (NullPointerException e) {
+        log.debug(e)
         log.warn("Could not verify conda channel configuration.")
         return null
     }
     catch (IOException e) {
+        log.debug(e)
         log.warn("Could not verify conda channel configuration.")
         return null
     }
diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test
index ca964ce..02dbf09 100644
--- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test
@@ -52,10 +52,12 @@ nextflow_workflow {
         }
 
         then {
-            assertAll(
-                { assert workflow.success },
-                { assert workflow.stdout.contains("nextflow_workflow v9.9.9") }
-            )
+            expect {
+                with(workflow) {
+                    assert success
+                    assert "nextflow_workflow v9.9.9" in stdout
+                }
+            }
         }
     }
diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
index 5cb7baf..bfd2587 100644
--- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
@@ -56,21 +56,6 @@ def checkProfileProvided(nextflow_cli_args) {
     }
 }
 
-//
-// Citation string for pipeline
-//
-def workflowCitation() {
-    def temp_doi_ref = ""
-    def manifest_doi = workflow.manifest.doi.tokenize(",")
-    // Handling multiple DOIs
-    // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
-    // Removing ` ` since the manifest.doi is a string and not a proper list
-    manifest_doi.each { doi_ref ->
-        temp_doi_ref += "  https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n"
-    }
-    return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + temp_doi_ref + "\n" + "* The nf-core framework\n" + "  https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + "  https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md"
-}
-
 //
 // Generate workflow version string
 //
@@ -150,33 +135,6 @@ def paramsSummaryMultiqc(summary_params) {
     return yaml_file_text
 }
 
-//
-// nf-core logo
-//
-def nfCoreLogo(monochrome_logs=true) {
-    def colors = logColours(monochrome_logs) as Map
-    String.format(
-        """\n
-        ${dashedLine(monochrome_logs)}
-                                                ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset}
-        ${colors.blue}        ___     __   __   __   ___     ${colors.green}/,-._.--~\'${colors.reset}
-        ${colors.blue}  |\\ | |__  __ /  ` /  \\ |__) |__         ${colors.yellow}}  {${colors.reset}
-        ${colors.blue}  | \\| |       \\__, \\__/ |  \\ |___     ${colors.green}\\`-._,-`-,${colors.reset}
-                                                ${colors.green}`._,._,\'${colors.reset}
-        ${colors.purple}  ${workflow.manifest.name} ${getWorkflowVersion()}${colors.reset}
-        ${dashedLine(monochrome_logs)}
-        """.stripIndent()
-    )
-}
-
-//
-// Return dashed line
-//
-def dashedLine(monochrome_logs=true) {
-    def colors = logColours(monochrome_logs) as Map
-    return "-${colors.dim}----------------------------------------------------${colors.reset}-"
-}
-
 //
 // ANSII colours used for terminal logging
 //
@@ -245,28 +203,24 @@ def logColours(monochrome_logs=true) {
     return colorcodes
 }
 
 //
-// Attach the multiqc report to email
+// Return a single report from an object that may be a Path or List
 //
-def attachMultiqcReport(multiqc_report) {
-    def mqc_report = null
-    try {
-        if (workflow.success) {
-            mqc_report = multiqc_report.getVal()
-            if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) {
-                if (mqc_report.size() > 1) {
-                    log.warn("[${workflow.manifest.name}] Found multiple reports from process 'MULTIQC', will use only one")
-                }
-                mqc_report = mqc_report[0]
-            }
+def getSingleReport(multiqc_reports) {
+    if (multiqc_reports instanceof Path) {
+        return multiqc_reports
+    } else if (multiqc_reports instanceof List) {
+        if (multiqc_reports.size() == 0) {
+            log.warn("[${workflow.manifest.name}] No reports found from process 'MULTIQC'")
+            return null
+        } else if (multiqc_reports.size() == 1) {
+            return multiqc_reports.first()
+        } else {
+            log.warn("[${workflow.manifest.name}] Found multiple reports from process 'MULTIQC', will use only one")
+            return multiqc_reports.first()
         }
+    } else {
+        return null
     }
-    catch (Exception all) {
-        if (multiqc_report) {
-            log.warn("[${workflow.manifest.name}] Could not attach MultiQC report to summary email")
-        }
-    }
-    return mqc_report
 }
 
@@ -320,7 +274,7 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi
     email_fields['summary'] = summary << misc_fields
 
     // On success try attach the multiqc report
-    def mqc_report = attachMultiqcReport(multiqc_report)
+    def mqc_report = getSingleReport(multiqc_report)
 
     // Check if we are only sending emails on failure
     def email_address = email
@@ -340,7 +294,7 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi
     def email_html = html_template.toString()
 
     // Render the sendmail template
-    def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit
+    def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? params.max_multiqc_email_size : 0) as MemoryUnit
     def smail_fields = [email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes()]
     def sf = new File("${workflow.projectDir}/assets/sendmail_template.txt")
     def sendmail_template = engine.createTemplate(sf).make(smail_fields)
@@ -351,14 +305,17 @@ def completionEmail(summary_params, email, email_on_fail, plaintext_email, outdi
     if (email_address) {
         try {
             if (plaintext_email) {
-new org.codehaus.groovy.GroovyException('Send plaintext e-mail, not HTML')            }
+                new org.codehaus.groovy.GroovyException('Send plaintext e-mail, not HTML')
+            }
             // Try to send HTML e-mail using sendmail
             def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html")
             sendmail_tf.withWriter { w -> w << sendmail_html }
             ['sendmail', '-t'].execute() << sendmail_html
             log.info("-${colors.purple}[${workflow.manifest.name}]${colors.green} Sent summary e-mail to ${email_address} (sendmail)-")
         }
-        catch (Exception all) {
+        catch (Exception msg) {
+            log.debug(msg.toString())
+            log.debug("Trying with mail instead of sendmail")
             // Catch failures and try with plaintext
             def mail_cmd = ['mail', '-s', subject, '--content-type=text/html', email_address]
             mail_cmd.execute() << email_html
diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test
index 1dc317f..f117040 100644
--- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test
+++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test
@@ -41,26 +41,14 @@ nextflow_function {
         }
     }
 
-    test("Test Function workflowCitation") {
-
-        function "workflowCitation"
-
-        then {
-            assertAll(
-                { assert function.success },
-                { assert snapshot(function.result).match() }
-            )
-        }
-    }
-
-    test("Test Function nfCoreLogo") {
+    test("Test Function without logColours") {
 
-        function "nfCoreLogo"
+        function "logColours"
 
         when {
             function {
                 """
-                input[0] = false
+                input[0] = true
                 """
             }
         }
@@ -73,9 +61,8 @@ nextflow_function {
         }
     }
 
-    test("Test Function dashedLine") {
-
-        function "dashedLine"
+    test("Test Function with logColours") {
+        function "logColours"
 
         when {
             function {
@@ -93,14 +80,13 @@ nextflow_function {
         }
     }
 
-    test("Test Function without logColours") {
-
-        function "logColours"
+    test("Test Function getSingleReport with a single file") {
+        function "getSingleReport"
 
         when {
             function {
                 """
-                input[0] = true
+                input[0] = file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true)
                 """
             }
         }
 
@@ -108,18 +94,22 @@ nextflow_function {
         then {
             assertAll(
                 { assert function.success },
-                { assert snapshot(function.result).match() }
+                { assert function.result.contains("test.tsv") }
             )
         }
     }
 
-    test("Test Function with logColours") {
-        function "logColours"
+    test("Test Function getSingleReport with multiple files") {
+        function "getSingleReport"
 
         when {
             function {
                 """
-                input[0] = false
+                input[0] = [
+                    file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true),
+                    file(params.modules_testdata_base_path + '/generic/tsv/network.tsv', checkIfExists: true),
+                    file(params.modules_testdata_base_path + '/generic/tsv/expression.tsv', checkIfExists: true)
+                ]
                 """
             }
         }
 
@@ -127,7 +117,9 @@ nextflow_function {
         then {
             assertAll(
                 { assert function.success },
-                { assert snapshot(function.result).match() }
+                { assert function.result.contains("test.tsv") },
+                { assert !function.result.contains("network.tsv") },
+                { assert !function.result.contains("expression.tsv") }
             )
         }
     }
diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap
index 1037232..02c6701 100644
--- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap
+++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/main.function.nf.test.snap
@@ -17,26 +17,6 @@
         },
         "timestamp": "2024-02-28T12:02:59.729647"
     },
-    "Test Function nfCoreLogo": {
-        "content": [
-            "\n\n-\u001b[2m----------------------------------------------------\u001b[0m-\n                                        \u001b[0;32m,--.\u001b[0;30m/\u001b[0;32m,-.\u001b[0m\n\u001b[0;34m        ___     __   __   __   ___     \u001b[0;32m/,-._.--~'\u001b[0m\n\u001b[0;34m  |\\ | |__  __ /  ` /  \\ |__) |__         \u001b[0;33m}  {\u001b[0m\n\u001b[0;34m  | \\| |       \\__, \\__/ |  \\ |___     \u001b[0;32m\\`-._,-`-,\u001b[0m\n                                        \u001b[0;32m`._,._,'\u001b[0m\n\u001b[0;35m  nextflow_workflow v9.9.9\u001b[0m\n-\u001b[2m----------------------------------------------------\u001b[0m-\n"
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-02-28T12:03:10.562934"
-    },
-    "Test Function workflowCitation": {
-        "content": [
-            "If you use nextflow_workflow for your analysis please cite:\n\n* The pipeline\n  https://doi.org/10.5281/zenodo.5070524\n\n* The nf-core framework\n  https://doi.org/10.1038/s41587-020-0439-x\n\n* Software dependencies\n  https://github.com/nextflow_workflow/blob/master/CITATIONS.md"
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-02-28T12:03:07.019761"
-    },
     "Test Function without logColours": {
         "content": [
             {
@@ -95,16 +75,6 @@
         },
         "timestamp": "2024-02-28T12:03:17.969323"
     },
-    "Test Function dashedLine": {
-        "content": [
-            "-\u001b[2m----------------------------------------------------\u001b[0m-"
-        ],
-        "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
-        },
-        "timestamp": "2024-02-28T12:03:14.366181"
-    },
     "Test Function with logColours": {
         "content": [
             {
diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
index 842dc43..8fb3016 100644
--- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
+++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
@@ -42,7 +42,7 @@ nextflow_workflow {
 
             params {
                 test_data = ''
-                outdir    = 1
+                outdir    = null
             }
 
             workflow {
@@ -94,7 +94,7 @@ nextflow_workflow {
 
             params {
                 test_data = ''
-                outdir    = 1
+                outdir    = null
             }
 
             workflow {
diff --git a/workflows/longreadmag.nf b/workflows/longreadmag.nf
index 68cce29..2c13a6f 100644
--- a/workflows/longreadmag.nf
+++ b/workflows/longreadmag.nf
@@ -5,19 +5,19 @@
 */
 
 // include { MULTIQC } from '../modules/nf-core/multiqc/main'
-include { paramsSummaryMap       } from 'plugin/nf-schema'
-include { paramsSummaryMultiqc   } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_longreadmag_pipeline'
-include { ASSEMBLY               } from '../subworkflows/local/assembly'
-include { BINNING                } from '../subworkflows/local/binning'
-include { BIN_QC                 } from '../subworkflows/local/bin_qc.nf'
-include { BIN_TAXONOMY           } from '../subworkflows/local/bin_taxonomy'
-include { BIN_REFINEMENT         } from '../subworkflows/local/bin_refinement'
-include { BIN_SUMMARY            } from '../modules/local/bin_summary'
-include { CONTIG2BIN2FASTA as BINS_TO_PROTEIN } from '../modules/local/contig2bin2fasta'
-include { PREPARE_DATA           } from '../subworkflows/local/prepare_data'
-include { READ_MAPPING           } from '../subworkflows/local/read_mapping'
+include { paramsSummaryMap                      } from 'plugin/nf-schema'
+include { paramsSummaryMultiqc                  } from '../subworkflows/nf-core/utils_nfcore_pipeline'
+include { softwareVersionsToYAML                } from '../subworkflows/nf-core/utils_nfcore_pipeline'
+include { methodsDescriptionText                } from '../subworkflows/local/utils_nfcore_longreadmag_pipeline'
+include { ASSEMBLY                              } from '../subworkflows/local/assembly'
+include { BINNING                               } from '../subworkflows/local/binning'
+include { BIN_QC                                } from '../subworkflows/local/bin_qc.nf'
+include { BIN_TAXONOMY                          } from '../subworkflows/local/bin_taxonomy'
+include { BIN_REFINEMENT                        } from '../subworkflows/local/bin_refinement'
+include { BIN_SUMMARY                           } from '../modules/local/bin_summary'
+include { CONTIG2BINTOFASTA as BINS_TO_PROTEIN  } from '../modules/local/contig2bintofasta'
+include { PREPARE_DATA                          } from '../subworkflows/local/prepare_data'
+include { READ_MAPPING                          } from '../subworkflows/local/read_mapping'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -77,20 +77,6 @@ workflow LONGREADMAG {
         ch_contig2bins = BINNING.out.contig2bin
             | mix(BIN_REFINEMENT.out.contig2bin)
 
-        //
-        // LOGIC: Convert nucleotide bins to amino acid bins using the
-        //        predicted proteins from pyrodigal as many downstream processes
-        //        repeat protein prediction
-        //
-        ch_c2b_to_join = ch_contig2bins
-            | map { meta, c2b -> [meta - meta.subMap("binner"), meta, c2b] }
-
-        ch_bin_to_protein_input = ch_c2b_to_join
-            | combine(ch_proteins, by: 0)
-            | map { meta, meta_c2b, c2b, faa -> [ meta_c2b, faa, c2b ] }
-
-        BINS_TO_PROTEIN(ch_bin_to_protein_input, true)
-        ch_aa_bins = BINS_TO_PROTEIN.out.bins
-
         //
         // LOGIC: (optional) collate bins from different binning steps into
         //        single input to reduce redundant high-memory processes
@@ -102,21 +88,14 @@ workflow LONGREADMAG {
             }
             | transpose
            | groupTuple(by: 0)
-
-            ch_aa_bins = ch_aa_bins
-                | map { meta, bins ->
-                    [ meta.subMap("id") + [assembler: "all"] + [binner: "all"], bins]
-                }
-                | transpose
-                | groupTuple(by: 0)
         }
 
         if(params.enable_binqc) {
-            BIN_QC(ch_bins, ch_aa_bins)
+            BIN_QC(ch_bins)
             ch_versions = ch_versions.mix(BIN_QC.out.versions)
 
             if(params.enable_taxonomy) {
-                BIN_TAXONOMY(ch_aa_bins, BIN_QC.out.checkm2_tsv)
+                BIN_TAXONOMY(ch_bins, BIN_QC.out.checkm2_tsv)
                 ch_versions = ch_versions.mix(BIN_TAXONOMY.out.versions)
 
                 ch_taxonomy_tsv = BIN_TAXONOMY.out.gtdb_ncbi
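
Illustrative sketch (not part of the patch): the refined-bin handling above works in two steps — GAWK_RENAME_BINS rewrites each contig2bin table with sequential, collision-free bin names, and CONTIG2BINTOFASTA then extracts one FASTA per bin with seqkit. The same logic can be exercised standalone as below; the sample/assembler/binner names (sample1, metamdbg, dastool), contig2bin.tsv, and assembly.fa are hypothetical placeholders, and the input table is assumed to be two-column contig<TAB>bin, with rows grouped by bin:

    # contig2bin.tsv: contig<TAB>bin, rows grouped by bin
    printf 'ctg_1\tbin.003\nctg_2\tbin.003\nctg_3\tbin.007\n' > contig2bin.tsv

    # Rename bins to sequential <id>_<assembler>_<binner>_N names (the awk body set in
    # ext.args2 for GAWK_RENAME_BINS); count increments whenever the bin column changes,
    # which is why the input must be grouped by bin
    awk -v FS='\t' '{OFS = FS} {if($2 != prev) { count++ }; print $1,"sample1_metamdbg_dastool_"count; prev = $2}' \
        contig2bin.tsv > renamed.tsv

    # Write one FASTA per renamed bin, mirroring CONTIG2BINTOFASTA: build a per-bin
    # contig list, then pull those records out of the assembly with seqkit grep -f
    awk '{print $2}' renamed.tsv | sort -u | while read bin
    do
        grep -w ${bin} renamed.tsv | awk '{ print $1 }' > ${bin}.ctglst
        seqkit grep -f ${bin}.ctglst assembly.fa > ${bin}.fa
    done

Renaming in the contig2bin table first, and only then materialising FASTA files, is what lets DAS_Tool and MAGScoT outputs share one extraction path without file-name collisions downstream.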