diff --git a/.github/workflows/workflow_modules.yml b/.github/workflows/workflow_modules.yml index f3491eca..068c472d 100644 --- a/.github/workflows/workflow_modules.yml +++ b/.github/workflows/workflow_modules.yml @@ -54,6 +54,7 @@ jobs: - name: Test EMGB import tools run: | ./bin/emgb.sh --output=output/test1 --runid=1 --binsdir=$(find output/test1/ -name "metabat") \ + --blastdb=uniref90 \ --db=${EMGB_KEGG_DB} \ --workdir="${WORK_DIR}_wFullPipeline" --name=test1 diff --git a/bin/emgb.sh b/bin/emgb.sh index f78806cc..71c196a6 100755 --- a/bin/emgb.sh +++ b/bin/emgb.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -VERSION=0.3.1 +VERSION=0.4.0 while [ $# -gt 0 ]; do case "$1" in @@ -19,6 +19,8 @@ while [ $# -gt 0 ]; do ;; --type=*) TYPE="${1#*=}" ;; + --blastdb=*) BLAST_DB="${1#*=}" + ;; --version) VERSION_CHECK=1 ;; --debug) DEBUG_CHECK=1 @@ -38,7 +40,7 @@ done function getGenes { - nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.ncbi_nr.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g') + nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.${BLAST_DB}.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g') tax=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.taxonomy.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g') ffn=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.ffn.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g') gff=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.gff.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g') @@ -103,6 +105,9 @@ help() echo " -- (e.g. X in the following example path fullPipelineOutput/SAMPLE/X/binning/) " echo " --binsdir -- directory of bins. If bin refinement was executed then the bin refinement output should be used." echo " -- (e.g. --binsdir=fullPipelineOutput/DRR066656/1/binning/0.4.0/metabat)" + echo " --blastdb -- Blast output that should be exported to emgb" + echo " -- (e.g. the folder name of BLAST_DB: output/test1/1/annotation/0.3.0/mmseqs2/BLAST_DB)" + echo " -- (Examples: bacmet20_predicted, ncbi_nr)" echo " --db -- emgb specific kegg database" echo " --name -- sample name, e.g. 
the SAMPLE in the paths above" echo " --type -- if other then Illumina: ONT/Hybrid" diff --git a/default/fullPipeline_illumina_nanpore.yml b/default/fullPipeline_illumina_nanpore.yml index d6d93253..d1bd4027 100644 --- a/default/fullPipeline_illumina_nanpore.yml +++ b/default/fullPipeline_illumina_nanpore.yml @@ -229,8 +229,11 @@ steps: defaultKingdom: false additionalParams: " --mincontiglen 500 " mmseqs2: + chunkSize: 20000 kegg: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: s3://databases_internal/kegg-mirror-2021-01_mmseqs.tar.zst @@ -238,29 +241,37 @@ steps: s5cmd: params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' vfdb: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/vfdb_full_2022_07_29.tar.zst md5sum: 7e32aaed112d6e056fb8764b637bf49e bacmet20_experimental: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst md5sum: 57a6d328486f0acd63f7e984f739e8fe bacmet20_predicted: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst md5sum: 55902401a765fc460c09994d839d9b64 - ncbi_nr: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + uniref90: + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: - source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/nr_2023-04-29_mmseqs_taxonomy.tar.zst - md5sum: 79b9fb6b3dada41e602d70e12e7351c2 + source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90_20231108_mmseqs.tar.zst + md5sum: 313f2c031361091af2d5f3c6f6f46013 rgi: # --include_loose includes matches of more distant homologs of AMR genes which may also report spurious partial matches # --include_nudge Partial ORFs may do not pass curated bitscore 
cut-offs or novel samples may contain divergent alleles, so nudging diff --git a/default/fullPipeline_illumina_nanpore_without_aggregate.yml b/default/fullPipeline_illumina_nanpore_without_aggregate.yml index b9b58525..1d645380 100644 --- a/default/fullPipeline_illumina_nanpore_without_aggregate.yml +++ b/default/fullPipeline_illumina_nanpore_without_aggregate.yml @@ -140,7 +140,9 @@ steps: additionalParams: " --mincontiglen 500 " mmseqs2: kegg: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: s3://databases_internal/kegg-mirror-2021-01_mmseqs.tar.zst @@ -148,29 +150,37 @@ steps: s5cmd: params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' vfdb: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/vfdb_full_2022_07_29.tar.zst md5sum: 7e32aaed112d6e056fb8764b637bf49e bacmet20_experimental: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst md5sum: 57a6d328486f0acd63f7e984f739e8fe bacmet20_predicted: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst md5sum: 55902401a765fc460c09994d839d9b64 - ncbi_nr: - params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + uniref90: + additionalParams: + search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 ' + additionalColumns: "" database: download: - source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/nr_2023-04-29_mmseqs_taxonomy.tar.zst - md5sum: 79b9fb6b3dada41e602d70e12e7351c2 + source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90_20231108_mmseqs.tar.zst + md5sum: 313f2c031361091af2d5f3c6f6f46013 mmseqs2_taxonomy: # Run taxonomy classification on MAGs and unbinable contigs or just the later runOnMAGs: true diff --git a/docker/toolkit-mmseqs2/Dockerfile 
b/docker/toolkit-mmseqs2/Dockerfile new file mode 100644 index 00000000..680d49e5 --- /dev/null +++ b/docker/toolkit-mmseqs2/Dockerfile @@ -0,0 +1,2 @@ +FROM ghcr.io/soedinglab/mmseqs2:15-6f452 +RUN apt update && apt install -y procps diff --git a/docker/toolkit-mmseqs2/VERSION b/docker/toolkit-mmseqs2/VERSION new file mode 100644 index 00000000..9de1d7d0 --- /dev/null +++ b/docker/toolkit-mmseqs2/VERSION @@ -0,0 +1 @@ +15-6f452-0 diff --git a/example_params/annotation.yml b/example_params/annotation.yml index 49dbbe9a..19992a81 100644 --- a/example_params/annotation.yml +++ b/example_params/annotation.yml @@ -11,8 +11,11 @@ steps: annotation: input: "test_data/annotation/input_small.tsv" mmseqs2: + chunkSize: 20000 kegg: - params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" database: extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB' # bacmet20_experimental: @@ -22,11 +25,14 @@ steps: # source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst # md5sum: 57a6d328486f0acd63f7e984f739e8fe bacmet20_predicted: - params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst md5sum: 55902401a765fc460c09994d839d9b64 + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" + # vfdb: # params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' # database: diff --git a/example_params/fullPipeline.yml b/example_params/fullPipeline.yml index fb81c108..0b398a1f 100644 --- a/example_params/fullPipeline.yml +++ b/example_params/fullPipeline.yml @@ -166,8 +166,11 @@ steps: defaultKingdom: false additionalParams: " --mincontiglen 200 " mmseqs2: + chunkSize: 20000 kegg: - params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" database: extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB' @@ -183,16 +186,23 @@ steps: # download: # source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst # md5sum: 57a6d328486f0acd63f7e984f739e8fe - ncbi_nr: - params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + uniref90: + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" + + database: download: - source: s3://databases/nr_2023-04-29_mmseqs_taxonomy/* - md5sum: 79b9fb6b3dada41e602d70e12e7351c2 + source: s3://databases/uniref90_20231108_mmseqs/* + md5sum: 313f2c031361091af2d5f3c6f6f46013 s5cmd: params: '--retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' + bacmet20_predicted: - params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 
--exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst diff --git a/example_params/fullPipelineIlluminaOrONT.yml b/example_params/fullPipelineIlluminaOrONT.yml index 5e44a4b8..85355b83 100644 --- a/example_params/fullPipelineIlluminaOrONT.yml +++ b/example_params/fullPipelineIlluminaOrONT.yml @@ -149,8 +149,11 @@ steps: defaultKingdom: false additionalParams: " --mincontiglen 200 " mmseqs2: + chunkSize: 20000 kegg: - params: ' -s 2 --exact-kmer-matching 1 ' + additionalParams: + search : ' -s 2 --exact-kmer-matching 1 ' + additionalColumns: "" database: extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB' # vfdb: @@ -166,7 +169,9 @@ steps: # source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst # md5sum: 57a6d328486f0acd63f7e984f739e8fe bacmet20_predicted: - params: ' -s 2 --exact-kmer-matching 1 ' + additionalParams: + search : ' -s 2 --exact-kmer-matching 1 ' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst diff --git a/example_params/fullPipelineONT.yml b/example_params/fullPipelineONT.yml index 6d48d014..dbf6658c 100644 --- a/example_params/fullPipelineONT.yml +++ b/example_params/fullPipelineONT.yml @@ -149,6 +149,7 @@ steps: defaultKingdom: false additionalParams: " --mincontiglen 200 " mmseqs2: + chunkSize: 20000 # vfdb: # params: ' -s 2 --exact-kmer-matching 1 ' # database: @@ -162,7 +163,9 @@ steps: # source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst # md5sum: 57a6d328486f0acd63f7e984f739e8fe bacmet20_predicted: - params: ' -s 2 --exact-kmer-matching 1 ' + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" database: download: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst diff --git a/example_params/fullPipeline_fraction/fullPipeline_fraction_binning_metabinner.yml b/example_params/fullPipeline_fraction/fullPipeline_fraction_binning_metabinner.yml index 1fcdb12f..df779af3 100644 --- a/example_params/fullPipeline_fraction/fullPipeline_fraction_binning_metabinner.yml +++ b/example_params/fullPipeline_fraction/fullPipeline_fraction_binning_metabinner.yml @@ -40,8 +40,11 @@ steps: kmerSize: 4 annotation: mmseqs2: + chunkSize: 7000 kegg: - params: ' -s 2 --exact-kmer-matching 1 ' + additionalParams: + search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3' + additionalColumns: "" database: extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB' resources: diff --git a/lib/Utils.groovy b/lib/Utils.groovy index fa23c986..37c6bed1 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -96,6 +96,7 @@ class Utils { } } } + static Object[] flattenTuple(tupl){ def chunkList = []; def SAMPLE_IDX = 0; @@ -106,6 +107,31 @@ class Utils { return chunkList; } + /* + * This method takes the number of entries of an input file (e.g. fasta entries in multi-fasta file), + * the maximum number of allowed entries per chunk and the actual input (e.g. file). + * It creates a list of indices of chunks of the input file based on the input parameters. 
+ */ + static List splitFilesIndex(seqCount, chunkSize, sample){ + int chunk=seqCount.intdiv(chunkSize) + if(seqCount.mod(chunkSize) != 0){ + chunk = chunk + 1 + } + def chunks = [] + for(def n : 1..chunk){ + int start = (n-1) * chunkSize + 1 + + int end = n * chunkSize + + if(end > seqCount){ + end=seqCount + } + chunks.add(sample + [start, end, chunk]) + } + return chunks + } + + static getMappingIdentityParam(medianQuality) { if(medianQuality > 17){ return 97 diff --git a/modules/annotation/module.nf b/modules/annotation/module.nf index 29b4425a..01b7993e 100644 --- a/modules/annotation/module.nf +++ b/modules/annotation/module.nf @@ -4,6 +4,7 @@ include { pDumpLogs } from '../utils/processes' import java.nio.file.Files import java.nio.file.Paths +import java.util.stream.Collectors mode = 'mode_not_set' @@ -23,7 +24,7 @@ def getOutput(SAMPLE, RUNID, TOOL, filename){ * See “/lib/Utils.groovy” for more information. **/ def constructParametersObject(String tool){ - return params?.steps?.annotation?."$tool".findAll({ it.key != "runOnMAGs" }).collect{ Utils.getDockerMount(it.value?.database, params, 'true')}.join(" ") + return params?.steps?.annotation?."$tool".findAll({ it.key != "runOnMAGs" && it.key != "chunkSize" }).collect{ Utils.getDockerMount(it.value?.database, params, 'true')}.join(" ") } @@ -58,25 +59,23 @@ process pMMseqs2 { // Another mount flag is used to get a key file (aws format) into the docker-container. // This file is then used by s5cmd. The necessary mount points are generated by “constructParametersObject()”. - containerOptions constructParametersObject("mmseqs2") - + containerOptions constructParametersObject("mmseqs2") + " --entrypoint='' " + tag "Sample: $sample, Database: $dbType" - label 'highmemLarge' + label 'large' secret { "${S3_DB_ACCESS}"!="" ? ["S3_${dbType}_ACCESS", "S3_${dbType}_SECRET"] : [] } - publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "mmseqs2/${dbType}", filename) }, \ - pattern: "{**.blast.tsv}" - when params?.steps.containsKey("annotation") && params?.steps.annotation.containsKey("mmseqs2") input: val(binType) - tuple val(sample), file(fasta), file(contig2GeneMapping), val(dbType), val(parameters), val(EXTRACTED_DB), val(DOWNLOAD_LINK), val(MD5SUM), val(S5CMD_PARAMS) - + tuple val(sample), file(fasta), file(contig2GeneMapping), val(start), val(stop), val(numberOfChunks), val(dbType), val(parameters), val(columns), val(EXTRACTED_DB), val(DOWNLOAD_LINK), val(MD5SUM), val(S5CMD_PARAMS) + output: - tuple val("${dbType}"), val("${sample}"), val("${binType}"), path("${sample}_${binType}.${dbType}.blast.tsv"), optional:true, emit: blast + tuple val("${dbType}"), val("${sample}"), val("${binType}"), val("${start}"), val("${stop}"), val("${numberOfChunks}"),\ + path("${sample}_${binType}.${start}.${stop}.${dbType}.blast.tsv"), optional:true, emit: blast tuple val("${sample}_${binType}"), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -84,78 +83,8 @@ process pMMseqs2 { output = getOutput("${sample}", params.runid, "mmseqs2/${dbType}", "") S3_DB_ACCESS=params.steps?.annotation?.mmseqs2?."${dbType}"?.database?.download?.s5cmd && S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_${dbType}_ACCESS" : "" S3_DB_SECRET=params.steps?.annotation?.mmseqs2?."${dbType}"?.database?.download?.s5cmd && S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? 
"\$S3_${dbType}_SECRET" : "" - ''' - mkdir -p !{params.polished.databases} - # if no local database is referenced, start download part - if [ -z "!{EXTRACTED_DB}" ] - then - # polished params end with a / so no additional one is needed at the end - DATABASE=!{params.polished.databases}mmseqs2 - mkdir -p ${DATABASE}/!{dbType} - LOCK_FILE=${DATABASE}/!{dbType}/lock.txt - - # Check if access and secret keys are necessary for s5cmd - if [ ! -z "!{S3_DB_ACCESS}" ] - then - export AWS_ACCESS_KEY_ID=!{S3_DB_ACCESS} - export AWS_SECRET_ACCESS_KEY=!{S3_DB_SECRET} - fi - - # Create and try to lock the given “LOCK_FILE”. If the file is locked no other process will download the same database simultaneously - # and wait until the database is downloaded. Depending on your database path (starts with s3:// | https:// | /../../.. ) - # one of the flock input commands is executed to download and decompress the database. - # If a database is present at the given path, checksums are compared, if they are identical the download will be omitted. - flock ${LOCK_FILE} concurrentDownload.sh --output=${DATABASE}/!{dbType} \ - --link=!{DOWNLOAD_LINK} \ - --httpsCommand="wget -qO- !{DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar -xv " \ - --s3FileCommand="s5cmd !{S5CMD_PARAMS} cat --concurrency !{task.cpus} !{DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar -xv " \ - --s3DirectoryCommand="s5cmd !{S5CMD_PARAMS} cp --concurrency !{task.cpus} !{DOWNLOAD_LINK} . " \ - --s5cmdAdditionalParams="!{S5CMD_PARAMS}" \ - --localCommand="zstd -T!{task.cpus} -c -d !{DOWNLOAD_LINK} | tar -xv " \ - --expectedMD5SUM=!{MD5SUM} - - # Path of the newly downloaded database. A mmseqs2 database consists out of multiple files, - # the unique lookup file is searched to determine the basename of each database. - FILEPATH=$(find ${DATABASE}/!{dbType} -name *.lookup -type f) - # remove extension to get the files basename. - MMSEQS2_DATABASE_DIR=${FILEPATH%.*} - else - # If an extracted database is present use that path. - MMSEQS2_DATABASE_DIR="!{EXTRACTED_DB}" - fi - MMSEQS_HEADER="query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,qcov,tcov" - OUTPUT_TMP_TSV="!{sample}_!{binType}.!{dbType}.blast.tmp.tsv" - OUTPUT_TSV="!{sample}_!{binType}.!{dbType}.blast.tsv" - - # According to the MMSeqs docs the --split-memory-limit parameter defines the RAM usage for *about* 80 percent of the total RAM consumption. - # We set the ram limit parameter to 75 percent of the total available RAM to make sure to not run into out of memory errors. - RAM_LIMIT="$(awk -v RATIO=75 -v RAM=$(echo !{task.memory} | cut -f 1 -d ' ') 'BEGIN { print int(RAM / 100 * RATIO) }')G" - - mkdir tmp - # Only mmseqs2 databases can be used for every kind of search. Inputs have to be converted first. - mmseqs createdb !{fasta} queryDB - # Load all indices into memory to increase searching speed - mmseqs touchdb --threads !{task.cpus} queryDB - mmseqs touchdb --threads !{task.cpus} ${MMSEQS2_DATABASE_DIR} - mmseqs search queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.results.database tmp !{parameters} \ - --threads !{task.cpus} --split-memory-limit ${RAM_LIMIT} - # mmseqs2 searches produce output databases. These have to be converted to a more useful format. The blast -outfmt 6 in this case. 
- mmseqs convertalis queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.results.database ${OUTPUT_TMP_TSV} \ - --threads !{task.cpus} --format-output ${MMSEQS_HEADER} - - # Try only if the output file is not empty, csvtk will fail otherwise - if [ -s ${OUTPUT_TMP_TSV} ]; then - # Add header - sed -i "1i $(echo ${MMSEQS_HEADER} | tr ',' '\t')" ${OUTPUT_TMP_TSV} - - # Add BIN_ID and SAMPLE column - csvtk -t -T join -f "locus_tag;query" \ - <(csvtk -t -T concat !{contig2GeneMapping} | csvtk -t -T cut -f SAMPLE,BIN_ID,CONTIG,locus_tag) ${OUTPUT_TMP_TSV} > ${OUTPUT_TSV} - - # use "query" column name instead of "locus_tag" - sed -i -e "1s/locus_tag/query/" ${OUTPUT_TSV} - fi - ''' + ADDITIONAL_COLUMNS = "${columns}".trim() == "" ? "" : ",${columns}" + template("mmseqs2.sh") } /** @@ -175,7 +104,7 @@ process pMMseqs2_taxonomy { // Therefore this place has to be mounted to the docker container to be accessible during run time. // Another mount flag is used to get a key file (aws format) into the docker-container. // This file is then used by s5cmd. The necessary mount points are generated by “constructParametersObject()”. - containerOptions constructParametersObject("mmseqs2_taxonomy") + containerOptions constructParametersObject("mmseqs2_taxonomy") + " --entrypoint='' " tag "Sample: $sample, Database_taxonomy: $dbType" @@ -252,22 +181,36 @@ process pMMseqs2_taxonomy { MMSEQS2_DATABASE_DIR="!{EXTRACTED_DB}" fi + + # Check which mmseqs version is the right one + MMSEQS_VERSION="" + if grep -q "avx2" /proc/cpuinfo; then + echo "AVX2 supported, using mmseqs_avx2" + MMSEQS_VERSION="mmseqs_avx2" + elif grep -q "sse4_1" /proc/cpuinfo; then + echo "SSE4.1 supported, using mmseqs_sse41" + MMSEQS_VERSION="mmseqs_sse41" + else + echo "Using mmseqs_sse2" + MMSEQS_VERSION="mmseqs_sse2" + fi + mkdir tmp # Only mmseqs2 databases can be used for every kind of search. Inputs have to be converted first. - mmseqs createdb !{fasta} queryDB + $MMSEQS_VERSION createdb !{fasta} queryDB # If the ramMode is set to true, the whole database will be loaded into the RAM. Do not forget to set the MMseqs2 parameter accordingly, --db-load-mode 3. if !{ramMode} then # Load all indices into memory to increase searching speed - mmseqs touchdb --threads !{task.cpus} queryDB - mmseqs touchdb --threads !{task.cpus} ${MMSEQS2_DATABASE_DIR} + $MMSEQS_VERSION touchdb --threads !{task.cpus} queryDB + $MMSEQS_VERSION touchdb --threads !{task.cpus} ${MMSEQS2_DATABASE_DIR} fi # Define taxonomies - mmseqs taxonomy queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database tmp !{parameters} --start-sens 1 --sens-steps 1 -s !{sensitivity} --threads !{task.cpus} + $MMSEQS_VERSION taxonomy queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database tmp !{parameters} -s !{sensitivity} --threads !{task.cpus} # mmseqs2 searches produce output databases. These have to be converted to more useful formats. 
- mmseqs createtsv queryDB !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.taxonomy.tsv --threads !{task.cpus} - mmseqs taxonomyreport ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.krakenStyleTaxonomy.out - mmseqs taxonomyreport ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.krona.html --report-mode 1 + $MMSEQS_VERSION createtsv queryDB !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.taxonomy.tsv --threads !{task.cpus} + $MMSEQS_VERSION taxonomyreport ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.krakenStyleTaxonomy.out + $MMSEQS_VERSION taxonomyreport ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.taxresults.database !{sample}_!{binType}.!{dbType}.krona.html --report-mode 1 ''' } @@ -507,10 +450,10 @@ process pKEGGFromBlast { secret { "${S3_KEGG_ACCESS}"!="" ? ["S3_kegg_ACCESS", "S3_kegg_SECRET"] : [] } input: - tuple val(sample), val(binType), file(blast_result) + tuple val(sample), val(binType), file(blastResult) output: - tuple val("${sample}"), path("${sample}_${binType}_keggPaths.tsv"), emit: kegg_paths + tuple val("${sample}"), path("*.keggPaths.tsv"), emit: keggPaths tuple val("${sample}_${binType}"), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -553,7 +496,7 @@ process pKEGGFromBlast { else KEGG_DB="!{EXTRACTED_DB}" fi - blast2kegg.py !{blast_result} ${KEGG_DB} !{sample}_!{binType}_keggPaths.tsv + blast2kegg.py !{blastResult} ${KEGG_DB} !{blastResult.baseName}.keggPaths.tsv ''' } @@ -722,6 +665,98 @@ workflow wAnnotateList { proteins = _wAnnotation.out.prokka_faa } +/** +* +* The pCount process is designed to count the number of sequences in a given FASTA file. +* It uses the seqkit tool to calculate the sequence statistics of a FASTA file, specifically extracting the sequence count. +* +**/ + +process pCount { + + label 'tiny' + + tag "Sample: $sample, BinID: $binID" + + container "${params.ubuntu_image}" + + input: + tuple val(sample), val(binID), path(fasta), path(metadata) + + output: + tuple val("${sample}"), val("${binID}"), path(fasta), path(metadata), env(COUNT) + + shell: + ''' + COUNT=$(seqkit stats -T <(cat !{fasta}) | cut -d$'\t' -f 4 | tail -n 1) + ''' +} + +/** +* +* The pCollectFile process is designed to collect and concatenate BLAST output files. +* It uses the csvtk tool to concatenate all the .tsv files and then splits them based on the BIN_ID. 
+* +**/ +process pCollectFile { + + label 'small' + + tag "Sample: $sample, Database: $dbType" + + container "${params.ubuntu_image}" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "mmseqs2/${dbType}", filename) }, \ + pattern: "{**.blast.tsv}" + + input: + tuple val(dbType), val(sample), val(type), path(blastOutputs) + + output: + tuple val("${sample}"), val("${type}"), val("${dbType}"), path("*.blast.tsv", arity: "1..*") + + shell: + ''' + mkdir tmp + csvtk -t -T concat *.tsv > tmp/concat.tsv + + # Column number of of BIN_ID column + COLUMN_NUMBER=$(sed 1q tmp/concat.tsv | tr -s '\t' '\n' | nl | grep -E 'BIN_ID' | cut -f 1 | tr -d ' ') + for bin in $(csvtk cut -t -T -f BIN_ID tmp/concat.tsv | tail -n +2 | sort | uniq); do + FILE_NAME="${bin}_!{sample}_!{type}.!{dbType}.blast.tsv" + head -n 1 tmp/concat.tsv > ${FILE_NAME} + awk -F$'\t' -v filter=${bin} -v column=${COLUMN_NUMBER} ' $column == filter ' tmp/concat.tsv >> ${FILE_NAME} + done + ''' +} + + +/* +* +* This method splits the input file in chunks. +* +*/ +workflow _wSplit { + take: + input + chunkSize + main: + SAMPLE_IDX = 0 + FILE_IDX = 1 + FILE_2_IDX = 2 + FILE_3_IDX = 3 + COUNT_IDX=4 + CHUNK_SIZE_IDX=5 + ID_PLACEHOLDER = "" + + input | map { dataset -> [dataset[SAMPLE_IDX].toString(), ID_PLACEHOLDER, dataset[FILE_IDX], dataset[FILE_2_IDX]] } \ + | pCount | combine(chunkSize) | flatMap { sample -> \ + Utils.splitFilesIndex(Integer.parseInt(sample[COUNT_IDX]), sample[CHUNK_SIZE_IDX], [sample[SAMPLE_IDX], sample[FILE_2_IDX], sample[FILE_3_IDX]]) } \ + | set { chunks } + + emit: + chunks +} /** * @@ -749,8 +784,9 @@ workflow _wAnnotation { pProkka(prodigalMode, _wCreateProkkaInput.out.prokkaInput | combine(contigCoverage, by: SAMPLE_IDX)) // Collect all databases - selectedDBs = params?.steps?.annotation?.mmseqs2.findAll().collect({ - [it.key, it.value?.params ?: "", \ + selectedDBs = params?.steps?.annotation?.mmseqs2.findAll({ it.key != "chunkSize" }).collect({ + [it.key, it.value?.additionalParams?.search ?: "", \ + it.value?.additionalParams?.additionalColumns ?: "", \ it.value?.database?.extractedDBPath ?: "", \ it.value.database?.download?.source ?: "", \ it.value.database?.download?.md5sum ?: "", \ @@ -766,7 +802,6 @@ workflow _wAnnotation { it.value.database?.download?.md5sum ?: "", \ it.value.database?.download?.s5cmd?.params ?: "" ] }) - PATH_IDX=2 // The following groupTuple operators need a counter for the expected number of bins or fasta files of a sample. 
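The `_wSplit` workflow above feeds the sequence count from `pCount` and the configured `chunkSize` into the new `Utils.splitFilesIndex` helper. As a minimal sketch of what that helper returns — the sample tuple and the counts below are purely illustrative, assuming `lib/Utils.groovy` is on the classpath:

    // Illustrative only: 25000 predicted proteins split into chunks of 10000.
    def chunks = Utils.splitFilesIndex(25000, 10000, ["SAMPLE_A", "genes.faa", "gene2contig.tsv"])
    // => [["SAMPLE_A", "genes.faa", "gene2contig.tsv", 1,     10000, 3],
    //     ["SAMPLE_A", "genes.faa", "gene2contig.tsv", 10001, 20000, 3],
    //     ["SAMPLE_A", "genes.faa", "gene2contig.tsv", 20001, 25000, 3]]
    // Each tuple carries the start index, the (clipped) end index and the total number of
    // chunks; that chunk count is what _wAnnotation later uses as the groupKey size when it
    // merges the per-chunk BLAST results back together.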
@@ -779,28 +814,52 @@ workflow _wAnnotation { // Run all amino acid outputs against all databases // Collect by sample name to bundle searches and avoid calls with small input files pProkka.out.faa | map { [it[SAMPLE_IDX], it[PATH_IDX]] } | combine(fastaCounter, by:SAMPLE_IDX) \ - | map { sample, path, size -> tuple( groupKey(sample, size), path ) } | groupTuple() | set { groupedProkkaFaa } + | map { sample, path, size -> tuple( groupKey(sample, size), path ) } | groupTuple() | set { groupedProkkaFaa } + + MMSEQS2_CHUNK_SIZE_DEFAULT=7000 - groupedProkkaFaa | combine(contig2GeneMapping, by: SAMPLE_IDX) | combine(Channel.from(selectedDBs)) | set { combinedMMseqs } + // Split fasta files and run blast on each part in parallel + _wSplit(groupedProkkaFaa | combine(contig2GeneMapping, by: SAMPLE_IDX), \ + Channel.from(params.steps?.annotation?.mmseqs2?.chunkSize?:MMSEQS2_CHUNK_SIZE_DEFAULT)) \ + | combine(Channel.from(selectedDBs)) | set { combinedMMseqs } pMMseqs2(sourceChannel, combinedMMseqs) + + FIRST_ELEM_IDX = 0 + FIRST_ELEM_DB_IDX = 0 + FIRST_ELEM_SAMPLE_IDX = 1 + FIRST_ELEM_TYPE_IDX = 2 + + ELEM_PATH_IDX = 3 + + pMMseqs2.out.blast \ + // An artificial group key is created which consists of the db, sample name and the type of the data (e.g. binned) + | map { db, sample, type, start, stop, chunks, out -> [db + "_-_" + sample + "_-_" + type, db, sample, type, start, stop, chunks, out] } \ + | map { key, db, sample, type, start, stop, chunks, out -> tuple(groupKey(key, chunks.toInteger()), [db, sample, type, out]) } \ + // Based on the artificial group key and the expected number of chunks, the groupTuple operator is executed + | groupTuple() \ + // All values of each entry are then reordered and prepared for the pCollectFile process input + | map { key, dataset -> [dataset[FIRST_ELEM_IDX][FIRST_ELEM_DB_IDX], dataset[FIRST_ELEM_IDX][FIRST_ELEM_SAMPLE_IDX], \ + dataset[FIRST_ELEM_IDX][FIRST_ELEM_TYPE_IDX], dataset.stream().map{ elem -> elem[ELEM_PATH_IDX] }.collect()] } \ + | pCollectFile \ + // The kegg database result is selected and reordered for the pKEGGFromBlast process input + | filter({ sample, type, dbType, blastFiles -> dbType == "kegg" }) \ + | flatMap({ sample, type, dbType, blastFiles -> blastFiles.stream().map({ fi -> [sample, type, fi] }).collect() }) \ + | pKEGGFromBlast + combinedMMseqsTax = groupedProkkaFaa | combine(Channel.from(selectedTaxDBs)) pMMseqs2_taxonomy(sourceChannel, combinedMMseqsTax) DB_TYPE_IDX = 0 - pMMseqs2.out.blast | filter({ result -> result[DB_TYPE_IDX] == "kegg" }) \ - | map({ result -> result.remove(0); result }) \ - | set { mmseqs2Results } // Run Resistance Gene Identifier with amino acid outputs pProkka.out.faa | pResistanceGeneIdentifier - pKEGGFromBlast(mmseqs2Results) pProkka.out.logs | mix(pMMseqs2.out.logs) \ | mix(pMMseqs2_taxonomy.out.logs) \ | mix(pResistanceGeneIdentifier.out.logs) \ | mix(pKEGGFromBlast.out.logs) | pDumpLogs emit: - keggAnnotation = pKEGGFromBlast.out.kegg_paths + keggAnnotation = pKEGGFromBlast.out.keggPaths mmseqs2_kronaHtml = pMMseqs2_taxonomy.out.kronaHtml mmseqs2_krakenTaxonomy = pMMseqs2_taxonomy.out.krakenStyleTaxonomy mmseqs2_taxonomy = pMMseqs2_taxonomy.out.taxonomy diff --git a/modules/plasmids/module.nf b/modules/plasmids/module.nf index 71dc8da5..1739d61f 100644 --- a/modules/plasmids/module.nf +++ b/modules/plasmids/module.nf @@ -138,31 +138,6 @@ process pCount { ''' } -/* -* This method takes number of entries in a input file (e.g. 
fata entries in multi fasta file), -* the maximum number of allowed entries per chunk and the actual input (e.g. file). -* It creates a list of indices of chunks of the input file based on the input parameters. -*/ -def splitFilesIndex(seqCount, chunkSize, sample){ - int chunk=seqCount.intdiv(chunkSize) - if(seqCount.mod(chunkSize) != 0){ - chunk = chunk + 1 - } - def chunks = [] - for(def n : 1..chunk){ - int start = (n-1) * chunkSize + 1 - - int end = n * chunkSize - - if(end > seqCount){ - end=seqCount - } - chunks.add(sample + [start, end]) - } - - return chunks -} - /* * @@ -180,8 +155,8 @@ workflow _wSplit { COUNT_IDX=3 CHUNK_SIZE_IDX=4 input | pCount | combine(chunkSize) | flatMap { sample -> \ - splitFilesIndex(Integer.parseInt(sample[COUNT_IDX]), sample[CHUNK_SIZE_IDX], [sample[SAMPLE_IDX], sample[BIN_ID_IDX], sample[FILE_IDX]]) } \ - | set { chunks } + Utils.splitFilesIndex(Integer.parseInt(sample[COUNT_IDX]), sample[CHUNK_SIZE_IDX], [sample[SAMPLE_IDX], sample[BIN_ID_IDX], sample[FILE_IDX]]) } \ + | map({ sample, binID, binFile, start, end, chunkSize -> [sample, binID, binFile, start, end] }) | set { chunks } emit: chunks } diff --git a/nextflow.config b/nextflow.config index f5bc96db..49dbfc91 100644 --- a/nextflow.config +++ b/nextflow.config @@ -302,7 +302,7 @@ params { PlasClass_image = "quay.io/biocontainers/plasclass:0.1.1--pyhdfd78af_0" rgi_image = "quay.io/biocontainers/rgi:6.0.1--pyha8f3691_1" sans_image = "quay.io/metagenomics/toolkit-sans:0.1.0" - mmseqs2_image = "quay.io/microbiome-informatics/mmseqs:2.13" + mmseqs2_image = "quay.io/metagenomics/toolkit-mmseqs2:15-6f452-0" magscot_image = "quay.io/metagenomics/toolkit-magscot:1.0.0" kmc_image = "quay.io/biocontainers/kmc:3.2.1--hf1761c0_2" diff --git a/templates/mmseqs2.sh b/templates/mmseqs2.sh new file mode 100644 index 00000000..a0cc82c1 --- /dev/null +++ b/templates/mmseqs2.sh @@ -0,0 +1,88 @@ +mkdir -p !{params.polished.databases} +# if no local database is referenced, start the download part +if [ -z "!{EXTRACTED_DB}" ] +then +# polished params end with a / so no additional one is needed at the end + DATABASE=!{params.polished.databases}mmseqs2 + mkdir -p ${DATABASE}/!{dbType} + LOCK_FILE=${DATABASE}/!{dbType}/lock.txt + + # Check if access and secret keys are necessary for s5cmd + if [ ! -z "!{S3_DB_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{S3_DB_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{S3_DB_SECRET} + fi + + # Create and try to lock the given “LOCK_FILE”. If the file is locked no other process will download the same database simultaneously + # and wait until the database is downloaded. Depending on your database path (starts with s3:// | https:// | /../../.. ) + # one of the flock input commands is executed to download and decompress the database. + # If a database is present at the given path, checksums are compared. If they are identical, the download will be omitted. + flock ${LOCK_FILE} concurrentDownload.sh --output=${DATABASE}/!{dbType} \ + --link=!{DOWNLOAD_LINK} \ + --httpsCommand="wget -qO- !{DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar -xv " \ + --s3FileCommand="s5cmd !{S5CMD_PARAMS} cat --concurrency !{task.cpus} !{DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar -xv " \ + --s3DirectoryCommand="s5cmd !{S5CMD_PARAMS} cp --concurrency !{task.cpus} !{DOWNLOAD_LINK} . " \ + --s5cmdAdditionalParams="!{S5CMD_PARAMS}" \ + --localCommand="zstd -T!{task.cpus} -c -d !{DOWNLOAD_LINK} | tar -xv " \ + --expectedMD5SUM=!{MD5SUM} + + # Path of the newly downloaded database. 
A mmseqs2 database consists of multiple files, + # the unique lookup file is searched to determine the basename of each database. + FILEPATH=$(find ${DATABASE}/!{dbType} -name *.lookup -type f) + # remove the extension to get the files basename. + MMSEQS2_DATABASE_DIR=${FILEPATH%.*} +else + # If an extracted database is present use that path. + MMSEQS2_DATABASE_DIR="!{EXTRACTED_DB}" +fi + +# Check which mmseqs version is the right one +MMSEQS_VERSION="" +if grep -q "avx2" /proc/cpuinfo; then + echo "AVX2 supported, using mmseqs_avx2" + MMSEQS_VERSION="mmseqs_avx2" +elif grep -q "sse4_1" /proc/cpuinfo; then + echo "SSE4.1 supported, using mmseqs_sse41" + MMSEQS_VERSION="mmseqs_sse41" +else + echo "Using mmseqs_sse2" + MMSEQS_VERSION="mmseqs_sse2" +fi + +MMSEQS_HEADER="query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,qcov,tcov!{ADDITIONAL_COLUMNS}" +OUTPUT_TMP_TSV="!{sample}_!{binType}.!{start}.!{stop}.!{dbType}.blast.tmp.tsv" +OUTPUT_TSV="!{sample}_!{binType}.!{start}.!{stop}.!{dbType}.blast.tsv" + +# According to the MMSeqs docs the --split-memory-limit parameter defines the RAM usage for *about* 80 percent of the total RAM consumption. +RAM_LIMIT="$(awk -v RATIO=80 -v RAM=$(echo !{task.memory} | cut -f 1 -d ' ') 'BEGIN { print int(RAM / 100 * RATIO) }')G" + +mkdir tmp + +# Create new input file +cat !{fasta} | seqkit range -r !{start}:!{stop} > input.fa + +# Only mmseqs2 databases can be used for every kind of search. Inputs have to be converted first. +$MMSEQS_VERSION createdb input.fa queryDB +# Load all indices into memory to increase searching speed +$MMSEQS_VERSION touchdb --threads !{task.cpus} queryDB +$MMSEQS_VERSION touchdb --threads !{task.cpus} ${MMSEQS2_DATABASE_DIR} +$MMSEQS_VERSION search queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.results.database tmp !{parameters} \ + --threads !{task.cpus} --split-memory-limit ${RAM_LIMIT} +# mmseqs2 searches produce output databases. These have to be converted to a more useful format. The blast -outfmt 6 in this case. +$MMSEQS_VERSION convertalis queryDB ${MMSEQS2_DATABASE_DIR} !{sample}_!{binType}.!{dbType}.results.database ${OUTPUT_TMP_TSV} \ + --threads !{task.cpus} --format-output ${MMSEQS_HEADER} + +# Try only if the output file is not empty, csvtk will fail otherwise +if [ -s ${OUTPUT_TMP_TSV} ]; then + # Add header + sed -i "1i $(echo ${MMSEQS_HEADER} | tr ',' '\t')" ${OUTPUT_TMP_TSV} + + # Add BIN_ID and SAMPLE column + csvtk -t -T join -f "locus_tag;query" \ + <(csvtk -t -T concat !{contig2GeneMapping} | csvtk -t -T cut -f SAMPLE,BIN_ID,CONTIG,locus_tag) ${OUTPUT_TMP_TSV} > ${OUTPUT_TSV} + + # use "query" column name instead of "locus_tag" + sed -i -e "1s/locus_tag/query/" ${OUTPUT_TSV} +fi +
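A note on how the per-chunk `pMMseqs2` results are merged again: every chunk emits the total number of chunks of its sample, and `_wAnnotation` uses that number as the size of a `groupKey`, so `groupTuple` can close a database/sample/type group as soon as all of its chunks have arrived instead of blocking until the channel is exhausted. A reduced, self-contained sketch of the same pattern — the tuple values and file names are hypothetical, not actual pipeline output:

    workflow {
        // Artificial key per db/sample/type, sized with the expected number of chunks;
        // this mirrors the pMMseqs2.out.blast handling in _wAnnotation.
        Channel.of(
                ["uniref90", "SAMPLE_A", "binned", 1,     10000, 3, file("chunk1.blast.tsv")],
                ["uniref90", "SAMPLE_A", "binned", 10001, 20000, 3, file("chunk2.blast.tsv")],
                ["uniref90", "SAMPLE_A", "binned", 20001, 25000, 3, file("chunk3.blast.tsv")]) \
          | map { db, sample, type, start, stop, chunks, out -> \
              tuple(groupKey(db + "_-_" + sample + "_-_" + type, chunks.toInteger()), [db, sample, type, out]) } \
          | groupTuple() \
          | view()   // released once all 3 chunks have been seen for this key
    }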
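The new `additionalColumns` option simply extends the fixed BLAST-style column list: `pMMseqs2` turns it into `ADDITIONAL_COLUMNS` (prefixed with a comma when non-empty) and `templates/mmseqs2.sh` appends it to `MMSEQS_HEADER` before `convertalis` is called. A small Groovy sketch of that guard; the extra column names are only examples:

    // Mirrors the ADDITIONAL_COLUMNS handling in pMMseqs2 (qheader/theader are example columns).
    def baseHeader = "query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,qcov,tcov"
    def columns    = "qheader,theader"                          // value of additionalParams.additionalColumns
    def additional = columns.trim() == "" ? "" : ",${columns}"  // an empty setting leaves the default header untouched
    assert (baseHeader + additional).endsWith("tcov,qheader,theader")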