Skip to content

Commit

Permalink
feat(annotation): mmseqs speedup (#348)
Browse files Browse the repository at this point in the history
* feat(annotation): chunk input fasta for mmseqs jobs

* feat(config): add mmseqs2 chunkSize parameter

* fix(annotation): adjust output file pattern

* fix(annotation): use correct chunkSize parameter

* fix(annotation): remove header

* fix(annotation): set default value for chunkSize

* feat(annotation): use newest mmseqs version with increased chunk size

* fix(annotation): add entrypoint for mmseqs annotation

* feat(annotation): introduce uniref90

* fix(annotation): add additional params to mmseqs section

* feat(annotation): mmseqs output is collected and split by bin id

* fix(annotation): use correct cpu instruction set in mmseqs taxonomy

* fix(annotation): remove deprecated parameter

* fix(annotation): provide more RAM to collect process

* fix(annotation): set correct arity for Kegg output channel

* fix(plasmids): filter out chunkSize parameter

* fix(plasmids): use correct input parameter for map operator

* fix(tests): use uniref90 instead of nr

* fix(annotation): fix typos

Co-authored-by: bosterholz <[email protected]>

* fix(annotation): fix more typos

Co-authored-by: bosterholz <[email protected]>

* doc(annotation): describe the collection part of the data chunking

---------

Co-authored-by: bosterholz <[email protected]>
  • Loading branch information
pbelmann and bosterholz authored Mar 4, 2024
1 parent 7605f0f commit 95b5d6a
Show file tree
Hide file tree
Showing 16 changed files with 366 additions and 161 deletions.
1 change: 1 addition & 0 deletions .github/workflows/workflow_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
- name: Test EMGB import tools
run: |
./bin/emgb.sh --output=output/test1 --runid=1 --binsdir=$(find output/test1/ -name "metabat") \
--blastdb=uniref90 \
--db=${EMGB_KEGG_DB} \
--workdir="${WORK_DIR}_wFullPipeline" --name=test1
Expand Down
9 changes: 7 additions & 2 deletions bin/emgb.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
set -e

VERSION=0.3.1
VERSION=0.4.0

while [ $# -gt 0 ]; do
case "$1" in
Expand All @@ -19,6 +19,8 @@ while [ $# -gt 0 ]; do
;;
--type=*) TYPE="${1#*=}"
;;
--blastdb=*) BLAST_DB="${1#*=}"
;;
--version) VERSION_CHECK=1
;;
--debug) DEBUG_CHECK=1
Expand All @@ -38,7 +40,7 @@ done


function getGenes {
nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.ncbi_nr.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g')
nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.${BLAST_DB}.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g')
tax=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.taxonomy.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g')
ffn=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.ffn.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g')
gff=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.gff.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g')
Expand Down Expand Up @@ -103,6 +105,9 @@ help()
echo " -- (e.g. X in the following example path fullPipelineOutput/SAMPLE/X/binning/) "
echo " --binsdir -- directory of bins. If bin refinement was executed then the bin refinement output should be used."
echo " -- (e.g. --binsdir=fullPipelineOutput/DRR066656/1/binning/0.4.0/metabat)"
echo " --blastdb -- Blast output that should be exported to emgb"
echo " -- (e.g. the folder name of BLAST_DB: output/test1/1/annotation/0.3.0/mmseqs2/BLAST_DB)"
echo " -- (Examples: bacmet20_predicted, ncbi_nr)"
echo " --db -- emgb specific kegg database"
echo " --name -- sample name, e.g. the SAMPLE in the paths above"
echo " --type -- if other then Illumina: ONT/Hybrid"
Expand Down
27 changes: 19 additions & 8 deletions default/fullPipeline_illumina_nanpore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -229,38 +229,49 @@ steps:
defaultKingdom: false
additionalParams: " --mincontiglen 500 "
mmseqs2:
chunkSize: 20000
kegg:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: s3://databases_internal/kegg-mirror-2021-01_mmseqs.tar.zst
md5sum: 0d20db97b3e7ee6571ca1fd5ad3a87f1
s5cmd:
params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080'
vfdb:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/vfdb_full_2022_07_29.tar.zst
md5sum: 7e32aaed112d6e056fb8764b637bf49e
bacmet20_experimental:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
md5sum: 57a6d328486f0acd63f7e984f739e8fe
bacmet20_predicted:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
md5sum: 55902401a765fc460c09994d839d9b64
ncbi_nr:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
uniref90:
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/nr_2023-04-29_mmseqs_taxonomy.tar.zst
md5sum: 79b9fb6b3dada41e602d70e12e7351c2
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90_20231108_mmseqs.tar.zst
md5sum: 313f2c031361091af2d5f3c6f6f46013
rgi:
# --include_loose includes matches of more distant homologs of AMR genes which may also report spurious partial matches
# --include_nudge Partial ORFs may not pass curated bitscore cut-offs or novel samples may contain divergent alleles, so nudging
Expand Down
26 changes: 18 additions & 8 deletions default/fullPipeline_illumina_nanpore_without_aggregate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,37 +140,47 @@ steps:
additionalParams: " --mincontiglen 500 "
mmseqs2:
kegg:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: s3://databases_internal/kegg-mirror-2021-01_mmseqs.tar.zst
md5sum: 0d20db97b3e7ee6571ca1fd5ad3a87f1
s5cmd:
params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080'
vfdb:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/vfdb_full_2022_07_29.tar.zst
md5sum: 7e32aaed112d6e056fb8764b637bf49e
bacmet20_experimental:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
md5sum: 57a6d328486f0acd63f7e984f739e8fe
bacmet20_predicted:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
md5sum: 55902401a765fc460c09994d839d9b64
ncbi_nr:
params: ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
uniref90:
additionalParams:
search : ' --max-seqs 300 --max-accept 50 -c 0.8 --cov-mode 0 --start-sens 4 --sens-steps 1 -s 6 --num-iterations 2 -e 0.001 --e-profile 0.01 --db-load-mode 3 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/nr_2023-04-29_mmseqs_taxonomy.tar.zst
md5sum: 79b9fb6b3dada41e602d70e12e7351c2
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90_20231108_mmseqs.tar.zst
md5sum: 313f2c031361091af2d5f3c6f6f46013
mmseqs2_taxonomy:
# Run taxonomy classification on MAGs and unbinable contigs or just the latter
runOnMAGs: true
Expand Down
2 changes: 2 additions & 0 deletions docker/toolkit-mmseqs2/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# MMseqs2 base image, pinned to release 15-6f452.
FROM ghcr.io/soedinglab/mmseqs2:15-6f452
# Install procps on top of the base image.
# apt-get is used instead of apt because apt's command line interface is not
# intended for scripted use; recommended packages are skipped and the package
# lists are removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends procps \
    && rm -rf /var/lib/apt/lists/*
1 change: 1 addition & 0 deletions docker/toolkit-mmseqs2/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
15-6f452-0
10 changes: 8 additions & 2 deletions example_params/annotation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@ steps:
annotation:
input: "test_data/annotation/input_small.tsv"
mmseqs2:
chunkSize: 20000
kegg:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""
database:
extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB'
# bacmet20_experimental:
Expand All @@ -22,11 +25,14 @@ steps:
# source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
# md5sum: 57a6d328486f0acd63f7e984f739e8fe
bacmet20_predicted:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
md5sum: 55902401a765fc460c09994d839d9b64
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""

# vfdb:
# params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
# database:
Expand Down
22 changes: 16 additions & 6 deletions example_params/fullPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,11 @@ steps:
defaultKingdom: false
additionalParams: " --mincontiglen 200 "
mmseqs2:
chunkSize: 20000
kegg:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""
database:
extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB'

Expand All @@ -183,16 +186,23 @@ steps:
# download:
# source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
# md5sum: 57a6d328486f0acd63f7e984f739e8fe
ncbi_nr:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
uniref90:
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""


database:
download:
source: s3://databases/nr_2023-04-29_mmseqs_taxonomy/*
md5sum: 79b9fb6b3dada41e602d70e12e7351c2
source: s3://databases/uniref90_20231108_mmseqs/*
md5sum: 313f2c031361091af2d5f3c6f6f46013
s5cmd:
params: '--retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080'

bacmet20_predicted:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
Expand Down
9 changes: 7 additions & 2 deletions example_params/fullPipelineIlluminaOrONT.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ steps:
defaultKingdom: false
additionalParams: " --mincontiglen 200 "
mmseqs2:
chunkSize: 20000
kegg:
params: ' -s 2 --exact-kmer-matching 1 '
additionalParams:
search : ' -s 2 --exact-kmer-matching 1 '
additionalColumns: ""
database:
extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB'
# vfdb:
Expand All @@ -166,7 +169,9 @@ steps:
# source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
# md5sum: 57a6d328486f0acd63f7e984f739e8fe
bacmet20_predicted:
params: ' -s 2 --exact-kmer-matching 1 '
additionalParams:
search : ' -s 2 --exact-kmer-matching 1 '
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
Expand Down
5 changes: 4 additions & 1 deletion example_params/fullPipelineONT.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ steps:
defaultKingdom: false
additionalParams: " --mincontiglen 200 "
mmseqs2:
chunkSize: 20000
# vfdb:
# params: ' -s 2 --exact-kmer-matching 1 '
# database:
Expand All @@ -162,7 +163,9 @@ steps:
# source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
# md5sum: 57a6d328486f0acd63f7e984f739e8fe
bacmet20_predicted:
params: ' -s 2 --exact-kmer-matching 1 '
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""
database:
download:
source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_predicted.tar.zst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@ steps:
kmerSize: 4
annotation:
mmseqs2:
chunkSize: 7000
kegg:
params: ' -s 2 --exact-kmer-matching 1 '
additionalParams:
search : ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
additionalColumns: ""
database:
extractedDBPath: '/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs/sequenceDB'
resources:
Expand Down
26 changes: 26 additions & 0 deletions lib/Utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Utils {
}
}
}

static Object[] flattenTuple(tupl){
def chunkList = [];
def SAMPLE_IDX = 0;
Expand All @@ -106,6 +107,31 @@ class Utils {
return chunkList;
}

/*
 * Computes index ranges that split the entries of an input (e.g. the fasta entries
 * of a multi-fasta file) into consecutive chunks.
 *
 * seqCount  - total number of entries in the input
 * chunkSize - maximum number of entries allowed per chunk; must be greater than zero
 * sample    - list describing the actual input (e.g. the file); every returned element
 *             is this list extended by the chunk indices
 *
 * Returns a list with one element per chunk of the form
 *   sample + [start, end, numberOfChunks]
 * where start and end are 1-based, inclusive entry indices and numberOfChunks is the
 * total number of chunks. An empty list is returned if seqCount is zero or negative.
 *
 * Throws IllegalArgumentException if chunkSize is not greater than zero.
 */
static List splitFilesIndex(seqCount, chunkSize, sample){
    if(chunkSize <= 0){
        throw new IllegalArgumentException("chunkSize must be greater than zero, got: " + chunkSize)
    }

    def chunks = []

    // Nothing to split. Without this guard the chunk count would be 0 and a Groovy
    // range 1..0 would iterate in reverse (1, 0), producing two bogus chunks.
    if(seqCount <= 0){
        return chunks
    }

    // Number of chunks = ceil(seqCount / chunkSize)
    int chunk = seqCount.intdiv(chunkSize)
    if(seqCount.mod(chunkSize) != 0){
        chunk = chunk + 1
    }

    for(int n = 1; n <= chunk; n++){
        int start = (n-1) * chunkSize + 1
        int end = n * chunkSize

        // The last chunk may hold fewer than chunkSize entries
        if(end > seqCount){
            end = seqCount
        }
        chunks.add(sample + [start, end, chunk])
    }
    return chunks
}


static getMappingIdentityParam(medianQuality) {
if(medianQuality > 17){
return 97
Expand Down
Loading

0 comments on commit 95b5d6a

Please sign in to comment.