diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875f..40acc23f 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.nf-core.yml b/.nf-core.yml index 4c14d399..cb4134c5 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,5 @@ repository_type: pipeline +nf_core_version: "2.14.1" lint: files_exist: - CODE_OF_CONDUCT.md diff --git a/CHANGELOG.md b/CHANGELOG.md index f1bb6ee1..644393ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,24 +3,54 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.1.2 - [2024-05-02] +## v0.2.0 - [2024-05-14] + +### `Added` + +- Updated documentation for params. See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +- Fixed param typos in schema, config and docs. See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +- Added parameter to skip length filtering of sequences. See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +- Added locidex for allele calling. See [PR 62](https://github.com/phac-nml/mikrokondo/pull/62) + +- Updated directory output structure and names. See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +- Added tests for Kraken2 contig binning. See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +### `Fixed` + +- If you select to filter contigs by length, those contigs will now be used for subsequent analysis. 
See [PR 66](https://github.com/phac-nml/mikrokondo/pull/66) + +- Matched ECTyper and SISTR parameters to what is set in the current IRIDA. See [PR 68](https://github.com/phac-nml/mikrokondo/pull/68) + +- Updated StarAMR point finder DB selection to resolve an error in db selection when a database is not selected, addressing the reported issue. See [PR 74](https://github.com/phac-nml/mikrokondo/pull/74) -### Added +- Fixed calculation of SeqtkBaseCount value to include counts for both pairs of paired-end reads. See [PR 65](https://github.com/phac-nml/mikrokondo/pull/65). + +### `Changed` + +- Changed the specific files and metadata to store within IRIDA Next. See [PR 65](https://github.com/phac-nml/mikrokondo/pull/65) + +- Added separate report fields for (PASSED|FAILED|WARNING) values and for the actual value. See [PR 65](https://github.com/phac-nml/mikrokondo/pull/65) + +- Updated StarAMR to version 0.10.0. See [PR 74](https://github.com/phac-nml/mikrokondo/pull/74) + +## v0.1.2 - [2024-05-02] ### Changed -- Changed default values for database parameters `--dehosting_idx`, `--mash_sketch`, `--kraken2_db`, and `--bakta_db` to null. -- Enabled checking for existance of database files in JSON Schema to avoid issues with staging non-existent files in Azure. -- Set `--kraken2_db` to be a required parameter for the pipeline. -- Hide bakta parameters from IRIDA Next UI. +- Changed default values for database parameters `--dehosting_idx`, `--mash_sketch`, `--kraken2_db`, and `--bakta_db` to null. See [PR 71](https://github.com/phac-nml/mikrokondo/pull/71) +- Enabled checking for existence of database files in JSON Schema to avoid issues with staging non-existent files in Azure. See [PR 71](https://github.com/phac-nml/mikrokondo/pull/71). +- Set `--kraken2_db` to be a required parameter for the pipeline. See [PR 71](https://github.com/phac-nml/mikrokondo/pull/71) +- Hide bakta parameters from IRIDA Next UI. 
See [PR 71](https://github.com/phac-nml/mikrokondo/pull/71) ## v0.1.1 - [2024-04-22] -### Added - ### Changed -- Switched the resource labels for **parse_fastp**, **select_pointfinder**, **report**, and **parse_kat** from `process_low` to `process_single` as they are all configured to run on the local Nextflow machine. +- Switched the resource labels for **parse_fastp**, **select_pointfinder**, **report**, and **parse_kat** from `process_low` to `process_single` as they are all configured to run on the local Nextflow machine. See [PR 67](https://github.com/phac-nml/mikrokondo/pull/67) ## v0.1.0 - [2024-03-22] @@ -28,7 +58,7 @@ Initial release of phac-nml/mikrokondo. Mikrokondo currently supports: read trim - Bumped version number to 0.1.0 -- Updated docs to include awesome-page plugin and restructured readme. +- Updated docs to include awesome-page plugin and restructured readme. - Updated coverage defaults for Shigella, Escherichia and Vibrio @@ -49,11 +79,3 @@ Initial release of phac-nml/mikrokondo. Mikrokondo currently supports: read trim - Changed salmonella default default coverage to 40 - Added integration testing using [nf-test](https://www.nf-test.com/). 
- -### `Added` - -### `Fixed` - -### `Dependencies` - -### `Deprecated` diff --git a/README.md b/README.md index 7da1e7ca..9757d21b 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,35 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +- [Introduction](#introduction) + * [What is mikrokondo?](#what-is-mikrokondo-) + * [Is mikrokondo right for me?](#is-mikrokondo-right-for-me-) + * [Citation](#citation) + + [Contact](#contact) +- [Installing mikrokondo](#installing-mikrokondo) + * [Step 1: Installing Nextflow](#step-1--installing-nextflow) + * [Step 2: Choose a Container Engine](#step-2--choose-a-container-engine) + + [Docker or Singularity?](#docker-or-singularity-) + * [Step 3: Install dependencies](#step-3--install-dependencies) + + [Dependencies listed](#dependencies-listed) + * [Step 4: Further resources to download](#step-4--further-resources-to-download) + + [Configuration and settings:](#configuration-and-settings-) +- [Getting Started](#getting-started) + * [Usage](#usage) + + [Data Input/formats](#data-input-formats) + + [Output/Results](#output-results) + * [Run example data](#run-example-data) + * [Testing](#testing) + + [Install nf-test](#install-nf-test) + + [Run tests](#run-tests) + * [Troubleshooting and FAQs:](#troubleshooting-and-faqs-) + * [References](#references) + * [Legal and Compliance Information:](#legal-and-compliance-information-) + * [Updates and Release Notes:](#updates-and-release-notes-) + +Table of contents generated with markdown-toc + + # Introduction ## What is mikrokondo? @@ -127,18 +156,21 @@ For more information see the [useage docs](https://phac-nml.github.io/mikrokondo ### Output/Results -All output files will be written into the `outdir` (specified by the user). More explicit tool results can be found in both the [Workflow](workflows/CleanAssemble/) and [Subworkflow](subworkflows/) sections of the docs. 
Here is a brief description of the outdir structure: - -- **annotations** - dir containing all annotation tool output. -- **assembly** - dir containing all assembly tool related output, including quality, 7 gene MLST and taxon determination. -- **pipeline_info** - dir containing all pipeline related information including software versions used and execution reports. -- **ReadQuality** - dir containing all read tool related output, including contamination, fastq, mash, and subsampled read sets (when present) -- **subtyping** - dir containing all subtyping tool related output, including SISTR, ECtyper, etc. -- **SummaryReport** - dir containing collated results files for all tools, including: - - Individual sample flatted json reports - - **final_report** - All tool results for all samples in both .json (including a flattened version) and .tsv format -- **bco.json** - data providence file generated from the nf-prov plug-in -- **manifest.json** - data providence file generated from the nf-prov plug-in +All output files will be written into the `outdir` (specified by the user). More explicit tool results can be found in both the [Workflow](workflows/CleanAssemble/) and [Subworkflow](subworkflows/) sections of the docs. 
Here is a brief description of the outdir structure (though in brief the further into the structure you head, the further in the workflow the tool has been run): + +- **Assembly** - contains all output files generated as a result of read assembly and tools using assembled contigs as input + - **Annotation** - contains output files generated from tools applying annotation and/or gene characterization from assembled contigs + - **Assembling** - contains output files generated as a part of the assembly process in nested order + - **FinalAssembly** - this directory will always contain the final output contig files from the last step in the assembly process (will take into account any skip flags in the process) + - **PostProcessing** - contains output files from intermediary tools that run after assembly but before annotation takes place in the workflow + - **Quality** - contains all output files generated as a result of quality tools after assembly +- **Subtyping** - contains all output files from workflow subtyping tools, based off assembled contigs +- **FinalReports** - contains assorted reports including aggregated and flat reports +- **pipeline_info** - includes tool versions and other pipeline specific information +- **Reads** - contains all output files generated as a result of read processing and tools using reads as input + - **FinalReads** - this directory will contain the final output read files from the last step in read processing (taking into account any skip flags used in the run) + - **Processing** - contains output files from tools run to process reads in nested order + - **Quality** - contains all output files generated from read quality tools ## Run example data diff --git a/bin/kraken2_bin.py b/bin/kraken2_bin.py index 0c48f26b..bbc40660 100755 --- a/bin/kraken2_bin.py +++ b/bin/kraken2_bin.py @@ -13,6 +13,7 @@ from collections import defaultdict import os import sys +import re kraken2_classifiers = frozenset(["U", "R", "D", "K", "P", "C", "O", "F", 
"G", "S"]) @@ -355,7 +356,7 @@ def write_fastas(self, sequences): """ for k, v in sequences.items(): with open( - f"{k.strip().replace(' ', '_').replace('(', '_').replace(')', '_').replace('.', '_')}_binned.fasta", + "{}.binned.fasta".format(re.sub(r'[^A-Za-z0-9\-_]', '_', k)), "w", encoding="utf8", ) as out_file: diff --git a/conf/irida_next.config b/conf/irida_next.config index 2af2b394..3eb6bd2f 100755 --- a/conf/irida_next.config +++ b/conf/irida_next.config @@ -13,90 +13,201 @@ iridanext { files { idkey = "sample" global = [ - "**/SummaryReport/final_report.json", - "**/SummaryReport/final_report.tsv" + "**/FinalReports/Aggregated/Json/final_report.json", + "**/FinalReports/Aggregated/Tables/final_report.tsv" ] samples = [ - "**/assembly/length_filtered_contigs/*_filtered.fasta.gz", - "**/assembly/quality/quast/*/*.pdf", - "**/assembly/7GeneMLST/*.json", - "**/assembly/taxon_determination/mash/*.taxa.screen", - "**/subtyping/ectyper/*/output.tsv", - "**/subtyping/sistr/*.json", - "**/subtyping/lissero/*.tsv", - "**/annotations/abricate/*.txt", - "**/annotations/mobrecon/*/mobtyper_results.txt", - "**/annotations/bakta/*.gbff", - "**/annotations/bakta/*.txt", - "**/StarAMR/*/summary.tsv", - "**/StarAMR/*/detailed_summary.tsv", - "**/StarAMR/*/results.xlsx", - "**/SummaryReport/*_flat_sample.json.gz", + "**/Assembly/FinalAssembly/*/*.filtered.assembly.fasta.gz", + "**/Assembly/Quality/QUAST/*/*.transposed*.quast.quality.tsv", + "**/Assembly/Quality/SeqKitStats/*.seqkit.stats.summary.tsv", + "**/Assembly/PostProcessing/Speciation/MashScreen/*.taxa.screen.screen", + "**/Reads/Quality/Trimmed/MashScreen/*.reads.screen.screen", + "**/Reads/Quality/Trimmed/FastP/*.fastp.summary.json", + "**/Reads/Quality/RawReadQuality/*.scan.summary.json", + "**/Assembly/Subtyping/ECTyper/*/*.blast_output_alleles.ectyper.subtyping.txt", + "**/Assembly/Subtyping/ECTyper/*/*.ectyper.subtyping.log", + "**/Assembly/Subtyping/ECTyper/*/*.output.ectyper.subtyping.tsv", + 
"**/Assembly/Subtyping/Lissero/*.lissero.subtyping.tsv", + "**/Assembly/Subtyping/SISTR/*.sistr.subtyping.tab", + "**/Assembly/Subtyping/SISTR/*.sistr.allele.subtyping.fasta", + "**/Assembly/Subtyping/SISTR/*.sistr.allele.subtyping.json", + "**/Assembly/Subtyping/SISTR/*.sistr.cgmlst.subtyping.csv", + "**/Assembly/Subtyping/Locidex/Report/*.mlst.subtyping.json.gz", + "**/FinalReports/FlattenedReports/*.flat_sample.json.gz", + "**/Assembly/Annotation/Abricate/*abricate.annotation.txt", + "**/Assembly/Annotation/Mobsuite/Recon/*/*mobtyper_results*.txt", + "**/Assembly/Annotation/Bakta/*.gbff", + "**/Assembly/Annotation/Bakta/*.txt", + "**/Assembly/Annotation/StarAMR/*/*summary*.tsv", + "**/Assembly/Annotation/StarAMR/*/*detailed_summary*.tsv", + "**/Assembly/Annotation/StarAMR/*/*results*.xlsx", ] } metadata { samples { flatten = true - ignore = [ - "QUAST.0.Reference length", - "QUAST.0.Estimated reference length", - "QUAST.0.Reference GC (%)", - "QUAST.0.NG50", - "QUAST.0.NG90", - "QUAST.0.auNG", - "QUAST.0.LG50", - "QUAST.0.LG90", - "QUAST.0.Reference mapped (%)", - "QUAST.0.Reference properly paired (%)", - "QUAST.0.Reference avg. coverage depth", - "QUAST.0.Reference coverage >= 1x (%)", - "QUAST.0.# large blocks misassemblies", - "QUAST.0.# misassemblies", - "QUAST.0.# misassembled contigs", - "QUAST.0.Misassembled contigs length", - "QUAST.0.# local misassemblies", - "QUAST.0.# scaffold gap ext. mis.", - "QUAST.0.# scaffold gap loc. mis.", - "QUAST.0.# structural variations", - "QUAST.0.# possible TEs", - "QUAST.0.# unaligned mis. 
contigs", - "QUAST.0.# unaligned contigs", - "QUAST.0.Unaligned length", - "QUAST.0.Genome fraction (%)", - "QUAST.0.Duplication ratio", - "QUAST.0.Avg contig read support", - "QUAST.0.# mismatches per 100 kbp", - "QUAST.0.# indels per 100 kbp", - "QUAST.0.# genomic features", - "QUAST.0.# operons", - "QUAST.0.Complete BUSCO (%)", - "QUAST.0.Partial BUSCO (%)", - "QUAST.0.# predicted genes (unique)", - "QUAST.0.# predicted genes (>= 0 bp)", - "QUAST.0.# predicted genes (>= 300 bp)", - "QUAST.0.# predicted genes (>= 1500 bp)", - "QUAST.0.# predicted genes (>= 3000 bp)", - "QUAST.0.# predicted rRNA genes", - "QUAST.0.Largest alignment", - "QUAST.0.Total aligned length", - "QUAST.0.NA50", - "QUAST.0.NGA50", - "QUAST.0.NA90", - "QUAST.0.NGA90", - "QUAST.0.auNA", - "QUAST.0.auNGA", - "QUAST.0.LA50", - "QUAST.0.LGA50", - "QUAST.0.LA90", - "QUAST.0.LGA90", - "QUAST.0.K-mer-based compl. (%)", - "QUAST.0.K-mer-based cor. length (%)", - "QUAST.0.K-mer-based mis. length (%)", - "QUAST.0.# k-mer-based misjoins", - "FastP.command" + rename = [ + "QCStatus" : "QC Status", + "QualityAnalysis.checkm_contamination.qc_status" : "Checkm Status", + "QualityAnalysis.checkm_contamination.value" : "Checkm Value", + "QualityAnalysis.average_coverage.qc_status" : "Average Coverage Status", + "QualityAnalysis.average_coverage.value" : "Average Coverage Value", + "QualityAnalysis.n50_value.qc_status" : "n50 Status", + "QualityAnalysis.n50_value.value" : "n50 Value", + "QualityAnalysis.raw_average_quality.qc_status" : "Raw Average Quality Status", + "QualityAnalysis.raw_average_quality.value" : "Raw Average Quality Value", + "QualityAnalysis.length.qc_status" : "Length Status", + "QualityAnalysis.length.value" : "Length Value", + "QualityAnalysis.nr_contigs.qc_status" : "nr contigs Status", + "QualityAnalysis.nr_contigs.value" : "nr contigs Value", + "QCSummary" : "QC Summary", + "meta.downsampled" : "Downsampled", + "SpeciesTopHit" : "Species", + "ECTyperSubtyping.0.Database" : "ECTyper 
Database", + "ECTyperSubtyping.0.Evidence" : "ECTyper Evidence", + "ECTyperSubtyping.0.GeneCoverages(%)" : "ECTyper GeneCoverages (%)", + "ECTyperSubtyping.0.GeneIdentities(%)" : "ECTyper GeneIdentities (%)", + "ECTyperSubtyping.0.GeneScores" : "ECTyper GeneScores", + "ECTyperSubtyping.0.H-type" : "ECTyper H-Antigen", + "ECTyperSubtyping.0.O-type" : "ECTyper O-Antigen", + "ECTyperSubtyping.0.QC" : "ECTyper QCFlag", + "ECTyperSubtyping.0.Serotype" : "ECTyper Serotype", + "ECTyperSubtyping.0.Species" : "ECTyper Subtyping", + "ECTyperSubtyping.0.Warnings" : "ECTyper Warnings", + "LISSEROSubtyping.0.SEROTYPE" : "LISSERO Serotype", + "QUAST.0.GC (%)" : "GC (%)", + "RawReadSummary.R1.mean_sequence_length" : "Mean Sequence Length Forward", + "SISTRSubtyping.0.cgmlst_ST" : "SISTR cgMLST ST", + "SISTRSubtyping.0.cgmlst_found_loci" : "SISTR cgMLST Found Loci", + "SISTRSubtyping.0.cgmlst_genome_match" : "SISTR cgMLST Genome Match", + "SISTRSubtyping.0.cgmlst_matching_alleles" : "SISTR cgMLST Matching Alleles", + "SISTRSubtyping.0.cgmlst_subspecies" : "SISTR cgMLST Subspecies", + "SISTRSubtyping.0.h1" : "SISTR H1", + "SISTRSubtyping.0.h2" : "SISTR H2", + "SISTRSubtyping.0.o_antigen" : "SISTR Antigen", + "SISTRSubtyping.0.qc_messages" : "SISTR QC Message", + "SISTRSubtyping.0.qc_status" : "SISTR QC Status", + "SISTRSubtyping.0.serogroup" : "SISTR Serogroup", + "SISTRSubtyping.0.serovar" : "SISTR Serovar", + "SISTRSubtyping.0.serovar_antigen" : "SISTR Serovar Antigen", + "SISTRSubtyping.0.serovar_cgmlst" : "SISTR Serovar cgMLST", + "SeqtkBaseCount" : "BaseCount", + "SevenGeneMLSTReport.0.alleles.abcZ" : "abcZ", + "SevenGeneMLSTReport.0.alleles.adk" : "adk", + "SevenGeneMLSTReport.0.alleles.arcA" : "arcA", + "SevenGeneMLSTReport.0.alleles.aroC" : "aroC", + "SevenGeneMLSTReport.0.alleles.aspC" : "aspC", + "SevenGeneMLSTReport.0.alleles.bglA" : "bglA", + "SevenGeneMLSTReport.0.alleles.cat" : "cat", + "SevenGeneMLSTReport.0.alleles.clpX" : "clpX", + 
"SevenGeneMLSTReport.0.alleles.dapE" : "dapE", + "SevenGeneMLSTReport.0.alleles.dat" : "dat", + "SevenGeneMLSTReport.0.alleles.dnaG" : "dnaG", + "SevenGeneMLSTReport.0.alleles.dnaN" : "dnaN", + "SevenGeneMLSTReport.0.alleles.fadD" : "fadD", + "SevenGeneMLSTReport.0.alleles.fumC" : "fumC", + "SevenGeneMLSTReport.0.alleles.gyrB" : "gyrB", + "SevenGeneMLSTReport.0.alleles.hemD" : "hemD", + "SevenGeneMLSTReport.0.alleles.hisD" : "hisD", + "SevenGeneMLSTReport.0.alleles.icd" : "icd", + "SevenGeneMLSTReport.0.alleles.ldh" : "ldh", + "SevenGeneMLSTReport.0.alleles.lhkA" : "lhkA", + "SevenGeneMLSTReport.0.alleles.lysP" : "lysP", + "SevenGeneMLSTReport.0.alleles.mdh" : "mdh", + "SevenGeneMLSTReport.0.alleles.purA" : "purA", + "SevenGeneMLSTReport.0.alleles.purE" : "purE", + "SevenGeneMLSTReport.0.alleles.recA" : "recA", + "SevenGeneMLSTReport.0.alleles.sucA" : "sucA", + "SevenGeneMLSTReport.0.alleles.thrA" : "thrA", + "SevenGeneMLSTReport.0.sequence_type" : "7 Gene ST", + "SevenGeneMLSTReport.0.scheme" : "7 Gene Scheme", + "StarAMR.0.Genotype" : "StarAMR Genotype", + "StarAMR.0.Predicted Phenotype" : "StarAMR Predicted Phenotype", + "StarAMR.0.CGE Predicted Phenotype" : "StarAMR CGE Predicted Phenotype", + "StarAMR.0.Plasmid" : "StarAMR Plasmid" + ] + keep = [ + "QCStatus", + "QualityAnalysis.checkm_contamination.qc_status", + "QualityAnalysis.checkm_contamination.value", + "QualityAnalysis.average_coverage.qc_status", + "QualityAnalysis.average_coverage.value", + "QualityAnalysis.n50_value.qc_status", + "QualityAnalysis.n50_value.value", + "QualityAnalysis.raw_average_quality.qc_status", + "QualityAnalysis.raw_average_quality.value", + "QualityAnalysis.length.qc_status", + "QualityAnalysis.length.value", + "QualityAnalysis.nr_contigs.qc_status", + "QualityAnalysis.nr_contigs.value", + "QCSummary", + "meta.downsampled", + "SpeciesTopHit", + "ECTyperSubtyping.0.Database", + "ECTyperSubtyping.0.Evidence", + "ECTyperSubtyping.0.GeneCoverages(%)", + 
"ECTyperSubtyping.0.GeneIdentities(%)", + "ECTyperSubtyping.0.GeneScores", + "ECTyperSubtyping.0.H-type", + "ECTyperSubtyping.0.O-type", + "ECTyperSubtyping.0.QC", + "ECTyperSubtyping.0.Serotype", + "ECTyperSubtyping.0.Species", + "ECTyperSubtyping.0.Warnings", + "LISSEROSubtyping.0.SEROTYPE", + "QUAST.0.GC (%)", + "RawReadSummary.R1.mean_sequence_length", + "SISTRSubtyping.0.cgmlst_ST", + "SISTRSubtyping.0.cgmlst_found_loci", + "SISTRSubtyping.0.cgmlst_genome_match", + "SISTRSubtyping.0.cgmlst_matching_alleles", + "SISTRSubtyping.0.cgmlst_subspecies", + "SISTRSubtyping.0.h1", + "SISTRSubtyping.0.h2", + "SISTRSubtyping.0.o_antigen", + "SISTRSubtyping.0.qc_messages", + "SISTRSubtyping.0.qc_status", + "SISTRSubtyping.0.serogroup", + "SISTRSubtyping.0.serovar", + "SISTRSubtyping.0.serovar_antigen", + "SISTRSubtyping.0.serovar_cgmlst", + "SeqtkBaseCount", + "SevenGeneMLSTReport.0.alleles.abcZ", + "SevenGeneMLSTReport.0.alleles.adk", + "SevenGeneMLSTReport.0.alleles.arcA", + "SevenGeneMLSTReport.0.alleles.aroC", + "SevenGeneMLSTReport.0.alleles.aspC", + "SevenGeneMLSTReport.0.alleles.bglA", + "SevenGeneMLSTReport.0.alleles.cat", + "SevenGeneMLSTReport.0.alleles.clpX", + "SevenGeneMLSTReport.0.alleles.dapE", + "SevenGeneMLSTReport.0.alleles.dat", + "SevenGeneMLSTReport.0.alleles.dnaG", + "SevenGeneMLSTReport.0.alleles.dnaN", + "SevenGeneMLSTReport.0.alleles.fadD", + "SevenGeneMLSTReport.0.alleles.fumC", + "SevenGeneMLSTReport.0.alleles.gyrB", + "SevenGeneMLSTReport.0.alleles.hemD", + "SevenGeneMLSTReport.0.alleles.hisD", + "SevenGeneMLSTReport.0.alleles.icd", + "SevenGeneMLSTReport.0.alleles.ldh", + "SevenGeneMLSTReport.0.alleles.lhkA", + "SevenGeneMLSTReport.0.alleles.lysP", + "SevenGeneMLSTReport.0.alleles.mdh", + "SevenGeneMLSTReport.0.alleles.purA", + "SevenGeneMLSTReport.0.alleles.purE", + "SevenGeneMLSTReport.0.alleles.recA", + "SevenGeneMLSTReport.0.alleles.sucA", + "SevenGeneMLSTReport.0.alleles.thrA", + "SevenGeneMLSTReport.0.sequence_type", + 
"SevenGeneMLSTReport.0.scheme", + "StarAMR.0.Genotype", + "StarAMR.0.Predicted Phenotype", + "StarAMR.0.CGE Predicted Phenotype", + "StarAMR.0.Plasmid" ] json { - path = "**/SummaryReport/final_report_flattened.json" + path = "**/FinalReports/Sample/Json/final_report_flattened.json" } } } diff --git a/conf/modules.config b/conf/modules.config index 2d0205cb..ba4b9083 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,8 +10,31 @@ ---------------------------------------------------------------------------------------- */ -// TODO tie all ext.args to nextflow.config -// TODO mix in versions of all outputs + +def reformat_output(filename_in, outputtype, merge_string, meta_){ + /* + filename_in ConfigObject: The file or directory being renamed. Can be treated as GString but not a file + outputtype GString|null: string specifying the type of output to run, or null if there is not type + merge_string Gstring: String to fill in as a tool name + meta_: metadata passed in + + filename_in appears to be a ConfigObject preventing file methods from being called on the directory. + */ + + def allowed_compression_types = ["gz"] + def output_type = outputtype + def file_extension_delimiter = '.' + + def parent = filename_in.tokenize(File.separator)[0..<-1].join(File.separator) ?: null + def tokens = parent ? filename_in.tokenize(File.separator)[-1].tokenize(file_extension_delimiter) : filename_in.tokenize(file_extension_delimiter) + output_type ? allowed_compression_types.contains(tokens[-1]) ? tokens.add(tokens.size() - 2, output_type) : tokens.add(tokens.size() - 1, output_type) : null + + def new_name = tokens[0].equals(meta_.id) ? meta_.id : "${meta_.id}.${tokens[0]}" + def new_fn = "${new_name}.${merge_string}.${tokens[1..-1].join(file_extension_delimiter)}" + def output = parent ? 
[parent, new_fn].join(File.separator) : new_fn + return output +} + process { if(params.slurm_p){ @@ -22,19 +45,103 @@ process { } + assembly_directory_name = "Assembly" + read_directory_name = "Reads" + summaries_directory_name = "FinalReports" + + read_quality_directory_name = ["${params.outdir}", "${read_directory_name}", "Quality"].join(File.separator) + read_trimmed_reports_directory_name = ["${read_quality_directory_name}", "Trimmed"].join(File.separator) + read_final_processing_directory_name = ["${params.outdir}", "${read_directory_name}", "FinalReads"].join(File.separator) + + read_processing_directory_name = ["${params.outdir}", "${read_directory_name}", "Processing"].join(File.separator) + read_decon_directory_name = ["${read_processing_directory_name}", "Dehosting"].join(File.separator) + read_trimmed_directory_name = ["${read_decon_directory_name}", "Trimmed"].join(File.separator) + read_downsampled_directory_name = ["${read_trimmed_directory_name}", "DownSampled"].join(File.separator) + + + assembly_assembling_directory_name = ["${params.outdir}", "${assembly_directory_name}", "Assembling"].join(File.separator) + assembly_consensus_directory_name = ["${assembly_assembling_directory_name}", "ConsensusGeneration"].join(File.separator) + assembly_polishing_directory_name = ["${assembly_consensus_directory_name}", "Polishing"].join(File.separator) + assembly_final_result_directory_name = ["${params.outdir}", "${assembly_directory_name}", "FinalAssembly"].join(File.separator) + + + assembly_post_processing_directory_name = ["${params.outdir}", "${assembly_directory_name}", "PostProcessing"].join(File.separator) + assembly_filtering_directory_name = ["${assembly_post_processing_directory_name}", "LengthFilteredContigs"].join(File.separator) + assembly_post_speciation_directory_name = ["${assembly_post_processing_directory_name}", "Speciation"].join(File.separator) + + assembly_contigs_metagenomic_directory_name = 
["${assembly_post_processing_directory_name}", "Metagenomic"].join(File.separator) + assembly_contigs_binned_directory_name = ["${assembly_contigs_metagenomic_directory_name}", "BinnedContigs"].join(File.separator) + + assembly_annotations_directory_name = ["${params.outdir}", "${assembly_directory_name}", "Annotation"].join(File.separator) + assembly_subtyping_directory_name = ["${params.outdir}", "${assembly_directory_name}", "Subtyping"].join(File.separator) + assembly_quality_directory_name = ["${params.outdir}", "${assembly_directory_name}", "Quality"].join(File.separator) + + summaries_reports_directory_name = ["${params.outdir}", "${summaries_directory_name}"].join(File.separator) + + publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - //withName: SAMPLESHEET_CHECK { - // publishDir = [ - // path: { "${params.outdir}/pipeline_info" }, - // mode: params.publish_dir_mode, - // saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - // ] - //} + withName: PUBLISH_FINAL_READS { + ext.parameters = params.coreutils + publishDir = [ + path: { "${task.read_final_processing_directory_name}" }, + mode: params.publish_dir_mode, + pattern: "*/*", + saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, "reads", "final", meta) } + ] + } + + withName: PUBLISH_FINAL_ASSEMBLIES { + ext.parameters = params.coreutils + publishDir = [ + path: { "${task.assembly_final_result_directory_name}" }, + mode: params.publish_dir_mode, + pattern: "*/*", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : reformat_output(filename, "assembly", "final", meta) } + ] + } + + withName: LOCIDEX_EXTRACT { + ext.parameters = params.locidex + publishDir = [ + mode: params.publish_dir_mode, + path: { ["${task.assembly_subtyping_directory_name}", "Locidex", "Extract"].join(File.separator) }, + pattern: "*/*${params.locidex.extracted_seqs_suffix}", + saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, "subtyping", "locidex.extract", meta) } + ] + } + + withName: LOCIDEX_SEARCH { + ext.parameters = params.locidex + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { ["${task.assembly_subtyping_directory_name}", "Locidex", "Search"].join(File.separator) }, + pattern: "*${params.locidex.seq_store_suffix}", + saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, null, "locidex", meta) } + ], + [ + mode: params.publish_dir_mode, + path: { ["${task.assembly_subtyping_directory_name}", "Locidex", "Search"].join(File.separator) }, + pattern: "*${params.locidex.gbk_suffix}", + saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, "annotation", "locidex", meta) } + ] + ] + } + + withName: LOCIDEX_REPORT { + ext.parameters = params.locidex + publishDir = [ + mode: params.publish_dir_mode, + path: { ["${task.assembly_subtyping_directory_name}", "Locidex", "Report"].join(File.separator) }, + pattern: "*${params.locidex.report_suffix}", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : reformat_output(filename, "subtyping", "locidex.report", meta) } + ] + } withName: REPORT{ executor = 'local' @@ -42,8 +149,8 @@ process { errorStrategy = "terminate" publishDir = [ mode: params.publish_dir_mode, - path: "${params.outdir}/SummaryReport", - pattern: "final_report.json" + path: { ["${task.summaries_reports_directory_name}", "Aggregated", "Json"].join(File.separator) }, + pattern: "final_report.json", ] } @@ -53,60 +160,40 @@ process { } withName: REPORT_AGGREGATE { - ext.containers = params.python3 + ext.parameters = params.python3 cache = 'false' // Resume does not work on module, if enabled a warning is thrown errorStrategy = "terminate" publishDir = [ [ mode: params.publish_dir_mode, - path: "${params.outdir}/SummaryReport", - pattern: "*.tsv" + path: { ["${task.summaries_reports_directory_name}", "Aggregated", "Tables"].join(File.separator) }, + pattern: "*.tsv", ], [ mode: params.publish_dir_mode, - path: "${params.outdir}/SummaryReport", - pattern: "*.json" + path: { ["${task.summaries_reports_directory_name}", "Sample", "Json"].join(File.separator) }, + pattern: "*.json", ] ] } - //withName: SHIGATYPER { - // ext.containers = params.shigatyper - // //container = params.shigatyper.container - // stageInMode = params.stage_in_mode - // // scratch = false - // publishDir = [ - // path: { "${params.outdir}/subtyping/shigatyper" }, - // mode: params.publish_dir_mode, - // ] - //} - - //withName: FASTQSCAN { - // container = params.fastqscan.container - // ext.args = params.fastqscan.args - // errorStrategy = 'terminate' - // publishDir = [ - // mode: params.publish_dir_mode, - // path: "${params.outdir}/ReadQuality/raw_reads", - // pattern: "*.json" - // ] - //} withName: BIN_KRAKEN2 { - ext.containers = params.python3 - maxForks = 20; + ext.parameters = params.python3 + maxForks = 20 + publishDir = [ mode: params.publish_dir_mode, - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/kraken2/binned_contigs", - pattern: 
"*${params.kraken_bin.fasta_ext}" + path: { ["${task.assembly_contigs_binned_directory_name}", "Kraken2"].join(File.separator) }, + pattern: "*${params.kraken_bin.fasta_ext}", + saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, "assembly", "kraken2", meta) } ] } withName: COMBINE_DATA { - ext.containers = params.coreutils - //container = params.coreutils.container + ext.parameters = params.coreutils publishDir = [ enabled: false ] @@ -114,18 +201,18 @@ process { withName: GZIP_FILES { - ext.containers = params.coreutils + ext.parameters = params.coreutils publishDir = [ [ mode: params.publish_dir_mode, - path: "${params.outdir}/SummaryReport", + path: { ["${task.summaries_reports_directory_name}", "FlattenedReports"].join(File.separator) }, pattern: "*.gz" ] ] } withName: CHECK_ONT { - ext.containers = params.python3 + ext.parameters = params.python3 publishDir = [ enabled: false ] @@ -133,7 +220,7 @@ process { } withName: PARSE_MASH { - ext.containers = params.python3 + ext.parameters = params.python3 errorStrategy = { task.exitStatus == 255 || task.exitStatus == 1 ? 'ignore' : 'finish'} publishDir = [ enabled: false @@ -141,7 +228,7 @@ process { } withName: PARSE_KRAKEN { - ext.containers = params.python3 + ext.parameters = params.python3 errorStrategy = { task.exitStatus == 255 || task.exitStatus == 1 ? 'ignore' : 'finish'} publishDir = [ enabled: false @@ -150,12 +237,14 @@ process { withName: READ_SCAN { errorStrategy = "terminate" - ext.containers = params.python3 + ext.parameters = params.python3 publishDir = [ [ mode: params.publish_dir_mode, - path: "${params.outdir}/ReadQuality/RawReadData", - pattern: "*.json" + path: { [task.read_quality_directory_name, "RawReadQuality"].join(File.separator) }, + pattern: "*.json", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "summary", "read.scan", meta) } ], ] } @@ -180,40 +269,46 @@ process { withName: SEQKIT_STATS { ext.args = "" - ext.containers = params.seqkit + ext.parameters = params.seqkit stageInMode = params.stage_in_mode publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/quality/seqkit_stats/", + path: { ["${task.assembly_quality_directory_name}", "SeqKitStats"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.seqkit.report_ext}" + pattern: "*${params.seqkit.report_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "summary", "seqkit.stats", meta) } ] ] } withName: SEQKIT_FILTER { ext.args = "" - ext.containers = params.seqkit + ext.parameters = params.seqkit stageInMode = params.stage_in_mode publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/length_filtered_contigs/", + path: { ["${task.assembly_filtering_directory_name}", "SeqKitFilter"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.seqkit.fasta_ext}" + pattern: "*${params.seqkit.fasta_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "assembly", "seqkit.filter", meta) } ] ] } withName: SEQTK_SAMPLE { ext.args = "" - ext.containers = params.seqtk + ext.parameters = params.seqtk stageInMode = params.stage_in_mode // scratch = false publishDir = [ [ - path: "${params.outdir}/ReadQuality/SubSampledReads", + path: { [ "${task.read_downsampled_directory_name}", "SeqTK" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.seqtk.reads_ext}" + pattern: "*${params.seqtk.reads_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "reads", "seqtk.sample", meta) } ] ] } @@ -221,45 +316,36 @@ process { withName: SEQTK_SIZE { ext.args = "" - ext.containers = params.seqtk_size + ext.parameters = params.seqtk_size stageInMode = params.stage_in_mode // scratch = false publishDir = [ [ - path: "${params.outdir}/ReadQuality/SubSampledReads/BaseCounts", + path: { ["${task.read_final_processing_directory_name}", "BaseCounts"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*.txt" + pattern: "*.txt", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "seqtk.size", meta) } ] ] } - //withName: SEQTK_FASTA_FASTQ { - // ext.args = "" - // ext.containers = params.seqtk - // //container = params.seqtk.container - // stageInMode = params.stage_in_mode - // // scratch = false - // publishDir = [ - // [ - // path: "${params.outdir}/subtyping/shigatyper", - // mode: params.publish_dir_mode, - // pattern: "*${params.seqtk.assembly_fastq}" - // ] - // ] - //} + withName: QUAST { ext.args = params.quast.args - ext.containers = params.quast + ext.parameters = params.quast stageInMode = params.stage_in_mode // scratch = false errorStrategy = { task.attempt <= task.maxRetries ? sleep(Math.pow(2, task.attempt) * 200 as long) && 'retry' : 'ignore' } maxForks = 10 // Quast can get overloaded by job subs, so needs to be limited publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/quality/${params.quast.suffix}", + path: { ["${task.assembly_quality_directory_name}", "QUAST"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*/*" + pattern: "*/**", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "quality", "quast", meta) } ] ] @@ -268,15 +354,17 @@ process { withName: CHECKM_LINEAGEWF { ext.args = "" //container = params.checkm.container - ext.containers = params.checkm + ext.parameters = params.checkm stageInMode = params.stage_in_mode errorStrategy = { task.attempt <= task.maxRetries ? sleep(Math.pow(2, task.attempt) * 200 as long) && 'retry' : 'finish' } // scratch = false publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/quality/${params.checkm.folder_name}", + path: { ["${task.assembly_quality_directory_name}", "CheckM"].join(File.separator) } , mode: params.publish_dir_mode, - pattern: "*/*" + pattern: "*/**", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "quality", "checkm", meta) } ] ] } @@ -284,15 +372,17 @@ process { withName: BANDAGE_IMAGE { ext.args = "" //container = params.bandage.container - ext.containers = params.bandage + ext.parameters = params.bandage errorStrategy = { task.attempt <= task.maxRetries ? sleep(Math.pow(2, task.attempt) * 200 as long) && 'retry' : 'ignore' } stageInMode = params.stage_in_mode // scratch = false publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/quality/${params.bandage.outdir}", + path: { ["${task.assembly_assembling_directory_name}", "Bandage"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.bandage.svg_ext}" + pattern: "*${params.bandage.svg_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "quality", "bandage", meta) } ] ] } @@ -300,30 +390,37 @@ process { withName: KRAKEN { ext.args = "" //container = params.kraken.container - ext.containers = params.kraken + ext.parameters = params.kraken // scratch = false stageInMode = params.stage_in_mode publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/kraken2/${params.kraken.classified_suffix}", + path: { ["${task.assembly_post_speciation_directory_name}", "Kraken2", "Classified"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*.${params.kraken.classified_suffix}*" - + pattern: "*.${params.kraken.classified_suffix}*", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "kraken2", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/kraken2/${params.kraken.unclassified_suffix}", + path: { ["${task.assembly_post_speciation_directory_name}", "Kraken2", "UnClassified"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*.${params.kraken.unclassified_suffix}*" + pattern: "*.${params.kraken.unclassified_suffix}*", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "kraken2", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/kraken2/${params.kraken.report_suffix}", + path: { ["${task.assembly_post_speciation_directory_name}", "Kraken2", "Report"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*.kraken2.${params.kraken.report_suffix}.txt" + pattern: "*.kraken2.${params.kraken.report_suffix}.txt", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "screen", "kraken2", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/kraken2/${params.kraken.output_suffix}", + path: { ["${task.assembly_post_speciation_directory_name}", "Kraken2", "Output"].join(File.separator) } , mode: params.publish_dir_mode, - pattern: "*.${params.kraken.output_suffix}.txt" + pattern: "*.${params.kraken.output_suffix}.txt", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "kraken2", meta) } ] ] @@ -331,7 +428,7 @@ process { withName: MASH_ESTIMATE { - ext.containers = params.mash + ext.parameters = params.mash // scratch = false maxForks = 20 maxRetries = 3 @@ -339,9 +436,11 @@ process { stageInMode = params.stage_in_mode publishDir = [ [ - path: "${params.outdir}/ReadQuality/mash_sketch", + path: { ["${task.read_trimmed_directory_name}", "MashSketches"].join(File.separator) } , mode: params.publish_dir_mode, - pattern: "*${params.mash.sketch_ext}" + pattern: "*${params.mash.sketch_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "mash.estimate", meta) } ] ] @@ -349,55 +448,49 @@ process { withName: MLST { //container = params.mlst.container - ext.containers = params.mlst + ext.parameters = params.mlst // scratch = false stageInMode = params.stage_in_mode errorStrategy = 'ignore' publishDir = [ [ - path: "${params.outdir}/assembly/7GeneMLST", + path: { ["${task.assembly_subtyping_directory_name}", "SevenGeneMLST"].join(File.separator) } , mode: params.publish_dir_mode, - pattern: "*${params.mlst.json_ext}" + pattern: "*${params.mlst.json_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "subtyping", "7.mlst", meta) } ] ] } - withName: STARAMR_DUMP_DB_VERSIONS { - ext.containers = params.staramr - ext.args = params.staramr.args - errorStrategy = 'ignore' - publishDir = [ - [ - path: "${params.outdir}/annotations/StarAMR", - mode: params.publish_dir_mode, - ] - ] - } withName: STARAMR { - ext.containers = params.staramr + ext.parameters = params.staramr ext.args = params.staramr.args errorStrategy = 'ignore' publishDir = [ [ - path: "${params.outdir}/annotations/StarAMR", + path: { ["${task.assembly_annotations_directory_name}", "StarAMR"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*/*" + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "annotation", "staramr", meta) } ] ] } withName: MOBSUITE_RECON { - ext.containers = params.mobsuite_recon + ext.parameters = params.mobsuite_recon stageInMode = params.stage_in_mode errorStrategy = 'ignore' ext.args = params.mobsuite_recon.args publishDir = [ [ - path: "${params.outdir}/annotations/mobrecon/", + path: { ["${task.assembly_annotations_directory_name}", "Mobsuite", "Recon"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*/*" + pattern: "*/*", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "annotation", "mob.recon", meta) } ] ] } @@ -405,7 +498,7 @@ process { withName: MASH_SKETCH { //container = params.mash.container maxForks = 20 - ext.containers = params.mash + ext.parameters = params.mash // scratch = false maxRetries = 3 maxErrors = 3 @@ -416,7 +509,7 @@ process { withName: MASH_PASTE { //container = params.mash.container - ext.containers = params.mash + ext.parameters = params.mash // scratch = false stageInMode = params.stage_in_mode publishDir = [ @@ -430,48 +523,45 @@ process { withName: MASH_SCREEN { def dir_out = null - errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } + errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long) + return 'retry' } maxForks = 20 maxErrors = 3 - //dir_out = process.toString().contains("QC_READS") ? "ReadQuality/${params.mash.output_dir}/mash" : "taxon_determination/mash" ext.args = "-w" - //container = params.mash.container - ext.containers = params.mash - // scratch = false + ext.parameters = params.mash stageInMode = params.stage_in_mode publishDir = [ [ - path: "${params.outdir}/ReadQuality/${params.mash.output_dir}/mash", + path: { ["${task.read_trimmed_reports_directory_name}", "MashScreen" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.mash.output_reads_ext}" + pattern: "*${params.mash.output_reads_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "screen", "mash.screen", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/taxon_determination/mash", + path: { ["${task.assembly_post_speciation_directory_name}", "MashScreen"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.mash.output_taxa_ext}" + pattern: "*${params.mash.output_taxa_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "screen", "mash.screen", meta) } ] ] } - withName: PARSE_KAT { - // scratch = false - ext.containers = params.python3 - executor = 'local' - errorStrategy = "terminate" - - } withName: REMOVE_CONTAMINANTS { ext.args = "-a" // output sam format in minimap2 - ext.containers = params.r_contaminants + ext.parameters = params.r_contaminants //container = params.r_contaminants.container // scratch = false stageInMode = params.stage_in_mode publishDir = [ [ - path: { "${params.outdir}/ReadQuality/${params.r_contaminants.output_dir}"}, + path: { "${task.read_decon_directory_name}" }, mode: params.publish_dir_mode, - pattern: "*.gz" // specifying this outside of config as yet to handle singletons TODO decide on singleton usage + pattern: "*.gz", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "reads", "deconned", meta) } ] ] } @@ -479,39 +569,51 @@ process { withName: FLYE_ASSEMBLE { errorStrategy = { task.exitStatus in [140] ? 'retry' : 'ignore'} //container = params.flye.container - ext.containers = params.flye + ext.parameters = params.flye ext.args = params.flye.args maxRetries = 3 // scratch = false publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/flye/contigs", + path: { ["${task.assembly_assembling_directory_name}", "Flye", "Contigs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.fasta_ext}" + pattern: "*${params.flye.fasta_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "assembly", "flye", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/flye/graphs", + path: { ["${task.assembly_assembling_directory_name}", "Flye", "Graphs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.gfa_ext}" + pattern: "*${params.flye.gfa_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "graph", "flye", meta) } ], - [ path: "${params.outdir}/${params.spades.outdir}/flye/gv", + [ path: { ["${task.assembly_assembling_directory_name}", "Flye", "gv" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.gv_ext}" + pattern: "*${params.flye.gv_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "flye", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/flye/summary", + path: { ["${task.assembly_assembling_directory_name}", "Flye", "Summary" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.txt_ext}" + pattern: "*${params.flye.txt_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "flye", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/flye/log", + path: { ["${task.assembly_assembling_directory_name}", "Flye", "Log" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.log_ext}" + pattern: "*${params.flye.log_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "flye", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/flye/json", + path: { ["${task.assembly_assembling_directory_name}", "Flye", "Json" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.flye.json_ext}" + pattern: "*${params.flye.json_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "flye", meta) } ] ] } @@ -520,40 +622,52 @@ process { withName: SPADES_ASSEMBLE { maxRetries = 3 ext.args = "" - ext.containers = params.spades + ext.parameters = params.spades errorStrategy = 'ignore' //errorStrategy = { task.attempt <= task.maxRetries && sleep(Math.pow(2, task.attempt) * 200 as long) ? 
'retry' : 'ignore' } // scratch = false publishDir = [ [ - path: "${params.outdir}/${params.spades.outdir}/spades/scaffolds/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "Scaffolds" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.scaffolds_ext}" + pattern: "*${params.spades.scaffolds_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "spades", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/spades/contigs/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "Contigs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.contigs_ext}" + pattern: "*${params.spades.contigs_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "assembly", "spades", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/spades/transcripts/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "Transcripts" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.transcripts_ext}" + pattern: "*${params.spades.transcripts_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "spades", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/spades/gene_clusters/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "GeneClusters" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.gene_clusters_ext}" + pattern: "*${params.spades.gene_clusters_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, null, "spades", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/spades/logs/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "Logs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.log_ext}" + pattern: "*${params.spades.log_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "spades", meta) } ], [ - path: "${params.outdir}/${params.spades.outdir}/spades/graphs/", + path: { ["${task.assembly_assembling_directory_name}", "Spades", "Graphs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spades.assembly_graphs_ext}" + pattern: "*${params.spades.assembly_graphs_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "graph", "spades", meta) } ] ] } @@ -563,23 +677,29 @@ process { withName: UNICYCLER_ASSEMBLE { ext.args = "" //container = params.unicycler.container - ext.containers = params.unicycler + ext.parameters = params.unicycler // scratch = false publishDir = [ [ - path: { "${params.outdir}/${params.spades.outdir}/${params.unicycler.outdir}/scaffolds"}, + path: { ["${task.assembly_assembling_directory_name}", "Unicycler", "Scaffolds" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.unicycler.scaffolds_ext}" + pattern: "*${params.unicycler.scaffolds_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "unicycler", meta) } ], [ - path: { "${params.outdir}/${params.spades.outdir}/${params.unicycler.outdir}/assembly" }, + path: { ["${task.assembly_assembling_directory_name}", "Unicycler", "Assembly" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.unicycler.assembly_ext}" + pattern: "*${params.unicycler.assembly_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "assembly", "unicycler", meta) } ], [ - path: { "${params.outdir}/${params.spades.outdir}/${params.unicycler.outdir}/logs" }, + path: { ["${task.assembly_assembling_directory_name}", "Unicycler", "Logs" ].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.unicycler.log_ext}" + pattern: "*${params.unicycler.log_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "unicycler", meta) } ] ] } @@ -588,43 +708,33 @@ process { withName: FASTP_TRIM { //ext.args = "" //container = params.fastp.container - ext.containers = params.fastp + ext.parameters = params.fastp // scratch = false publishDir = [ [ - path: { "${params.outdir}/ReadQuality/fastp/trimmed_reads"}, + path: { ["${task.read_trimmed_directory_name}", "FastP"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.fastp.fastq_ext}" + pattern: "*${params.fastp.fastq_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "reads", "fastp", meta) } ], [ - path: { "${params.outdir}/ReadQuality/fastp/reports"}, + path: { ["${task.read_trimmed_reports_directory_name}", "FastP"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*{${params.fastp.json_ext},${params.fastp.html_ext}}" + pattern: "*{${params.fastp.json_ext},${params.fastp.html_ext}}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "summary", "fastp", meta) } ] ] } - withName: CHOPPER_TRIM { - ext.args = "-q ${params.chopper.quality} -l ${params.chopper.minlength}" - //container = params.chopper.container - ext.containers = params.chopper - // scratch = false - stageInMode = params.stage_in_mode - publishDir = [ - [ - path: { "${params.outdir}/ReadQuality/chopper/trimmed_reads"}, - mode: params.publish_dir_mode, - pattern: "*${params.chopper.fastq_ext}" - ] - ] - } withName: MINIMAP2_INDEX { ext.args = "" //container = params.minimap2.container - ext.containers = params.minimap2 + ext.parameters = params.minimap2 // scratch = false stageInMode = params.stage_in_mode publishDir = [ @@ -635,22 +745,17 @@ process { withName: MINIMAP2_MAP { ext.args = "" //container = params.minimap2.container - ext.containers = params.minimap2 + ext.parameters = params.minimap2 // scratch = false stageInMode = params.stage_in_mode publishDir = [ enabled: false - //[ - // path: { "${params.outdir}/${params.spades.outdir}/minimap2/${params.minimap2.mapped_outdir}" }, - // mode: params.publish_dir_mode, - // pattern: "*{${params.minimap2.mapped_paf_ext},${params.minimap2.mapped_sam_ext}}" - //] ] } withName: SAM_TO_BAM { //container = params.samtools.container - ext.containers = params.samtools + ext.parameters = params.samtools // scratch = false stageInMode = params.stage_in_mode publishDir = [ @@ -661,132 +766,97 @@ process { withName: RACON_POLISH { ext.args = "" //container = params.racon.container - ext.containers = params.racon + ext.parameters = params.racon // scratch = false stageInMode = params.stage_in_mode publishDir = [ [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/racon/racon_consensus" }, + path: { ["${task.assembly_consensus_directory_name}", "Racon", "Consensus"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.racon.consensus_ext}" + pattern: "*${params.racon.consensus_ext}", + saveAs: { filename -> + 
filename.equals('versions.yml') ? null : reformat_output(filename, "assembly", "racon", meta) } ] ] } - withName: PILON_POLISH { - ext.args = "" - stageInMode = params.stage_in_mode - // scratch = false - //container = params.pilon.container - ext.containers = params.pilon - publishDir = [ - [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon.outdir}/fasta" }, - mode: params.publish_dir_mode, - pattern: "*${params.pilon.fasta_ext}" - ], - [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon.outdir}/vcf" }, - mode: params.publish_dir_mode, - pattern: "*${params.pilon.vcf_ext}" - ], - [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon.outdir}/changes" }, - mode: params.publish_dir_mode, - pattern: "*${params.pilon.changes_ext}" - ] - ] - } withName: PILON_ITER { ext.args = "" stageInMode = params.stage_in_mode // scratch = false - ext.containers = params.pilon_iterative + ext.parameters = params.pilon_iterative maxRetries = 3 errorStrategy = { task.attempt <= task.maxRetries ? sleep(Math.pow(2, task.attempt) * 200 as long) && 'retry' : 'ignore' } publishDir = [ [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon_iterative.outdir}/fasta" }, + path: { ["${task.assembly_polishing_directory_name}", "Pilon", "Fasta"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.pilon_iterative.fasta_ext}" + pattern: "*${params.pilon_iterative.fasta_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "assembly", "pilon", meta) } ], [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon_iterative.outdir}/vcf" }, + path: { ["${task.assembly_polishing_directory_name}", "Pilon", "VCF"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.pilon_iterative.vcf_ext}" + pattern: "*${params.pilon_iterative.vcf_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "vcf", "pilon", meta) } ], [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon_iterative.outdir}/changes" }, + path: { ["${task.assembly_polishing_directory_name}", "Pilon", "Changes"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.pilon_iterative.changes_ext}" + pattern: "*${params.pilon_iterative.changes_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, null, "pilon", meta) } ], [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.pilon_iterative.outdir}/bams" }, + path: { ["${task.assembly_polishing_directory_name}", "Pilon", "BAMs"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.pilon_iterative.bam_ext}" + pattern: "*${params.pilon_iterative.bam_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "bam", "pilon", meta) } ] ] } - withName: KAT_HIST { - ext.args = "" - stageInMode = params.stage_in_mode - // scratch = false - errorStrategy = "finish" - //container = params.kat.container - ext.containers = params.kat - publishDir = [ - [ - path: { "${params.outdir}/ReadQuality/kat/hist/histogram" }, - mode: params.publish_dir_mode, - pattern: "*${params.kat.hist_ext}" - ], - [ - path: { "${params.outdir}/ReadQuality/kat/hist/json" }, - mode: params.publish_dir_mode, - pattern: "*${params.kat.json_ext}" - ], - [ - path: { "${params.outdir}/ReadQuality/kat/hist/png" }, - mode: params.publish_dir_mode, - pattern: "*${params.kat.png_ext}" - ] - ] - } withName: MEDAKA_POLISH{ ext.args = "" cache = 'lenient' stageInMode = params.stage_in_mode // scratch = false - ext.containers = params.medaka + ext.parameters = params.medaka maxRetries = 3 publishDir = [ [ - path: { "${params.outdir}/${params.spades.outdir}/${params.racon.outdir}/${params.medaka.outdir}" }, + path: { ["${task.assembly_polishing_directory_name}", "Medaka"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.medaka.fasta_ext}" + pattern: "*${params.medaka.fasta_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "assembly", "medaka", meta) } ] ] } withName: BAKTA_DB_DOWNLOAD { // scratch = false - ext.containers = params.bakta + ext.parameters = params.bakta stageInMode = params.stage_in_mode } withName: BAKTA_ANNOTATE { // scratch = false // bakta uses its own temp dir so things get weird errorStrategy = 'ignore' // Some samples do not run through MikroKondo - ext.containers = params.bakta + ext.parameters = params.bakta ext.args = params.bakta.args stageInMode = params.stage_in_mode publishDir = [ [ - path: { "${params.outdir}/annotations/bakta/" }, + path: { ["${task.assembly_annotations_directory_name}", "Bakta"].join(File.separator) }, mode: params.publish_dir_mode, + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "annotation", "bakta", meta) } ] ] @@ -795,82 +865,93 @@ process { withName: ABRICATE { errorStrategy = 'ignore' ext.args = params.abricate.args - ext.containers = params.abricate + ext.parameters = params.abricate stageInMode = params.stage_in_mode publishDir = [ [ - path: { "${params.outdir}/annotations/abricate/" }, + path: { ["${task.assembly_annotations_directory_name}", "Abricate"].join(File.separator) }, mode: params.publish_dir_mode, + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "annotation", "abricate", meta) } ] ] } withName: ECTYPER { - //container = params.ectyper.container - ext.containers = params.ectyper + ext.parameters = params.ectyper stageInMode = params.stage_in_mode ext.args = params.ectyper.args time = '30m' // scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/ectyper" }, + path: { ["${task.assembly_subtyping_directory_name}", "ECTyper"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*/*" + pattern: "*/*", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "subtyping", "ectyper", meta) } ] } withName: KLEBORATE { - ext.containers = params.kleborate + ext.parameters = params.kleborate stageInMode = params.stage_in_mode // scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/kleborate" }, + path: { ["${task.assembly_subtyping_directory_name}", "Kleborate"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.kleborate.txt_ext}" + pattern: "*${params.kleborate.txt_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "subtyping", "kleborate", meta) } ] } withName: SPATYPER { - ext.containers = params.spatyper + ext.parameters = params.spatyper stageInMode = params.stage_in_mode //scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/spatyper" }, + path: { ["${task.assembly_subtyping_directory_name}", "SpaTyper"].join(File.separator) }, mode: params.publish_dir_mode, - pattern: "*${params.spatyper.tsv_ext}" + pattern: "*${params.spatyper.tsv_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "subtyping", "spatyper", meta) } ] } withName: SISTR { - //container = params.sistr.container - ext.containers = params.sistr + ext.parameters = params.sistr stageInMode = params.stage_in_mode //scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/sistr" }, + path: { ["${task.assembly_subtyping_directory_name}", "SISTR"].join(File.separator) }, mode: params.publish_dir_mode, + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "subtyping", "sistr", meta) } ] } withName: LISSERO { - //container = params.lissero.container - ext.containers = params.lissero + ext.parameters = params.lissero stageInMode = params.stage_in_mode //scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/lissero" }, + path: { ["${task.assembly_subtyping_directory_name}", "Lissero"].join(File.separator) }, mode: params.publish_dir_mode, + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "subtyping", "lissero", meta) } ] } withName: SHIGEIFINDER { - ext.containers = params.shigeifinder + ext.parameters = params.shigeifinder stageInMode = params.stage_in_mode scratch = false publishDir = [ - path: { "${params.outdir}/subtyping/shigeifinder" }, + path: { ["${task.assembly_subtyping_directory_name}", "ShigeiFinder"].join(File.separator) } , mode: params.publish_dir_mode, + saveAs: { filename -> + filename.equals('versions.yml') ? null : reformat_output(filename, "subtyping", "shigeifinder", meta) } ] } diff --git a/docs/index.md b/docs/index.md index 1b2741aa..beb166ef 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,16 +3,40 @@ # Welcome to mikrokondo! ## What is mikrokondo? + Mikrokondo is a tidy workflow for performing routine bioinformatic assessment of sequencing reads and assemblies, such as: read pre-processing, assessing contamination, assembly, quality assessment of assemblies, and pathogen-specific typing. It is easily configurable, provides dynamic dispatch of species specific workflows and produces common outputs. ## What is the target audience? + This workflow can be used in sequencing and reference laboratories as a part of an automated quality and initial bioinformatics assessment protocol. ## Is mikrokondo right for me? 
+ Mikrokondo is purpose built to provide sequencing and clinical laboratories with an all encompassing workflow to provide a standardized workflow that can provide the initial quality assessment of sequencing reads and assemblies, and initial pathogen-specific typing. It has been designed to be configurable so that new tools and quality metrics can be easily incorporated into the workflow to allow for automation of these routine tasks regardless of pathogen of interest. It currently accepts Illumina, Nanopore or Pacbio (Pacbio data only partially tested) sequencing data. It is capable of hybrid assembly or accepting pre-assembled genomes. This workflow will detect what pathogen(s) is present and apply the applicable metrics and genotypic typing where appropriate, generating easy to read and understand reports. If your group is regularly sequencing or analyzing genomic sequences, implementation of this workflow will automate the hands-on time time usually required for these common bioinformatic tasks. +## Whole genome and Metagenomic samples typical workflow differences + +This pipeline has been written to automatically detect if a sample contains more than one organism. Whether it is intentional (as in shotgun metagenomics) or contamination, a sample with more than one organism detected via Mash will be marked as `metagenomic`. + +Typical workflow for **whole genome sample** (one organism): + +1. Reads are cleaned of human DNA, trimmed and one organism is detected +2. Reads are assembled into contigs, then polished +3. Assemblies are run through quality tools +4. Assemblies are speciated with appropriate tool +5. Species specific subtyping tools are called based on speciation from step 4. + +Typical workflow for **metagenomic sample** (greater than one organism): + +1. Reads are cleaned of human DNA, trimmed and multiple organisms are detected +2. Reads are assembled into contigs, then polished +3. Assemblies undergo contig binning (via Kraken2) +4. 
Each bin is run through quality tools +5. Each bin is speciated with appropriate tool +6. Species specific subtyping tools are called on each bin based on speciation from step 5. + ## Workflow Schematics (Subject to change) ![Pipeline](images/mikrokondo_mermaid.svg "Workflow") diff --git a/docs/subworkflows/assemble_reads.md b/docs/subworkflows/assemble_reads.md index 9ef62e83..5b3f9cbc 100644 --- a/docs/subworkflows/assemble_reads.md +++ b/docs/subworkflows/assemble_reads.md @@ -1,26 +1,29 @@ -# Assembly - -## subworkflows/local/assemble_reads - ->**NOTE:** ->Hybrid assembly of long and short reads uses a different workflow that can be found [here](/subworkflows/hybrid_assembly) - -## Steps - -1. **Assembly** proceeds differently depending whether paired-end short or long reads. If the samples are marked as metagenomic, then metagenomic assembly flags will be added to the corresponding assembler. - - **Paired end assembly** is performed using [Spades](https://github.com/ablab/spades) (for more information see the module [spades_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/spades_assemble.nf)) - - **Long read assembly** is performed using [Flye](https://github.com/fenderglass/Flye) (for more information see the module [flye_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/flye_assemble.nf) - -2. **Bandage plots** are generated using [Bandage](https://rrwick.github.io/Bandage/), these images were included as they can be informative of assembly quality in some situations [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf). - -3. **Polishing** (OPTIONAL) can be performed on either short or long/hybrid assemblies. 
[Minimap2](https://github.com/lh3/minimap2) is used to create a contig index [minimap2_index.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_index.nf) and then maps reads to that index [minimap2_map.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_map.nf). Lastly, [Racon](https://github.com/isovic/racon) uses this output to perform contig polishing [racon_polish.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/racon_polish.nf). To turn off polishing add the following to your command line parameters `--skip_polishing`. - -## Input -- cleaned reads -- metadata - -## Outputs -- contigs -- assembly graphs -- polished contigs -- software versions +# Assembly + +## subworkflows/local/assemble_reads + +>**NOTE:** +>Hybrid assembly of long and short reads uses a different workflow that can be found [here](/mikrokondo/docs/subworkflows/hybrid_assembly/) + +## Steps + +1. **Assembly** proceeds differently depending whether paired-end short or long reads. If the samples are marked as metagenomic, then metagenomic assembly flags will be added to the corresponding assembler. + - **Paired end assembly** is performed using [Spades](https://github.com/ablab/spades) (for more information see the module [spades_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/spades_assemble.nf)) + - **Long read assembly** is performed using [Flye](https://github.com/fenderglass/Flye) (for more information see the module [flye_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/flye_assemble.nf) + +2. **Bandage plots** are generated using [Bandage](https://rrwick.github.io/Bandage/), these images were included as they can be informative of assembly quality in some situations [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf). + +3. **Polishing** (OPTIONAL) can be performed on either short or long/hybrid assemblies. 
[Minimap2](https://github.com/lh3/minimap2) is used to create a contig index [minimap2_index.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_index.nf) and then maps reads to that index [minimap2_map.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_map.nf). Lastly, [Racon](https://github.com/isovic/racon) uses this output to perform contig polishing [racon_polish.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/racon_polish.nf). To turn off polishing add the following to your command line parameters `--skip_polishing`. + +## Input +- Processed Reads from [clean reads](/docs/subworkflows/clean_reads) subworkflow + +## Output +- Reads + - FinalReads + - SAMPLE + - Processing + - Dehosting + - Trimmed + - FastP + - MashSketches diff --git a/docs/subworkflows/bin_contigs.md b/docs/subworkflows/bin_contigs.md index a8149c59..ebfc9392 100644 --- a/docs/subworkflows/bin_contigs.md +++ b/docs/subworkflows/bin_contigs.md @@ -8,11 +8,14 @@ ## Input -- contigs -- reads -- metadata +- Contig file (fasta) from the `FinalAssembly` dir + - This is the final contig file from the last step in the CleanAssemble workflow (taking into account any skip flags that have been used) +- metadata from prior tools ## Outputs - -- metadata -- binned contigs +- Assembly + - PostProcessing + - Metagenomic + - BinnedContigs + - SAMPLE + - CONTIG diff --git a/docs/subworkflows/clean_reads.md b/docs/subworkflows/clean_reads.md index 13403528..08b1c3fe 100644 --- a/docs/subworkflows/clean_reads.md +++ b/docs/subworkflows/clean_reads.md @@ -21,12 +21,29 @@ 5. **Metagenomic assesment** using a custom [Mash](https://github.com/marbl/Mash) 'sketch' file generated from the Genome Taxonomy Database [GTDB](https://gtdb.ecogenomic.org/) and the mash_screen module, this step assesses how many bacterial genera are present in a sample (e.g. 
a contaminated or metagenomic sample may have more than one genus of bacteria present) with greater than 90% identity (according to Mash). When more than 1 taxa are present, the metagenomic tag is set, turning on metagenomic assembly in later steps. Additionally Kraken2 will be run on metagenomic assemblies and contigs will be binned at a defined taxonomic level (default level: genus). -6. **Nanopore ID screening** duplicate Nanopore read ID's have been known to cause issues in the pipeline downstream. In order to bypass this issue, an option can be toggled where a script will read in Nanopore reads and append a unique ID to the header, this process can be slow so default setting is `--skip_ont_header_cleaning true`. +6. **Nanopore ID screening** duplicate Nanopore read ID's have been known to cause issues in the pipeline downstream. In order to bypass this issue, an option can be toggled where a script will read in Nanopore reads and append a unique ID to the header, this process can be slow so default setting is to skip, `--skip_ont_header_cleaning true`. ## Input -- reads and metadata +- Next generation sequencing reads: + + Short read - Illumina + + Long read: + * Nanopore + * Pacbio +- User submitted sample sheet + ## Outputs -- quality trimmed and deconned reads -- estimated genome size -- software versions +- Reads + - FinalReads + - SAMPLE + - Processing + - Dehosting + - Trimmed + - FastP + - Seqtk + - MashSketches + - Quality + - RawReadQuality + - Trimmed + - FastP + - MashScreen diff --git a/docs/subworkflows/determine_species.md b/docs/subworkflows/determine_species.md index c77ebfda..363a31ea 100644 --- a/docs/subworkflows/determine_species.md +++ b/docs/subworkflows/determine_species.md @@ -1,17 +1,19 @@ -# Determine Species - -## subworkflows/local/determine_species - -## Steps -1. 
**Taxonomic classification** is completed using [Mash](https://github.com/marbl/Mash) (DEFAULT), [mash_screen.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/mash_screen.nf), or [Kraken2](https://github.com/DerrickWood/kraken2) (OPTIONAL, or when samples are flagged metagenomic), [kraken.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/kraken.nf). Species classification and subsequent subtyping can be skipped by passing `--skip_species_classification true` on the command line. To select Kraken2 for speciation rather than mash you add `--run_kraken true` to your command line arguments. - ->NOTE: ->If species specific subtyping tools are to be executed by the pipeline, **Mash must be the chosen classifier** - -## Input -- metadata -- assembled contigs - -## Output -- Mash/Kraken2 report -- software versions +# Determine Species + +## subworkflows/local/determine_species + +## Steps +1. **Taxonomic classification** is completed using [Mash](https://github.com/marbl/Mash) (DEFAULT; [mash_screen.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/mash_screen.nf)), or [Kraken2](https://github.com/DerrickWood/kraken2) (OPTIONAL, or when samples are flagged metagenomic; [kraken.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/kraken.nf)). Species classification and subsequent subtyping can be skipped by passing `--skip_species_classification true` on the command line. To select Kraken2 for speciation rather than mash add `--run_kraken true` to your command line arguments. 
+ +>IMPORTANT: +>If species specific subtyping tools are to be executed by the pipeline **Mash must be the chosen classifier** + +## Input +- Contig file (fasta) from the `FinalAssembly` dir + - This is the final contig file from the last step in the CleanAssemble workflow (taking into account any skip flags that have been used) + +## Output +- Assembly + - PostProcessing + - Speciation + - MashScreen diff --git a/docs/subworkflows/genomes_annotate.md b/docs/subworkflows/genomes_annotate.md index 1ba31ca8..840da8d1 100644 --- a/docs/subworkflows/genomes_annotate.md +++ b/docs/subworkflows/genomes_annotate.md @@ -1,37 +1,41 @@ -# Genome Annotation - -## subworflows/local/annotate_genomes - -## Steps -1. **Genome annotation** is performed using [Bakta](https://github.com/oschwengers/bakta) within [bakta_annotate.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bakta_annotate.nf) - - - You must download a Bakta database and add its path to the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) file or add its path as a command line option - - To skip running Bakta add `--skip_bakta true` to your command line options. - -2. **Screening for antimicrobial resistance** [Abricate](https://github.com/tseemann/abricate) is used with the default options and default database, however you can specify a database by updating the `args` in the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) for Abricate. - - - You can skip running Abricate by adding `--skip_abricate true` to your command line options. - -3. **Screening for plasmids** is performed using [Mob-suite](https://github.com/phac-nml/mob-suite) with default options. - -4. **Selection of Pointfindr database**. This step is only ran if running [StarAMR](https://github.com/phac-nml/staramr). It will try and select the correct database based on the species identified earlier in the pipeline. 
If a database cannot be determined pointfinder will simply not be run. - -5. **Exporting of StarAMR databases used**. To provide a method of user validation for automatic database selection, the database info from StarAMR will be exported from the pipeline into the file `StarAMRDBVersions.txt` and placed in the StarAMR directory. - -6. **Screening for antimicrobial resistance** with **StarAMR**. [StarAMR](https://github.com/phac-nml/staramr) is provided as an additional option to screen for antimicrobial resistance in ResFinder, PointFinder and PlasmidFinder databases. Passing in a database is optional as the one within the container will be used by default. - - You can skip running StarAMR by adding the following flag `--skip_starmar` - ->NOTE: ->A custom database for Bakta can be downloaded via the commandline using `bakta_download_db.nf`. ->The `bakta_db` setting can be changed in the `nextflow.config` file, see [bakta](/usage/tool_params/#bakta) - -## Input -- contigs -- metadata - -## Output -- Bakta outputs -- abricate outputs -- mob-suite outputs -- starAMR outputs -- software versions +# Genome Annotation + +## subworkflows/local/annotate_genomes + +## Steps +1. **Genome annotation** is performed using [Bakta](https://github.com/oschwengers/bakta) within [bakta_annotate.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bakta_annotate.nf) + + - You must download a Bakta database and add its path to the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) file or add its path as a command line option + - To skip running Bakta add `--skip_bakta true` to your command line options. + +2. **Screening for antimicrobial resistance** [Abricate](https://github.com/tseemann/abricate) is used with the default options and default database, however you can specify a database by updating the `args` in the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) for Abricate. 
+
+   - You can skip running Abricate by adding `--skip_abricate true` to your command line options.
+
+3. **Screening for plasmids** is performed using [Mob-suite](https://github.com/phac-nml/mob-suite) with default options.
+
+4. **Selection of PointFinder database**. This step is only run if running [StarAMR](https://github.com/phac-nml/staramr). It will try and select the correct database based on the species identified earlier in the pipeline. If a database cannot be determined pointfinder will simply not be run.
+
+5. **Exporting of StarAMR databases used**. To provide a method of user validation for automatic database selection, the database info from StarAMR will be exported from the pipeline into the file `StarAMRDBVersions.txt` and placed in the StarAMR directory.
+
+6. **Screening for antimicrobial resistance** with **StarAMR**. [StarAMR](https://github.com/phac-nml/staramr) is provided as an additional option to screen for antimicrobial resistance in ResFinder, PointFinder and PlasmidFinder databases. Passing in a database is optional as the one within the container will be used by default.
+   - You can skip running StarAMR by adding the following flag `--skip_staramr`
+
+>NOTE:
+>A custom database for Bakta can be downloaded via the commandline using `bakta_download_db.nf`. 
+>The `bakta_db` setting can be changed in the `nextflow.config` file, see [bakta](/usage/tool_params/#bakta) + +## Input +- Contig file (fasta) from the `FinalAssembly` dir + - This is the final contig file from the last step in the CleanAssemble workflow (taking into account any skip flags that have been used) +- metadata from prior tools + +## Output +- Assembly + - Annotation + - Abricate + - Mobsuite + - recon + - SAMPLE + - StarAMR + - SAMPLE diff --git a/docs/subworkflows/hybrid_assembly.md b/docs/subworkflows/hybrid_assembly.md index e8c7c8a1..cb53f74b 100644 --- a/docs/subworkflows/hybrid_assembly.md +++ b/docs/subworkflows/hybrid_assembly.md @@ -1,23 +1,39 @@ -# Hybrid Assembly - -## subworkflows/local/hybrid_assembly - -## Choice of 2 workflows -1. **DEFAULT** - A. [Flye](https://github.com/fenderglass/Flye) assembly [flye_assembly.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/flye_assemble.nf) - B. [Bandage](https://rrwick.github.io/Bandage/) creates a bandage plot of the assembly [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf) - C. [Minimap2](https://github.com/lh3/minimap2) creates an index of the contigs (minimap2_index.nf), then maps long reads to this index [minimap2_map.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_map.nf) - D. [Racon](https://github.com/isovic/racon) uses the short reads to iteratively polish contigs [pilon_iter.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/pilon_polisher.nf) -2. **OPTIONAL** - A. [Unicycler](https://github.com/rrwick/Unicycler) assembly [unicycler_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/unicycler_assemble.nf) - B. 
[Bandage](https://rrwick.github.io/Bandage/) creates a bandage plot of the assembly [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf) - -## Input -- metadata -- short reads -- long reads - -## Output -- contigs (pilon, unicycler) -- vcf data (pilon) -- software versions +# Hybrid Assembly + +## subworkflows/local/hybrid_assembly + +## Choice of 2 workflows +1. **DEFAULT** + A. [Flye](https://github.com/fenderglass/Flye) assembly [flye_assembly.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/flye_assemble.nf) + B. [Bandage](https://rrwick.github.io/Bandage/) creates a bandage plot of the assembly [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf) + C. [Minimap2](https://github.com/lh3/minimap2) creates an index of the contigs (minimap2_index.nf), then maps long reads to this index [minimap2_map.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/minimap2_map.nf) + D. [Racon](https://github.com/isovic/racon) uses the short reads to iteratively polish contigs [pilon_iter.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/pilon_polisher.nf) +2. **OPTIONAL** + A. [Unicycler](https://github.com/rrwick/Unicycler) assembly [unicycler_assemble.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/unicycler_assemble.nf) + B. 
[Bandage](https://rrwick.github.io/Bandage/) creates a bandage plot of the assembly [bandage_image.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bandage_image.nf) + +## Input +- Next generation sequencing reads: + + Short read + * Illumina + + Long read: + * Nanopore + * Pacbio +- User submitted sample sheet + +## Output +- Assembly + - Assembling + - Bandage + - ConsensusGeneration + - Polishing + - Pilon + - BAMs + - Changes + - Fasta + - VCF + - Racon + - Consensus + - Unicycler + - FinalAssembly + - SAMPLE diff --git a/docs/subworkflows/input_check.md b/docs/subworkflows/input_check.md index 3e607c37..60c6bf05 100644 --- a/docs/subworkflows/input_check.md +++ b/docs/subworkflows/input_check.md @@ -11,7 +11,7 @@ 3. If there are samples that have duplicate ID's the **samples will be combined**. ## Input -- CSV formatted sample sheet +- User-submitted CSV formatted sample sheet ## Outputs - A channel of reads and their associated tags diff --git a/docs/subworkflows/polish_assemblies.md b/docs/subworkflows/polish_assemblies.md index c0e1f714..7c849645 100644 --- a/docs/subworkflows/polish_assemblies.md +++ b/docs/subworkflows/polish_assemblies.md @@ -11,9 +11,20 @@ - **Pacbio** No addtional polishing is performed, outputs of Pacbio data still need to be tested. 
## Input -- cleaned reads -- Assembly +- cleaned reads (`fastq`) from the `FinalReads` dir + - This is the final reads file from the last step in the `Clean Reads` workflow (taking into account any skip flags that have been used) +- Contig file (`fasta`) from the `FinalAssembly` dir + - This is the final contig file from the last step in the CleanAssemble workflow (taking into account any skip flags that have been used) ## Outputs -- Polished assemblies -- Reads used to polish +- Assembly + - Assembling + - ConsensusGeneration + - Polishing + - Pilon + - BAMs + - Changes + - Fasta + - VCF + - Racon + - Consensus diff --git a/docs/subworkflows/qc_assembly.md b/docs/subworkflows/qc_assembly.md index 7bea75bf..62b2b29e 100644 --- a/docs/subworkflows/qc_assembly.md +++ b/docs/subworkflows/qc_assembly.md @@ -19,9 +19,21 @@ ## Input -- cleaned reads with tags -- polished contigs with tags +- cleaned reads (`fastq`) from the `FinalReads` dir + - This is the final reads file from the last step in the `Clean Reads` workflow (taking into account any skip flags that have been used) +- Contig file (`fasta`) from the `FinalAssembly` dir + - This is the final contig file from the last step in the CleanAssemble workflow (taking into account any skip flags that have been used) ## Outputs -- filtered contigs -- software versions +- Assembly + - Quality + - CheckM + - SAMPLE + - bins + - storage + - tree + - Quast + - SAMPLE +- Subtyping + - SevenGeneMLST + - mlst diff --git a/docs/subworkflows/subtype_genome.md b/docs/subworkflows/subtype_genome.md index 118a7f1a..39a20daf 100644 --- a/docs/subworkflows/subtype_genome.md +++ b/docs/subworkflows/subtype_genome.md @@ -12,8 +12,14 @@ > If a sample cannot be subtyped, it merely passes through the pipeline and is not typed. A log message will instead be displayed notifying the user the sample cannot be typed. 
## Input -- contigs and associated tags -- Mash report +- Contig file (fasta) from the `FinalAssembly` dir + - This is the final contig file from the last step in the `CleanAssemble` workflow (taking into account any skip flags that have been used) +- Mash report from assembly speciation step in the `CleanAssemble` workflow ## Output -- software versions +- Subtyping + - ECTyper + - SAMPLE + - SevenGeneMLST + - SISTR + - Etc... diff --git a/docs/troubleshooting/FAQ.md b/docs/troubleshooting/FAQ.md index a60b4969..98db5134 100644 --- a/docs/troubleshooting/FAQ.md +++ b/docs/troubleshooting/FAQ.md @@ -1,84 +1,90 @@ -# FAQ - -## How is variable type determined from command line parameters? - -In a situation where you are developing the pipeline or finding that the parameter passed on the command line is not working as expected, for example, example: the user wants a sample to have at least 1000 reads before going for assembly (`--min_reads 1000`) and samples with less than 1000 reads are passing onto the assembly step. - -The way a variable type is determined from the command line can be found in the below [groovy code](https://github.com/nextflow-io/nextflow/blob/8c0566fc3a35c8d3a4e01a508a0667e471bab297/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy#L506-L518). 
The snippet is also pasted below and is up to date as of 2023-10-16: - -``` - static protected parseParamValue(String str ) { - - if ( str == null ) return null - - if ( str.toLowerCase() == 'true') return Boolean.TRUE - if ( str.toLowerCase() == 'false' ) return Boolean.FALSE - - if ( str==~/\d+(\.\d+)?/ && str.isInteger() ) return str.toInteger() - if ( str==~/\d+(\.\d+)?/ && str.isLong() ) return str.toLong() - if ( str==~/\d+(\.\d+)?/ && str.isDouble() ) return str.toDouble() - - return str - } -``` - -# Troubleshooting - -## Common errors and how to (maybe) fix them - -### null errors, or report generation failing on line 701 - -Currently there are compatibility issues between version 22 and 23.10.0 of nextflow with regards to parsing the `nextflow.config` file. I am currently working on addressing them now. if you happen to encounter issues please downgrade your nextflow install to 22.10.1. - -### Permission denied on a python script (`bin/some_script.py`) - -On some installs, a lack of permissions for python scripts are causing this error to occur. The easiest way to solve this issue is to execute `chmod +x bin/*.py` in the mikrokondo installation directory. This will add execution permissions to all of the scripts, if this solution does not work then please submit an issue. - -### Random issues containing on resume `org.iq80.leveldb.impl.Version.retain()` - -Sometimes the resume features of Nextflow don't work completely. The above error string typically implies that some output could not be gathered from a process and on subsequent resumes you will get an error. You can find out what process (and its work directory location) caused the error in the `nextflow.log` (normally it will be at the top of some long traceback in the log), and a work directory will be specified listing the directory causing the error. Delete this directory and resume the pipeline. 
**If you hate logs and you don't care about resuming** you can simply delete the work directory entirely. - - -### StarAMR - -- Exit code 1, and an error involving ` stderr=FASTA-Reader: Ignoring invalid residues at position(s):` - - This is likely not a problem with your data but with your databases, following the instructions listed [here](https://github.com/phac-nml/staramr/issues/200#issuecomment-1741082733) should fix the issue. - - The command to download the proper databases mentioned in the issue is listed here: - `staramr db build --dir staramr_databases --resfinder-commit fa32d9a3cf0c12ec70ca4e90c45c0d590ee810bd --pointfinder-commit 8c694b9f336153e6d618b897b3b4930961521eb8 --plasmidfinder-commit c18e08c17a5988d4f075fc1171636e47546a323d` - - **Passing in a database is optional as the one within the container will be used by default.** - - If you continue to have problems with StarAMR you can skip it using `--skip_staramr` - - -### Common mash errors - -- Mash exit code 139 or 255, you may see `org.iq80.leveldb.impl.Version.retain()` appearing on screen as well. - - This indicates a segmentation fault, due to mash failing or alternatively some resource not being available. If you see that mash has run properly in the work directory output but Nextflow is saying the process failed and the `versions.yml` file is missing you likely have encountered some resource limit on your system. A simple solution is likely to reduce the number of `maxForks` available to the different Mash processes in the `conf/modules.config` file. Alternatively you may need to alter the number in some Nextflow environment variables e.g. `OMP_NUM_THREADS`, `USE_SIMPLE_THREADED_LEVEL3` and `OPENBLAS_NUM_THREADS`. - -### Common spades issues - -- Spades exit code 21 - - One potential cause of this issue (requires looking at the log files) is due to not enough reads being present. 
You can avoid samples with too few reads going to assembly by adjusting the `min_reads` parameter in the `nextflow.config`. It can also be adjusted from the command line with the flag `--min_reads 1000` - -- Spades exit code 245 - - This could be due to multiple issues and typically results from a segmentation fault (OS Code 11). Try increasing the amount of memory spades is alloted ([base.config](https://github.com/phac-nml/mikrokondo/blob/main/conf/base.config)) if the issue persists try using a different Spades container/ create an issue. - -### Common Kraken2 issues - -- Kraken2 exit code 2 - - It is still a good idea to look at the output logs to verify your issue as they may say something like: `kraken2: database ("./kraken2_database") does not contain necessary file taxo.k2d` despite the taxo.k2d file being present. This is potentially caused by symlink issues, and one possible fix is to provide the absolute path to your Kraken2 database in the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) or from the command line `--kraken.db /PATH/TO/DB` - - -### Common Docker issues - -- Exit code 137: - - Exit code 137, likely means your docker container used to much memory. You can adjust how much memory each process gets in the [base.config](https://github.com/phac-nml/mikrokondo/blob/main/conf/base.config) file, however there may be some underlying configuration you need to perform for Docker to solve this issue. - -### CheckM fails - -- CheckM exit code 1, could not find concatenated.tree or concatentated.pplacer.json - - This is a sign that CheckM has run out of memory, make sure you are using your desired executor. You may need to adjust configuration settings. 
-
-### QUAST fails with a read-only error
-
-- `[Errno 30] Read-only file system: '/usr/local/lib/python3.9/site-packages/quast_libs/gridss'`
-  - This issue appears to be related to QUAST trying to download GRIDSS for structural variant detection and this action being incompatible with the container used to run QUAST. You may be able to resolve this be adding `--no-sv` as a QUAST command-line flag in Mikrokondo's `nextflow.config`, or by switching your container platform to singularity. Errors were observed with `apptainer version 1.2.3`, which were resolved by switching to singularity (`singularity-ce version 3.9.5` and `singularity-ce version a948062` resolved the issue).
+# FAQ
+
+## How is variable type determined from command line parameters?
+
+In a situation where you are developing the pipeline or finding that the parameter passed on the command line is not working as expected, for example: the user wants a sample to have at least 1000 reads before going for assembly (`--min_reads 1000`) and samples with less than 1000 reads are passing onto the assembly step.
+
+The way a variable type is determined from the command line can be found in the below [groovy code](https://github.com/nextflow-io/nextflow/blob/8c0566fc3a35c8d3a4e01a508a0667e471bab297/modules/nextflow/src/main/groovy/nextflow/cli/CmdRun.groovy#L506-L518). 
The snippet is also pasted below and is up to date as of 2023-10-16: + +``` + static protected parseParamValue(String str ) { + + if ( str == null ) return null + + if ( str.toLowerCase() == 'true') return Boolean.TRUE + if ( str.toLowerCase() == 'false' ) return Boolean.FALSE + + if ( str==~/\d+(\.\d+)?/ && str.isInteger() ) return str.toInteger() + if ( str==~/\d+(\.\d+)?/ && str.isLong() ) return str.toLong() + if ( str==~/\d+(\.\d+)?/ && str.isDouble() ) return str.toDouble() + + return str + } +``` + +# Troubleshooting + +## Common errors and how to (maybe) fix them + +### null errors, or report generation failing on line 701 + +Currently there are compatibility issues between version 22 and 23.10.0 of nextflow with regards to parsing the `nextflow.config` file. I am currently working on addressing them now. if you happen to encounter issues please downgrade your nextflow install to 22.10.1. + +### Permission denied on a python script (`bin/some_script.py`) + +On some installs, a lack of permissions for python scripts are causing this error to occur. The easiest way to solve this issue is to execute `chmod +x bin/*.py` in the mikrokondo installation directory. This will add execution permissions to all of the scripts, if this solution does not work then please submit an issue. + +### Random issues containing on resume `org.iq80.leveldb.impl.Version.retain()` + +Sometimes the resume features of Nextflow don't work completely. The above error string typically implies that some output could not be gathered from a process and on subsequent resumes you will get an error. You can find out what process (and its work directory location) caused the error in the `nextflow.log` (normally it will be at the top of some long traceback in the log), and a work directory will be specified listing the directory causing the error. Delete this directory and resume the pipeline. 
**If you hate logs and you don't care about resuming** you can simply delete the work directory entirely. + + +### StarAMR + +- Exit code 1, and an error involving ` stderr=FASTA-Reader: Ignoring invalid residues at position(s):` + - This is likely not a problem with your data but with your databases, following the instructions listed [here](https://github.com/phac-nml/staramr/issues/200#issuecomment-1741082733) should fix the issue. + - The command to download the proper databases mentioned in the issue is listed here: + `staramr db build --dir staramr_databases --resfinder-commit fa32d9a3cf0c12ec70ca4e90c45c0d590ee810bd --pointfinder-commit 8c694b9f336153e6d618b897b3b4930961521eb8 --plasmidfinder-commit c18e08c17a5988d4f075fc1171636e47546a323d` + - **Passing in a database is optional as the one within the container will be used by default.** + - If you continue to have problems with StarAMR you can skip it using `--skip_staramr` + + +### Common mash errors + +- Mash exit code 139 or 255, you may see `org.iq80.leveldb.impl.Version.retain()` appearing on screen as well. + - This indicates a segmentation fault, due to mash failing or alternatively some resource not being available. If you see that mash has run properly in the work directory output but Nextflow is saying the process failed and the `versions.yml` file is missing you likely have encountered some resource limit on your system. A simple solution is likely to reduce the number of `maxForks` available to the different Mash processes in the `conf/modules.config` file. Alternatively you may need to alter the number in some Nextflow environment variables e.g. `OMP_NUM_THREADS`, `USE_SIMPLE_THREADED_LEVEL3` and `OPENBLAS_NUM_THREADS`. + +### Common spades issues + +- Spades exit code 21 + - One potential cause of this issue (requires looking at the log files) is due to not enough reads being present. 
You can avoid samples with too few reads going to assembly by adjusting the `min_reads` parameter in the `nextflow.config`. It can also be adjusted from the command line with the flag `--min_reads 1000`
+
+- Spades exit code 245
+  - This could be due to multiple issues and typically results from a segmentation fault (OS Code 11). Try increasing the amount of memory Spades is allotted ([base.config](https://github.com/phac-nml/mikrokondo/blob/main/conf/base.config)); if the issue persists, try using a different Spades container or create an issue.
+
+### Common Kraken2 issues
+
+- Kraken2 exit code 2
+  - It is still a good idea to look at the output logs to verify your issue as they may say something like: `kraken2: database ("./kraken2_database") does not contain necessary file taxo.k2d` despite the taxo.k2d file being present. This is potentially caused by symlink issues, and one possible fix is to provide the absolute path to your Kraken2 database in the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) or from the command line `--kraken2_db /PATH/TO/DB`
+
+
+### Common Docker issues
+
+- Exit code 137:
+  - Exit code 137 likely means your Docker container used too much memory. You can adjust how much memory each process gets in the [base.config](https://github.com/phac-nml/mikrokondo/blob/main/conf/base.config) file, however there may be some underlying configuration you need to perform for Docker to solve this issue.
+
+### CheckM fails
+
+- CheckM exit code 1, could not find concatenated.tree or concatenated.pplacer.json
+  - This is a sign that CheckM has run out of memory; make sure you are using your desired executor. You may need to adjust configuration settings.
+
+### QUAST fails with a read-only error
+
+- `[Errno 30] Read-only file system: '/usr/local/lib/python3.9/site-packages/quast_libs/gridss'`
+  - This issue appears to be related to QUAST trying to download GRIDSS for structural variant detection and this action being incompatible with the container used to run QUAST. You may be able to resolve this by adding `--no-sv` as a QUAST command-line flag in Mikrokondo's `nextflow.config`, or by switching your container platform to singularity. Errors were observed with `apptainer version 1.2.3`, which were resolved by switching to singularity (`singularity-ce version 3.9.5` and `singularity-ce version a948062` resolved the issue).
+
+### ECTyper fails
+
+- ECTyper makes use of Python's temporary files which can result in issues on shared file systems e.g. `ntfs`. If you encounter issues, try running the pipeline in a place where read and write permissions are more relaxed.
+
+
diff --git a/docs/usage/.pages b/docs/usage/.pages
index e4bd5925..4b68091a 100644
--- a/docs/usage/.pages
+++ b/docs/usage/.pages
@@ -1,7 +1,7 @@
-nav:
-  - installation.md
-  - useage.md
-  - examples.md
-  - Utilities.md
-  - configuration.md
-  - tool_params.md
\ No newline at end of file
+nav:
+  - installation.md
+  - usage.md
+  - examples.md
+  - Utilities.md
+  - configuration.md
+  - tool_params.md
diff --git a/docs/usage/examples.md b/docs/usage/examples.md
index 01bd0383..dd73a7a9 100644
--- a/docs/usage/examples.md
+++ b/docs/usage/examples.md
@@ -1,34 +1,34 @@
-# Command Line Examples
-
-Some example commands of running mikrokondo are provided below:
-
-## Running paired-end illumina data skipping Bakta
-`nextflow run main.nf --input sample_sheet.csv --skip_bakta true --platform illumina --outdir ../test_illumina -profile singularity -resume`
-
-The above command would run paired-end Illumina data, using Singulairty as a container service, using resume (e.g if picks up where the pipeline left off if being run again), skipping Bakta and outputting
results in a folder called `test_illumina` one directory back from where the pipeline is run. **Note: your sample sheet does not need to be called sample_sheet.csv** - -## Running paired-end illumina data using Kraken2 for classifying the top species hit - -`nextflow run main.nf --input sample_sheet.csv --skip_bakta true --run_kraken true --platform illumina --outdir ../test_illumina_kraken -profile singularity -resume` - -The above command would run paired-end Illumina data, using Singulairty as a container service, using resume (e.g if picks up where the pipeline left off if being run again), skipping Bakta, using kraken2 to classify the species top hit and outputting results in a folder called `test_illumina_kraken` one directory back from where the pipeline is run. **Note: your sample sheet does not need to be called sample_sheet.csv** - -## Running nanopore data -`nextflow run main.nf --input sample_sheet.csv --skip_ont_header_cleaning true --nanopore_chemistry r941_min_hac_g507 --platform nanopore --outdir ../test_nanopore -profile docker -resume` - -The above command would run single-end Nanopore data using Docker as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_nanopore` and skipping the process of verifying all Nanopore fastq data headers are unique. **Note: your sample sheet does not need to be called sample_sheet.csv** - -## Running a hybrid assembly using Unicycler -`nextflow run main.nf --input sample_sheet.csv --hybrid_unicycler true --nanopore_chemistry r941_min_hac_g507 --platform hybrid --outdir ../test_hybrid -profile apptainer -resume` - -The above command would run single-end Nanopore and paired-end Illumina data using apptainer as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_hybrid` and using Unicycler for assembly. 
**Note: your sample sheet does not need to be called sample_sheet.csv** - -## Running a hybrid assembly without Unicycler -`nextflow run main.nf --input sample_sheet.csv --platform hybrid --outdir ../test_hybrid -profile singularity -resume` - -The above command would run single-end Nanopore and paired-end Illumina data using singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_hybrid`. **Note: your sample sheet does not need to be called sample_sheet.csv** - -## Running metagenomic Nanopore data -`nextflow run main.nf --skip_depth_sampling true --input sample_sheet.csv --skip_polishing true --skip_bakta true --metagenomic_run true --nanopore_chemistry r941_prom_hac_g507 --platform nanopore --outdir ../test_nanopore_meta -profile singularity -resume` - -The above command would run single-end Nanopore and paired-end Illumina data using singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder call `../test_nanopore_meta`, all samples would be labeled treated as metagenomic, assembly polishing would be turned off and annotation of assemblies with Bakta would not be performed, depth sampling would not be performed either. 
**Note: your sample sheet does not need to be called sample_sheet.csv** \ No newline at end of file +# Command Line Examples + +Some example commands of running mikrokondo are provided below: + +## Running paired-end illumina data skipping Bakta +`nextflow run main.nf --input sample_sheet.csv --skip_bakta true --platform illumina --outdir ../test_illumina -profile singularity -resume` + +The above command would run paired-end Illumina data, using Singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), skipping Bakta and outputting results in a folder called `test_illumina` one directory back from where the pipeline is run. **Note: your sample sheet does not need to be called sample_sheet.csv** + +## Running paired-end illumina data using Kraken2 for classifying the top species hit + +`nextflow run main.nf --input sample_sheet.csv --skip_bakta true --run_kraken true --platform illumina --outdir ../test_illumina_kraken -profile singularity -resume` + +The above command would run paired-end Illumina data, using Singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), skipping Bakta, using kraken2 to classify the species top hit and outputting results in a folder called `test_illumina_kraken` one directory back from where the pipeline is run. **Note: your sample sheet does not need to be called sample_sheet.csv** + +## Running nanopore data +`nextflow run main.nf --input sample_sheet.csv --skip_ont_header_cleaning true --nanopore_chemistry r941_min_hac_g507 --platform nanopore --outdir ../test_nanopore -profile docker -resume` + +The above command would run single-end Nanopore data using Docker as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_nanopore` and skipping the process of verifying all Nanopore fastq data headers are unique. 
**Note: your sample sheet does not need to be called sample_sheet.csv** + +## Running a hybrid assembly using Unicycler +`nextflow run main.nf --input sample_sheet.csv --hybrid_unicycler true --nanopore_chemistry r941_min_hac_g507 --platform hybrid --outdir ../test_hybrid -profile apptainer -resume` + +The above command would run single-end Nanopore and paired-end Illumina data using apptainer as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_hybrid` and using Unicycler for assembly. **Note: your sample sheet does not need to be called sample_sheet.csv** + +## Running a hybrid assembly without Unicycler +`nextflow run main.nf --input sample_sheet.csv --platform hybrid --outdir ../test_hybrid -profile singularity -resume` + +The above command would run single-end Nanopore and paired-end Illumina data using singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder called `../test_hybrid`. **Note: your sample sheet does not need to be called sample_sheet.csv** + +## Running metagenomic Nanopore data +`nextflow run main.nf --skip_depth_sampling true --input sample_sheet.csv --skip_polishing true --skip_bakta true --metagenomic_run true --nanopore_chemistry r941_prom_hac_g507 --platform nanopore --outdir ../test_nanopore_meta -profile singularity -resume` + +The above command would run single-end Nanopore and paired-end Illumina data using singularity as a container service, using resume (e.g if picks up where the pipeline left off if being run again), outputting data into a folder call `../test_nanopore_meta`, all samples would be labeled treated as metagenomic, assembly polishing would be turned off and annotation of assemblies with Bakta would not be performed, depth sampling would not be performed either. 
**Note: your sample sheet does not need to be called sample_sheet.csv** diff --git a/docs/usage/tool_params.md b/docs/usage/tool_params.md index 59b8f668..b9999ef6 100644 --- a/docs/usage/tool_params.md +++ b/docs/usage/tool_params.md @@ -18,7 +18,7 @@ Screens contigs for antimicrobial and virulence genes. If you wish to use a diff - report_tag: determines the name of the Abricate output in the final summary file. **Do no touch this unless doing pipeline development.** - header_p: This field tells the report module that the Abricate output contains headers. **Do no touch this unless doing pipeline development.** -### Raw Read Metrics +### Raw Read Metrics A custom Python script that gathers quality metrics for each fastq file. - raw_reads @@ -64,13 +64,18 @@ Fastp is fast and widely used program for gathering of read quality metrics, ada - html_ext: Extension of the html report output by fastp, do no touch unless doing pipeline development. - json_ext: Extension of json report output by FastP do not touch unless doing pipeline development. - report_tag: Title of FastP data in the summary report. - - **average_quality_e**: If a read/read-pair quality is less than this value it is discarded - - **cut_mean_quality**: The quality to trim reads too - - **qualified_quality_phred**: the quality of a base to be qualified if filtering by unqualified bases - - **unqualified_percent_limit**: The percent amount of bases that are allowed to be unqualified in a read. This parameter is affected by the above qualified_quality_phred parameter - - **illumina_length_min**: The minimum read length to be allowed in illumina data - - **single_end_length_min**: the minimum read length allowed in Pacbio or Nanopore data - - **dedup_reads**: A parameter to be turned on to allow for deduplication of reads. + - **average_quality_e**: If a read/read-pair quality is less than this value it is discarded. Can be set from the command line with `--fp_average_quality`. 
+ - **cut_tail_mean_quality**: The mean quality threshold for a sliding window below which trailing bases are trimmed from the reads. Can be set from the command line with `--fp_cut_tail_mean_quality` + - **cut_tail_window_size**: The window size to cut a tail with. Can be set from the command line with `--fp_cut_tail_window_size`. + - **complexity_threshold**: the threshold for low complexity filter. Can be set from the command line with `--fp_complexity_threshold`. + - **qualified_quality_phred**: the quality of a base to be qualified if filtering by unqualified bases. Can be set from the command line with `--fp_qualified_phred`. + - **unqualified_percent_limit**: The percent amount of bases that are allowed to be unqualified in a read. This parameter is affected by the above qualified_quality_phred parameter and can be specified from the command line with `--fp_unqualified_percent_limit`. + - **polyg_min_len**: The minimum length to detect a polyG tail. This value can be set from the command line with `--fp_polyg_min_len`. + - **polyx_min_len**: The minimum length to detect a polyX tail. This value can be set from the command line with `--fp_polyx_min_len`. + - **illumina_length_min**: The minimum read length to be allowed in illumina data. This value can be set from the command line with `--fp_illumina_length_min`. + - **illumina_length_max**: The maximum read length allowed for illumina data. This value can be set from the command line with `--fp_illumina_length_max`. + - **single_end_length_min**: the minimum read length allowed in Pacbio or Nanopore data. This value can be set from the command line with `--fp_single_end_length_min`. + - **dedup_reads**: A parameter to be turned on to allow for deduplication of reads. This value can be set from the command line with `--fp_dedup_reads`. 
- **illumina_args**: The command string passed to Fastp when using illumina data, if you override this parameter other set parameters such as average_quality_e must be overridden as well as the command string will be passed to FastP as written - **single_end_args**: The command string passed to FastP if single end data is used e.g. Pacbio or Nanopore data. If this option is overridden you must specify all parameters passed to Fastp as this string is passed to FastP as written. - report_exclude_fields: Fields in the summary json to be excluded from the final aggregated report. Do not alter this field unless doing pipeline development @@ -130,7 +135,7 @@ Quast is used to gather assembly metrics which automated quality control criteri - suffix: The suffix attached to quast outputs. Do not alter this field unless doing pipeline development. - report_base: The base term for output quast files to be used in reporting. Do not alter this field unless doing pipeline development. - report_prefix: The prefix of the quast outputs to be used in reporting. Do not alter this field unless doing pipeline development. - - **min_contig_length**: The minimum length of for contigs to be used in quasts generation of metrics. Do not alter this field unless doing pipeline development. + - **min_contig_length**: The minimum length of for contigs to be used in quasts generation of metrics. Do not alter this field unless doing pipeline development. This argument can be set from the command line with `--qt_min_contig_length`. - **args**: A command string to past to quast, altering this is unadvised as certain options may affect your reporting output. This string will be passed to quast verbatim. Do not alter this field unless doing pipeline development. - header_p: This tells the pipeline that the Quast report outputs contains a header. Do not alter this field unless doing pipeline development. 
@@ -167,7 +172,7 @@ Kraken2 can be used a substitute for mash in speciation of samples, and it is us - singularity: Singularity container for the Kraken2. - docker: Docker container for Kraken2. - classified_suffix: Suffix for classified data from Kraken2. Do not alter this field unless doing pipeline development. - - unclassified_suffix: Suffic for unclassified data from Kraken2. Do not alter this field unless doing pipeline development. + - unclassified_suffix: Suffix for unclassified data from Kraken2. Do not alter this field unless doing pipeline development. - report_suffix: The name of the report output by Kraken2. - output_suffix: The name of the output file from Kraken2. Do not alter this field unless doing pipeline development. - **tophit_level**: The taxonomic level to classify a sample at. e.g. default is `S` for species but you could use `S1` or `F`. @@ -179,7 +184,7 @@ Kraken2 can be used a substitute for mash in speciation of samples, and it is us - headers: A list of headers in the Kraken2 report. Do not alter this field unless doing pipeline development. ### Seven Gene MLST -Run Torstein Tseemans seven gene MLST program. +Run Torsten Seemann's seven gene MLST program. - mlst - singularity: Singularity container for mlst. @@ -202,7 +207,7 @@ Mash is used repeatedly througout the pipeline for estimation of genome size fro - sketch_ext: File extension of a mash sketch. Do not alter this field unless doing pipeline development. - json_ext: File extension of json data output by Mash. Do not alter this field unless doing pipeline development. - sketch_kmer_size: The size of the kmers used in the sketching in genome size estimation. - - **min_kmer**: The minimum number of kmer copies required to pass the noise filter. this value is used in estimation of genome size from reads. The default value is 10 as it seems to work well for Illumina data. + - **min_kmer**: The minimum number of kmer copies required to pass the noise filter. 
this value is used in estimation of genome size from reads. The default value is 10 as it seems to work well for Illumina data. This value can be set from the command line by setting `--mh_min_kmer`. - final_sketch_name: **to be removed** This parameter was originally part of a subworkflow included in the pipeline for generation of the GTDB sketch. But this has been removed and replaced with scripting. - report_tag: Report tag for Mash in the summary report. Do not alter this field unless doing pipeline development. - header_p: Tells the pipeline if the output data contains headers. Do not alter this field unless doing pipeline development. @@ -338,7 +343,7 @@ StarAMR provides annotation of antimicrobial resistance genes within your data. - txt_ext: File extension of the text reports from StarAMR. Do not alter this field unless doing pipeline development. - xlsx_ext: File extension of the excel spread sheet from StarAMR. Do not alter this field unless doing pipeline development. - **args**: Additional arguments to pass to StarAMR. Do not alter this field unless doing pipeline development. - - point_finder_dbs: A list containing the valid databases StarAMR supports for pointfinder. The way they are structured matches what StarAMR needs for input. Do not alter this field unless doing pipeline development. Do not alter this field unless doing pipeline development. + - point_finder_dbs: A list containing the valid databases StarAMR supports for pointfinder. The way they are structured matches what StarAMR needs for input. Do not alter this field unless doing pipeline development. - report_tag: The field name of StarAMR in the final summary report. Do not alter this field unless doing pipeline development. - header_p: Indicates the final report from StarAMR contains a header line. Do not alter this field unless doing pipeline development. 
@@ -348,7 +353,7 @@ Bakta is used to provide annotation of genomes, it is very reliable but it can b - bakta - singularity: The singularity container containing Bakta. - docker: The Docker container containing Bakta. - - **db**: the path where the downloaded Bakta database should be downloaded. + - **db**: the path where the downloaded Bakta database should be downloaded. This can be set from the command line using the argument `--bakta_db`. - output_dir: The name of the folder where Bakta data is saved too. Do not alter this field unless doing pipeline development. - embl_ext: File extension of embl file. Do not alter this field unless doing pipeline development. - faa_ext: File extension of faa file. Do not alter this field unless doing pipeline development. @@ -361,7 +366,7 @@ Bakta is used to provide annotation of genomes, it is very reliable but it can b - hypotheticals_faa_ext: File extension of hypothetical genes fasta. Do not alter this field unless doing pipeline development. - tsv_ext: The file extension of the final bakta tsv report. Do not alter this field unless doing pipeline development. - txt_ext: The file extension of the txt report. Do not alter this field unless doing pipeline development. - - min_contig_length: The minimum contig length to be annotated by Bakta. + - min_contig_length: The minimum contig length to be annotated by Bakta. This can be set from the command line using the argument `--ba_min_contig_length`. ### Bandage Bandage is included to make bandage plots of the initial assemblies e.g. Spades, Flye or Unicycler. These images can be useful in determining the quality of an assembly. @@ -436,7 +441,7 @@ Performs typing of *Staphylococcus* species. - header_p: Denotes if the output table of Lissero contains a header. Do not alter this field unless doing pipeline development. ### Shigeifinder -*in-silico Shigella* typing. +*in-silico Shigella* typing. 
>**NOTE:** It is unlikely this subtyper will be triggered as GTDB has merged *E.coli* and *Shigella* in an updated sketch. An updated version of ECTyper will be released soon to address the shortfalls of this sketch. If you are relying on *Shigella* detection add `--run_kraken true` to your command line or update the value in the `.nextflow.config` as Kraken2 (while slower) can still detect *Shigella*. - shigeifinder diff --git a/docs/usage/useage.md b/docs/usage/usage.md similarity index 69% rename from docs/usage/useage.md rename to docs/usage/usage.md index 02c465be..32e2035f 100644 --- a/docs/usage/useage.md +++ b/docs/usage/usage.md @@ -1,11 +1,11 @@ # Running MikroKondo -## Useage +## Usage MikroKondo can be run like most other nextflow pipelines. The most basic usage is as follows: `nextflow run main.nf --input PATH_TO_SAMPLE_SHEET --outdir OUTPUT_DIR --platform SEQUENCING_PLATFORM -profile CONTAINER_TYPE` -Many parameters can be altered or accessed from the command line. For a full list of parameters to be altered please refer to the `nextflow.config` file in the repo. +Many parameters can be altered or accessed from the command line. For a full list of parameters to be altered please refer to the `nextflow.config` file in the repo. ## Input @@ -15,13 +15,13 @@ This pipeline requires the following as input: This pipeline requires sample files to be gzipped (symlinks may be problematic). ### Samplesheet (CSV) -Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. The sample-sheet can utilize the following header fields: +Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. 
The sample-sheet can utilize the following header fields: -- sample -- fastq_1 -- fastq_2 -- long_reads -- assembly +- sample +- fastq_1 +- fastq_2 +- long_reads +- assembly Example layouts for different sample-sheets include: @@ -50,6 +50,14 @@ _Starting with assembly only_ |------|--------| |sample_name|path_to_assembly| +_Example merging paired-end data_ + +|sample|fastq_1|fastq_2| +|------|-------|-------| +|my_sample|path_to_forward_reads_1|path_to_reversed_reads_1| +|my_sample|path_to_forward_reads_2|path_to_reversed_reads_2| + +*The value of `my_sample` is repeated twice allowing sample to be merged on the common key. This works for nanopore data, hybrid assembly data and assemblies* ## Command line arguments @@ -100,6 +108,51 @@ Numerous steps within mikrokondo can be turned off without compromising the stab - `--skip_starmar`: turn off starAMR AMR detection. - `--skip_subtyping`: to turn off automatic triggering of subtyping in the pipeline (useful when target organism does not have a subtyping tool installed within mikrokondo). - `--skip_version_gathering`: prevents the collation of tool versions. This process generally takes a couple minutes (at worst) but can be useful when during recurrent runs of the pipeline (like when testing settings). +- `--skip_report`: Prevents creation of final report summary report amalgamating outputs of all other files, this will also turnoff the creation of individual sub-reports. +- `--skip_metagenomic_detection`: Skips classification of sample as metagnomic and forces a sample to be analyzed as an isolate. +- `--skip_raw_read_metrics`: Prevents generation of raw read metrics, e.g. metrics generated about the reads before any trimming or filtering is performed. +- `--skip_mlst`: Skip seven gene MLST. + +#### Datasets +Different databases/pre-computed files are required for usage within mikrokondo. 
These can be downloaded or created by the user, and if not configured within the `nextflow.config` file they can be passed in as files with the following command-line arguments. + +- `--dehosting_idx`: The minimap2 index to be used for dehosting. +- `--mash_sketch`: The mash sketch to be used for contamination detection and speciation. +- `--bakta_db`: Bakta database for genome annotation. +- `--kraken2_db`: Kraken2 database that can be used for speciation and binning of meta-genomically assembled contigs. +- `--staramr_db`: An optional StarAMR database to be passed in, it is recommended to use the database packaged in the container. + +#### FastP Arguments +For simplicity parameters affecting FastP have been moved to the top level. Each argument matches one listed within the [FastP](https://phac-nml.github.io/mikrokondo/usage/tool_params/#fastp) usage section with only a `fp_` being appended to the front of the argument. For a more detailed description of what each argument does please review the tool specific parameters for [FastP](https://phac-nml.github.io/mikrokondo/usage/tool_params/#fastp) here. + +- `--fp_average_quality`: If a read/read-pair quality is less than this value it is discarded +- `--fp_cut_tail_mean_quality`: the quality threshold to trim reads to +- `--fp_cut_tail_window_size`: The window size to cut a tail with. +- `--fp_complexity_threshold`: The threshold for low complexity filter. +- `--fp_qualified_phred`: The quality of a base to be qualified if filtering by unqualified bases. +- `--fp_unqualified_percent_limit`: The percent amount of bases that are allowed to be unqualified in a read. +- `--fp_polyg_min_len`: The minimum length to detect a polyG tail. +- `--fp_polyx_min_len`: The minimum length to detect a polyX tail. +- `--fp_illumina_length_min`: The minimum read length to be allowed in illumina data. +- `--fp_illumina_length_max`: The maximum read length allowed for illumina data. 
+- `--fp_single_end_length_min`: the minimum read length allowed in Pacbio or Nanopore data. +- `--fp_dedup_reads`: A parameter to be turned on to allow for deduplication of reads. + +#### Bakta Parameters +Top level parameters that can be passed to Bakta. + +- `--ba_min_contig_length`: Minimum contig length to be analyzed by Bakta + +#### Quast Parameters +Top level parameters that can be passed to Quast. + +- `--qt_min_contig_length`: Minimum length of a contig to be analyzed within Quast. + +#### Mash parameters +Top level parameters to be passed to Mash. + +- `--mh_min_kmer`: The minimum time a kmer needs to appear to be used in genome size estimation by mash. + #### Containers @@ -119,7 +172,7 @@ Different container services can be specified from the command line when running - `--platform nanopore` for Nanopore. - `--platform pacbio` for Pacbio - `--platform hybrid` for hybrid assemblies. - > **Note:** when denoting your run as using a hybrid platform, you must also add in the long_read_opt parameter as the defualt value is nanopore**. `--long_read_opt nanopore` for nanopore or `--long_read_opt pacbio` for pacbio. + > **Note:** when denoting your run as using a hybrid platform, you must also add in the long_read_opt parameter as the default value is nanopore**. `--long_read_opt nanopore` for nanopore or `--long_read_opt pacbio` for pacbio. #### Slurm options @@ -135,8 +188,8 @@ All output files will be written into the `outdir` (specified by the user). More - **pipeline_info** - dir containing all pipeline related information including software versions used and execution reports. - **ReadQuality** - dir containing all read tool related output, including contamination, fastq, mash, and subsampled read sets (when present) - **subtyping** - dir containing all subtyping tool related output, including SISTR, ECtyper, etc. 
-- **SummaryReport** - dir containing collated results files for all tools, including: - - Individual sample flatted json reports +- **SummaryReport** - dir containing collated results files for all tools, including: + - Individual sample flattened json reports - **final_report** - All tool results for all samples in both .json (including a flattened version) and .tsv format - **bco.json** - data providence file generated from the nf-prov plug-in -- **manifest.json** - data providence file generated from the nf-prov plug-in \ No newline at end of file +- **manifest.json** - data providence file generated from the nf-prov plug-in diff --git a/docs/workflows/CleanAssemble.md b/docs/workflows/CleanAssemble.md index a4877694..faf7c916 100644 --- a/docs/workflows/CleanAssemble.md +++ b/docs/workflows/CleanAssemble.md @@ -1,39 +1,71 @@ -# Clean Assemble -## workflows/local/CleanAssemble - -## Included sub-workflows - -- `assemble_reads.nf` -- `clean_reads.nf` -- `hybrid_assembly.nf` -- `input_check.nf` -- `polish_assemblies.nf` - - -## Steps -1. **[QC reads](/subworkflows/clean_reads)** subworkflow steps in brief are listed below, for further information see [clean_reads.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/clean_reads.nf) - - Reads are checked for known sequencing contamination - - Quality metrics are calculated - - Reads are trimmed - - Coverage is estimated - - Read set subsampled to set level (OPTIONAL) - - Read set is assessed to be either an isolate or metagenomic sample (from presence of multiple taxa) - -2. **[Assemble reads](/subworkflows/assemble_reads)** using the `params.platform` flag, read sets will be diverted to either the assemble_reads (short reads) or hybrid_assembly (short and/or long reads) workflow. Though the data is handled differently in eash subworklow, both generate a contigs file and a bandage image, with an option of initial polishing via Racon. 
See [assemble_reads.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/assemble_reads.nf) and [hybrid_assembly.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/hybrid_assembly.nf) subworkflow pages for more details.
-
-3. **[Polish assembles](/subworkflows/polish_assemblies)** (OPTIONAL) Polishing of contigs can be added [polish_assemblies.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/polish_assemblies.nf). To make changes to the default workflow, see setting 'optional flags' page.
-
-## Input
-- Next generation sequencing reads:
-    + Short read - Illumina
-    + Long read:
-        * Nanopore
-        * Pacbio
-
-## Output
-- quality trimmed and deconned reads (fastq)
-- estimated genome size
-- estimated heterozygozity
-- assembled contigs (fasta)
-- bandage image (png)
-- software versions
+# Clean Assemble
+## workflows/local/CleanAssemble
+
+## Included sub-workflows
+
+- `assemble_reads.nf`
+- `clean_reads.nf`
+- `hybrid_assembly.nf`
+- `input_check.nf`
+- `polish_assemblies.nf`
+
+
+## Steps
+1. **[QC reads](/mikrokondo/subworkflows/clean_reads)** subworkflow steps in brief are listed below, for further information see [clean_reads.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/clean_reads.nf)
+    - Reads are checked for known sequencing contamination
+    - Quality metrics are calculated
+    - Reads are trimmed
+    - Coverage is estimated
+    - Read set subsampled to set level (OPTIONAL)
+    - Read set is assessed to be either an isolate or metagenomic sample (from presence of multiple taxa)
+
+2. **[Assemble reads](/mikrokondo/subworkflows/assemble_reads)** using the `params.platform` flag, read sets will be diverted to either the assemble_reads (short reads) or hybrid_assembly (short and/or long reads) workflow. Though the data is handled differently in each subworkflow, both generate a contigs file and a bandage image, with an option of initial polishing via Racon. 
See [assemble_reads.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/assemble_reads.nf) and [hybrid_assembly.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/hybrid_assembly.nf) subworkflow pages for more details. + +3. **[Polish assembles](/mikrokondo/subworkflows/polish_assemblies)** (OPTIONAL) Polishing of contigs can be added [polish_assemblies.nf](https://github.com/phac-nml/mikrokondo/blob/main/subworkflows/local/polish_assemblies.nf). To make changes to the default workflow, see setting 'optional flags' page. + +## Input +- Next generation sequencing reads: + + Short read - Illumina + + Long read: + * Nanopore + * Pacbio + +## Output +- Reads + + FinalReads + * SAMPLE + + **Processing** + * Dehosting + - Trimmed + - FastP + - MashSketches + + Quality + * RawReadQuality + * Trimmed + - FastP + - MashScreen +- Assembly + + Assembling + * Bandage + * ConsensusGeneration + - Polishing + - Pilon + - BAMs + - Changes + - Fasta + - VCF + - Racon + - Consensus + * Spades + - Contigs + - GeneClusters + - Graphs + - Logs + - Scaffolds + - Transcripts + + FinalAssembly + * SAMPLE + +>NOTE: +>Bolded directories contain the nested structure of tool output. The further into the structure you go, the further along the workflow that that tool was run. +> \ No newline at end of file diff --git a/docs/workflows/PostAssembly.md b/docs/workflows/PostAssembly.md index d518fdff..db9a67e9 100644 --- a/docs/workflows/PostAssembly.md +++ b/docs/workflows/PostAssembly.md @@ -1,31 +1,47 @@ -# Post assembly -## workflows/local/PostAssembly - -This workflow is triggered in two ways: 1. when assemblies are used for initial input to the pipeline; and 2. after the `CleanAssemble.nf` workflow completes. Within this workflow, Quast, CheckM, species determination (Using Kraken2 or Mash), annotation and subtyping are all performed. 
- -## Included sub-workflows - -- `annotate_genomes.nf` -- `determine_species.nf` -- `polish_assemblies.nf` -- `qc_assemblies.nf` -- `split_metagenomic.nf` -- `subtype_genome.nf` - -## Steps -1. **Determine type** using the `metagenomic_samples` flag, this workflow will direct assemblies to the following two paths: - 1. Isolate: proceeds to step 2. - 2. Metagenomic: runs the following two modules before proceeding to step 2. - 1. [kraken.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/kraken.nf) runs kraken2 on contigs - 2. [bin_kraken2.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bin_kraken2.nf) bins contigs to respective genus level taxa -2. **[QC assemblies](/subworkflows/qc_assembly)** (OPTIONAL) runs quast and assigns quality metrics to generated assemblies -3. **[Determine species](/subworkflows/determine_species)** (OPTIONAL) runs classifier tool (default: [Mash](https://github.com/marbl/Mash)) to determine sample or binned species -4. **[Subtype genome](/subworkflows/subtype_genome)** (OPTIONAL) species specific subtyping tools are launched using a generated MASH screen report. -5. **[Annotate genome](/subworkflows/genomes_annotate)** (OPTIONAL) tools for annotation and identification of genes of interest are launched as a part of this step. - -## Input -- Contig file (fasta) - -## Output -- Tab delimited file containing collated results from all subworkflows -- JSON file containing output of workflow outputs +# Post assembly +## workflows/local/PostAssembly + +This workflow is triggered in two ways: 1. when assemblies are used for initial input to the pipeline; and 2. after the `CleanAssemble.nf` workflow completes. Within this workflow, Quast, CheckM, species determination (Using Kraken2 or Mash), annotation and subtyping are all performed. 
+ +## Included sub-workflows + +- `annotate_genomes.nf` +- `determine_species.nf` +- `polish_assemblies.nf` +- `qc_assemblies.nf` +- `split_metagenomic.nf` +- `subtype_genome.nf` + +## Steps +1. **Determine type** using the `metagenomic_samples` flag, this workflow will direct assemblies to the following two paths: + 1. Isolate: proceeds to step 2. + 2. Metagenomic: runs the following two modules before proceeding to step 2. + 1. [kraken.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/kraken.nf) runs kraken2 on contigs + 2. [bin_kraken2.nf](https://github.com/phac-nml/mikrokondo/blob/main/modules/local/bin_kraken2.nf) bins contigs to respective genus level taxa. +2. **[QC assemblies](/mikrokondo/subworkflows/qc_assembly)** (OPTIONAL) runs quast and assigns quality metrics to generated assemblies. +3. **[Determine species](/mikrokondo/subworkflows/determine_species)** (OPTIONAL) runs classifier tool (default: [Mash](https://github.com/marbl/Mash)) to determine sample or binned species. +4. **[Subtype genome](/mikrokondo/subworkflows/subtype_genome)** (OPTIONAL) species specific subtyping tools are launched using a generated MASH screen report. +5. **[Annotate genome](/mikrokondo/subworkflows/genomes_annotate)** (OPTIONAL) tools for annotation and identification of genes of interest are launched as a part of this step. + +## Input +- Contig file (fasta) from the `FinalAssembly` dir + - This is the final contig file from the last step in the `CleanAssemble` workflow (taking into account any skip flags that have been used) + +## Output +- Subtyping + + TYPINGTOOL + * SAMPLE +- FinalReports + + Aggregated + * Json + * Tables + + FlattenedReports + + Sample + * Json + +>SUBTYPING: +>Within the subtyping directory there will be directories for each of the different subtyping tools used during that run. The number and type of tools will differ depending on the organisms present in the set of samples submitted to the pipeline. 
+
+
+>FINAL REPORTS:
+>Within mikrokondo, a number of reports have been created to collate the different tool outputs. These are a quicker way to view the final results for your sample runs.
diff --git a/main.nf b/main.nf
index 5e7daaab..71223572 100644
--- a/main.nf
+++ b/main.nf
@@ -76,8 +76,6 @@ workflow MIKROKONDO {
         logger2.setLevel(ch.qos.logback.classic.Level.DEBUG)
     }
 
-
-
     log.info paramsSummaryLog(workflow)
 
     ch_reports = Channel.empty()
diff --git a/modules/local/abricate.nf b/modules/local/abricate.nf
index eeaa661b..698f7891 100644
--- a/modules/local/abricate.nf
+++ b/modules/local/abricate.nf
@@ -4,7 +4,7 @@ process ABRICATE {
     tag "$meta.id"
     label 'process_medium'
 
-    container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"
+    container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"
 
 
     input:
diff --git a/modules/local/bakta_annotate.nf b/modules/local/bakta_annotate.nf
index e44ae55c..f8cc3b63 100644
--- a/modules/local/bakta_annotate.nf
+++ b/modules/local/bakta_annotate.nf
@@ -5,7 +5,7 @@ process BAKTA_ANNOTATE {
     tag "$meta.id"
     label 'process_high'
 
-    container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"
+    container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/bakta_download_db.nf b/modules/local/bakta_download_db.nf index 42e220bd..754424dd 100644 --- a/modules/local/bakta_download_db.nf +++ b/modules/local/bakta_download_db.nf @@ -4,7 +4,7 @@ process BAKTA_DB_DOWNLOAD { label 'process_single' storeDir "${params.bakta.db_output}" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" output: path "db*", emit: db diff --git a/modules/local/bandage_image.nf b/modules/local/bandage_image.nf index b82e5c2e..642e9e6d 100644 --- a/modules/local/bandage_image.nf +++ b/modules/local/bandage_image.nf @@ -3,7 +3,7 @@ process BANDAGE_IMAGE { tag "$meta.id" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(gfa) diff --git a/modules/local/bin_kraken2.nf b/modules/local/bin_kraken2.nf index 2ac7725d..3931146c 100644 --- a/modules/local/bin_kraken2.nf +++ b/modules/local/bin_kraken2.nf @@ -8,7 +8,7 @@ process BIN_KRAKEN2{ tag "$meta.id" label "process_low" cache 'deep' // ! Deep caching is required to not bungle up the later metadata updates on resumes - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(contigs), path(kraken_report), path(kraken_output) @@ -21,7 +21,7 @@ process BIN_KRAKEN2{ prefix = task.ext.prefix ?: "${meta.id}" """ kraken2_bin.py ${kraken_report} ${kraken_output} ${contigs} ${taxonomic_level} - for i in *_binned.fasta + for i in *.binned.fasta do mv \$i ${prefix}_\$i gzip ${prefix}_\$i diff --git a/modules/local/check_ont.nf b/modules/local/check_ont.nf index b730d273..d5122d5b 100644 --- a/modules/local/check_ont.nf +++ b/modules/local/check_ont.nf @@ -3,7 +3,7 @@ process CHECK_ONT{ tag "$meta.id" label "process_single" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" // TODO add to publish dir diff --git a/modules/local/checkm_lineagewf.nf b/modules/local/checkm_lineagewf.nf index 1e55cada..d865be0b 100644 --- a/modules/local/checkm_lineagewf.nf +++ b/modules/local/checkm_lineagewf.nf @@ -4,13 +4,13 @@ process CHECKM_LINEAGEWF { tag "$meta.id" label 'process_high' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) output: - tuple val(meta), path("${prefix}/*"), emit: checkm_output + tuple val(meta), path("${prefix}/**"), emit: checkm_output tuple val(meta), path("${prefix}/${prefix}${params.checkm.results_ext}"), emit: checkm_results tuple val(meta), path("${prefix}/${params.checkm.lineage_ms}"), emit: lineage_ms path "versions.yml", emit: versions diff --git a/modules/local/chopper_trim.nf b/modules/local/chopper_trim.nf index db37adb5..436c218c 100644 --- a/modules/local/chopper_trim.nf +++ b/modules/local/chopper_trim.nf @@ -3,7 +3,7 @@ process CHOPPER_TRIM{ tag "${meta.id}" label "process_medium" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: diff --git a/modules/local/combine_data.nf b/modules/local/combine_data.nf index 05ea2cd6..cf76dded 100644 --- a/modules/local/combine_data.nf +++ b/modules/local/combine_data.nf @@ -3,7 +3,7 @@ process COMBINE_DATA{ tag "${meta.id}" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fastq_1), path(fastq_2), path(long_reads), path(assembly) diff --git a/modules/local/ectyper.nf b/modules/local/ectyper.nf index 42853588..fe574aac 100644 --- a/modules/local/ectyper.nf +++ b/modules/local/ectyper.nf @@ -1,13 +1,12 @@ // Module for running ectyper -// borrowed from: https://github.com/nf-core/modules/blob/master/modules/nf-core/ectyper/main.nf +// adapted from: https://github.com/nf-core/modules/blob/master/modules/nf-core/ectyper/main.nf process ECTYPER{ tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" - // TODO add ECTyper temporary directory issues to docs + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/fastp_trim.nf b/modules/local/fastp_trim.nf index f12d82a7..ed714480 100644 --- a/modules/local/fastp_trim.nf +++ b/modules/local/fastp_trim.nf @@ -6,7 +6,7 @@ process FASTP_TRIM{ tag "$meta.id" label "process_medium" // fastp uses very little memory in reality, but for duplicate analysis it is better to give it more memory - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) diff --git a/modules/local/fastqscan.nf b/modules/local/fastqscan.nf index 2fc607dd..2004b124 100644 --- a/modules/local/fastqscan.nf +++ b/modules/local/fastqscan.nf @@ -1,7 +1,7 @@ process FASTQSCAN { tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: diff --git a/modules/local/flye_assemble.nf b/modules/local/flye_assemble.nf index aae765f0..e03d2944 100644 --- a/modules/local/flye_assemble.nf +++ b/modules/local/flye_assemble.nf @@ -8,7 +8,7 @@ process FLYE_ASSEMBLE{ label 'process_high' label 'process_high_memory' label 'process_long' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" memory { task.memory * task.attempt} // TODO check if --debug flag should be added to flye to actually turns off the debug logging? diff --git a/modules/local/gzip_files.nf b/modules/local/gzip_files.nf index a349d55a..69ee6402 100644 --- a/modules/local/gzip_files.nf +++ b/modules/local/gzip_files.nf @@ -5,7 +5,7 @@ GZIP output files process GZIP_FILES { tag "${meta.id}" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(file_in) diff --git a/modules/local/kat_hist.nf b/modules/local/kat_hist.nf index 71fa6908..6018377d 100644 --- a/modules/local/kat_hist.nf +++ b/modules/local/kat_hist.nf @@ -7,7 +7,7 @@ process KAT_HIST{ tag "$meta.id" label "process_medium" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: diff --git a/modules/local/kleborate.nf b/modules/local/kleborate.nf index efd939e9..1167cef9 100644 --- a/modules/local/kleborate.nf +++ b/modules/local/kleborate.nf @@ -6,7 +6,7 @@ process KLEBORATE { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fastas) diff --git a/modules/local/kraken.nf b/modules/local/kraken.nf index f6a8a8cc..9a7fefbd 100644 --- a/modules/local/kraken.nf +++ b/modules/local/kraken.nf @@ -3,7 +3,7 @@ process KRAKEN { tag "$meta.id" label "process_high" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: diff --git a/modules/local/lissero.nf b/modules/local/lissero.nf index f5a8d540..e56d55e7 100644 --- a/modules/local/lissero.nf +++ b/modules/local/lissero.nf @@ -6,7 +6,7 @@ process LISSERO { tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" afterScript "rm ${prefix}.fasta" // TODO add in log message saying what went wrong with the sample errorStrategy 'ignore' // TODO set a proper strategy once the issues with the mash parsing script are solved e.g. the ambiguous top hits diff --git a/modules/local/locidex_extract.nf b/modules/local/locidex_extract.nf new file mode 100644 index 00000000..0b513838 --- /dev/null +++ b/modules/local/locidex_extract.nf @@ -0,0 +1,44 @@ +/* +Locidex extract fastas for allele calling + +*/ + +process LOCIDEX_EXTRACT { + + tag "$meta.id" + label "process_low" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + + input: + tuple val(meta), path(fasta), path(db) + + output: + tuple val(meta), path("${meta.id}/*${params.locidex.extracted_seqs_suffix}"), path(db), emit: extracted_seqs + path "versions.yml", emit: versions + + script: + def original_file_suffix = "${params.locidex.extracted_seqs_suffix}".replace(".gz", "") + """ + locidex extract --mode ${params.locidex.extraction_mode} \\ + -i ${fasta} \\ + --n_threads ${task.cpus} \\ + -o ${meta.id} -d ${db} --force \\ + --min_evalue ${params.locidex.min_evalue} \\ + --min_dna_len ${params.locidex.min_dna_len} \\ + --min_aa_len ${params.locidex.min_aa_len} \\ + --max_dna_len ${params.locidex.max_dna_len} \\ + --min_dna_ident ${params.locidex.min_dna_ident} \\ + --min_aa_ident ${params.locidex.min_aa_ident} \\ + --min_dna_match_cov ${params.locidex.min_dna_match_cov} \\ + --min_aa_match_cov ${params.locidex.min_aa_match_cov} \\ + --max_target_seqs ${params.locidex.max_target_seqs} + + gzip ${meta.id}/*${original_file_suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + locidex extract: \$(echo \$(locidex extract -V 2>&1) | sed 's/^.*locidex //' ) + END_VERSIONS + """ +} diff --git a/modules/local/locidex_report.nf b/modules/local/locidex_report.nf new file mode 100644 index 00000000..5eae4144 --- /dev/null +++ b/modules/local/locidex_report.nf @@ -0,0 +1,41 @@ +/* Locidex report from the seq-store + +*/ + +process LOCIDEX_REPORT { + tag "$meta.id" + label "process_low" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + input: + tuple val(meta), path(seq_store) + + output: + tuple val(meta), path(output_name), emit: report + path "versions.yml", emit: versions + + script: + output_name = "${meta.id}${params.locidex.report_suffix}" + def is_compressed = seq_store.getName().endsWith(".gz") + def seq_store_name = seq_store.getName().replace(".gz", "") + """ + if [ "$is_compressed" == "true" ]; then + gzip -c -d $seq_store > $seq_store_name + fi + locidex report -i $seq_store_name -o . --name ${meta.id} \\ + --mode ${params.locidex.report_mode} \\ + --prop ${params.locidex.report_prop} \\ + --max_ambig ${params.locidex.report_max_ambig} \\ + --max_stop ${params.locidex.report_max_stop} \\ + --force + + gzip -c profile.json > $output_name + rm profile.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + locidex report: \$(echo \$(locidex report -V 2>&1) | sed 's/^.*locidex //' ) + END_VERSIONS + """ + +} diff --git a/modules/local/locidex_search.nf b/modules/local/locidex_search.nf new file mode 100644 index 00000000..bcb66f66 --- /dev/null +++ b/modules/local/locidex_search.nf @@ -0,0 +1,54 @@ +/* Locidex search function for the allele calling + +*/ + +process LOCIDEX_SEARCH { + + tag "$meta.id" + label "process_medium" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + + input: + tuple val(meta), path(fasta), path(db) + + output: + tuple val(meta), path("${output_json}"), emit: allele_calls + tuple val(meta), path("${output_gbk}"), emit: annotations, optional: true + path "versions.yml", emit: versions + + script: + // Large portion of arguments cutout due to causing issues when running + output_json = "${meta.id}${params.locidex.seq_store_suffix}" + output_gbk = "${meta.id}${params.locidex.gbk_suffix}" + + def is_compressed = fasta.getName().endsWith(".gz") ? 
true : false + def fasta_name = fasta.getName().replace(".gz", "") + """ + if [ "$is_compressed" == "true" ]; then + gzip -c -d $fasta > $fasta_name + fi + + locidex search -q ${fasta_name} \\ + --n_threads ${task.cpus} \\ + -o . \\ + -d ${db} --force \\ + --min_evalue ${params.locidex.min_evalue} \\ + --min_dna_len ${params.locidex.min_dna_len} \\ + --min_aa_len ${params.locidex.min_aa_len} \\ + --max_dna_len ${params.locidex.max_dna_len} \\ + --min_dna_ident ${params.locidex.min_dna_ident} \\ + --min_aa_ident ${params.locidex.min_aa_ident} \\ + --min_dna_match_cov ${params.locidex.min_dna_match_cov} \\ + --min_aa_match_cov ${params.locidex.min_aa_match_cov} \\ + --max_target_seqs ${params.locidex.max_target_seqs} + + gzip -c seq_store.json > $output_json && rm seq_store.json + test -f annotations.gbk && gzip -c annotations.gbk > $output_gbk && rm annotations.gbk + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + locidex search: \$(echo \$(locidex search -V 2>&1) | sed 's/^.*locidex //' ) + END_VERSIONS + """ +} diff --git a/modules/local/mash_estimate.nf b/modules/local/mash_estimate.nf index 31beb4d0..792a6798 100644 --- a/modules/local/mash_estimate.nf +++ b/modules/local/mash_estimate.nf @@ -6,7 +6,7 @@ Sketch reads to estimate the samples genome size process MASH_ESTIMATE{ label 'process_low' tag "${prefix}" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) diff --git a/modules/local/mash_paste.nf b/modules/local/mash_paste.nf index 0ccf9cec..4c8c98b0 100644 --- a/modules/local/mash_paste.nf +++ b/modules/local/mash_paste.nf @@ -3,7 +3,7 @@ process MASH_PASTE{ tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(sketches) // getting all output sketches diff --git a/modules/local/mash_screen.nf b/modules/local/mash_screen.nf index 8572be15..46fed8e8 100644 --- a/modules/local/mash_screen.nf +++ b/modules/local/mash_screen.nf @@ -8,7 +8,7 @@ identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment process MASH_SCREEN { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) diff --git a/modules/local/mash_sketch.nf b/modules/local/mash_sketch.nf index 293b8e01..89d2ca7d 100644 --- a/modules/local/mash_sketch.nf +++ b/modules/local/mash_sketch.nf @@ -4,7 +4,7 @@ process MASH_SKETCH{ tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta), val(comment) diff --git a/modules/local/medaka_polish.nf b/modules/local/medaka_polish.nf index 15644d46..8a4e0536 100644 --- a/modules/local/medaka_polish.nf +++ b/modules/local/medaka_polish.nf @@ -3,7 +3,7 @@ process MEDAKA_POLISH{ tag "$meta.id" label 'process_high' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" afterScript 'rm -rf medaka' // clean up medaka output directory, having issues on resumes beforeScript 'rm -rf medaka' // Same reasoning as above errorStrategy { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } // May be having issues with medaka model copying diff --git a/modules/local/minimap2_index.nf b/modules/local/minimap2_index.nf index ac77fc67..cad28343 100644 --- a/modules/local/minimap2_index.nf +++ b/modules/local/minimap2_index.nf @@ -3,7 +3,7 @@ process MINIMAP2_INDEX{ tag "${meta.id}" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(contigs) diff --git a/modules/local/minimap2_map.nf b/modules/local/minimap2_map.nf index b6af76bc..a5bfc530 100644 --- a/modules/local/minimap2_map.nf +++ b/modules/local/minimap2_map.nf @@ -5,7 +5,7 @@ process MINIMAP2_MAP { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads), path(index), path(contigs) diff --git a/modules/local/mlst.nf b/modules/local/mlst.nf index a394b5b0..d62e8224 100644 --- a/modules/local/mlst.nf +++ b/modules/local/mlst.nf @@ -3,7 +3,7 @@ process MLST { tag "$meta.id" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/mob_recon.nf b/modules/local/mob_recon.nf index cc91348c..c37b8ddd 100644 --- a/modules/local/mob_recon.nf +++ b/modules/local/mob_recon.nf @@ -4,7 +4,7 @@ process MOBSUITE_RECON { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/parse_kat.nf b/modules/local/parse_kat.nf index 63e1675c..34f0190c 100644 --- a/modules/local/parse_kat.nf +++ b/modules/local/parse_kat.nf @@ -5,7 +5,7 @@ import groovy.json.JsonSlurper process PARSE_KAT { tag "$meta.id" label "process_single" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), val(json) diff --git a/modules/local/parse_kraken.nf b/modules/local/parse_kraken.nf index d6934dfe..5a4274b4 100644 --- a/modules/local/parse_kraken.nf +++ b/modules/local/parse_kraken.nf @@ -3,7 +3,7 @@ process PARSE_KRAKEN { tag "$meta.id" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(kraken_report) diff --git a/modules/local/parse_mash.nf b/modules/local/parse_mash.nf index 13010615..7d81de45 100644 --- a/modules/local/parse_mash.nf +++ b/modules/local/parse_mash.nf @@ -6,7 +6,7 @@ process PARSE_MASH{ tag "$meta.id" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(mash_screen) diff --git a/modules/local/pilon_polish.nf b/modules/local/pilon_polish.nf index 7f308529..9265a6f0 100644 --- a/modules/local/pilon_polish.nf +++ b/modules/local/pilon_polish.nf @@ -5,7 +5,7 @@ process PILON_POLISH { tag "$meta.id" label 'process_high' memory {task.memory * task.attempt} - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" errorStrategy { task.attempt <= params.pilon.max_memory_multiplier ?: 'retry'} diff --git a/modules/local/pilon_polisher.nf b/modules/local/pilon_polisher.nf index 0fe0e11e..ed50969c 100644 --- a/modules/local/pilon_polisher.nf +++ b/modules/local/pilon_polisher.nf @@ -4,7 +4,7 @@ process PILON_ITER { tag "$meta.id" label 'process_medium' memory {task.memory * task.attempt} - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads), path(contigs) diff --git a/modules/local/quast_assembly.nf b/modules/local/quast_assembly.nf index 12658cf2..d5f28e02 100644 --- a/modules/local/quast_assembly.nf +++ b/modules/local/quast_assembly.nf @@ -4,15 +4,15 @@ process QUAST { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(contigs), path(trimmed_reads) output: - tuple val(meta), path("${prefix}/*"), path(contigs), emit: quast_data - tuple val(meta), path("${prefix}/${params.quast.report_prefix}${prefix}${params.quast.report_ext}"), path(contigs), emit: quast_table + tuple val(meta), path("${prefix}/**"), path(contigs), emit: quast_data + tuple val(meta), path("${prefix}/quast_table/${params.quast.report_prefix}${prefix}${params.quast.report_ext}"), path(contigs), emit: quast_table path "versions.yml", emit: versions script: @@ -20,6 +20,8 @@ process QUAST { def args = task.ext.args ?: "" def reads = null prefix = meta.id + + def quast_table_name = "${params.quast.report_prefix}${meta.id}${params.quast.report_ext}" def long_read_string = "--single" if (params.platform == params.opt_platforms.ont){ @@ -47,6 +49,8 @@ process QUAST { do mv \$i \${i/$params.quast.report_base/$prefix} done + mkdir '${prefix}/quast_table' + cp '${prefix}/${quast_table_name}' '${prefix}/quast_table/${quast_table_name}' cat <<-END_VERSIONS > versions.yml "${task.process}": quast: \$(quast.py --version 2>&1 | sed 's/^.*QUAST v//; s/ .*\$//') diff --git a/modules/local/racon_polish.nf b/modules/local/racon_polish.nf index ddc80943..616b2b60 100644 --- a/modules/local/racon_polish.nf +++ b/modules/local/racon_polish.nf @@ -5,7 +5,7 @@ process RACON_POLISH { tag "${meta.id}" label 'process_high' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" afterScript "rm ${input_reads}" input: diff --git a/modules/local/read_summary.nf b/modules/local/read_summary.nf index 3ed60398..c00fc388 100644 --- a/modules/local/read_summary.nf +++ b/modules/local/read_summary.nf @@ -5,7 +5,7 @@ process READ_SCAN{ label 'process_medium' tag "${meta.id}" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads), path(l_reads) diff --git a/modules/local/remove_contaminants.nf b/modules/local/remove_contaminants.nf index 9625082a..17dcf95d 100644 --- a/modules/local/remove_contaminants.nf +++ b/modules/local/remove_contaminants.nf @@ -3,7 +3,7 @@ process REMOVE_CONTAMINANTS { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) diff --git a/modules/local/report.nf b/modules/local/report.nf index 67aec4c1..d1f39a69 100644 --- a/modules/local/report.nf +++ b/modules/local/report.nf @@ -487,6 +487,7 @@ def range_comp(fields, qc_data, comp_val, qc_obj){ if(vals[0] <= comp_val && comp_val <= vals[1]){ qc_obj.status = true qc_obj.message = "[PASSED ${qc_obj.field}] ${comp_val} is within acceptable QC range for ${qc_data.search} (${fields[0]}: ${vals[0]} - ${fields[1]} ${vals[1]})" + qc_obj.qc_status = "PASSED" }else{ if(comp_val < vals[0]){ qc_obj.low = true @@ -494,6 +495,7 @@ def range_comp(fields, qc_data, comp_val, qc_obj){ qc_obj.low = false } qc_obj.message = "[FAILED ${qc_obj.field}] ${comp_val} is outside the acceptable ranges for ${qc_data.search} (${fields[0]}: ${vals[0]} - ${fields[1]} ${vals[1]})" + qc_obj.qc_status = "FAILED" } return qc_obj } @@ -514,9 +516,11 @@ def greater_equal_comp(fields, qc_data, comp_val, qc_obj){ if(comp_val >= vals ){ qc_obj.status = true qc_obj.message = "[PASSED ${qc_obj.field}] ${comp_val} meets QC parameter of => ${vals} for ${qc_data.search}" + qc_obj.qc_status = "PASSED" }else{ qc_obj.low = true qc_obj.message = "[FAILED ${qc_obj.field}] ${comp_val} is less than QC parameter of ${vals} for ${qc_data.search}" + qc_obj.qc_status = "FAILED" } return qc_obj } @@ -538,16 +542,18 @@ def lesser_equal_comp(fields, qc_data, comp_val, qc_obj){ if(comp_val <= vals ){ qc_obj.status = true qc_obj.message = "[PASSED ${qc_obj.field}] ${comp_val} meets QC parameter of <= ${vals} for ${qc_data.search}" + qc_obj.qc_status = "PASSED" }else{ qc_obj.low = false qc_obj.message = "[FAILED ${qc_obj.field}] ${comp_val} is greater than than QC parameter of ${vals} for ${qc_data.search}" + qc_obj.qc_status = "FAILED" } return qc_obj } def prep_qc_vals(qc_vals, qc_data, comp_val, field_val){ // Low value is added to designate if a value was too low or too high if it 
fails a qc threshold - def status = ["status": false, "message": "", "field": field_val, "low": false] + def status = ["status": false, "message": "", "field": field_val, "low": false, "value": comp_val, "qc_status": "WARNING"] def comp_fields = qc_vals.compare_fields switch(qc_vals.comp_type.toUpperCase()){ case 'GE': diff --git a/modules/local/report_aggregate.nf b/modules/local/report_aggregate.nf index 0dabf746..36f67c59 100644 --- a/modules/local/report_aggregate.nf +++ b/modules/local/report_aggregate.nf @@ -5,7 +5,7 @@ process REPORT_AGGREGATE{ tag "Creating alternate output formats" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: path summary_report diff --git a/modules/local/sam_to_bam.nf b/modules/local/sam_to_bam.nf index 4fcee61f..982f00f3 100644 --- a/modules/local/sam_to_bam.nf +++ b/modules/local/sam_to_bam.nf @@ -4,7 +4,7 @@ process SAM_TO_BAM{ tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(sam) diff --git a/modules/local/select_pointfinder.nf b/modules/local/select_pointfinder.nf index ecc54e0b..49376311 100644 --- a/modules/local/select_pointfinder.nf +++ b/modules/local/select_pointfinder.nf @@ -39,19 +39,15 @@ process IDENTIFY_POINTDB { // Remove spurious characters and strings that may affect database identification e.g. 
Entercoccus_B -> it would get rid of the B species_data = species_data.findAll { it.size() >= shortest_entry } - def db_opt = null + def db_opt = params.staramr.point_finder_db_default // Find exact match for(int db in 0..databases.size()-1){ - //println databases[db] def match_size = databases[db].size() // if match size is a single value, only need to match one value // tile the species list def tokens = tokenize_values(species_data, match_size) def db_found = compare_lists(databases[db], tokens) if(db_found){ - //println db_found - //println params.staramr.point_finder_dbs[db] - //println databases[db] db_opt = params.staramr.point_finder_dbs[db] break } diff --git a/modules/local/seqkit_filter.nf b/modules/local/seqkit_filter.nf index 591c652b..1dbc03c5 100644 --- a/modules/local/seqkit_filter.nf +++ b/modules/local/seqkit_filter.nf @@ -4,7 +4,7 @@ process SEQKIT_FILTER { tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: @@ -12,7 +12,7 @@ process SEQKIT_FILTER { val min_length output: - tuple val(meta), path("${prefix}${params.seqkit.fasta_ext}"), emit: filtered_sequences + tuple val(meta), path("${prefix}${params.seqkit.fasta_ext}"), path(reads), emit: filtered_sequences path "versions.yml", emit: versions script: diff --git a/modules/local/seqkit_stats.nf b/modules/local/seqkit_stats.nf index 1c3413a0..ad4d2415 100644 --- a/modules/local/seqkit_stats.nf +++ b/modules/local/seqkit_stats.nf @@ -5,7 +5,7 @@ process SEQKIT_STATS { tag "$meta.id" label 'process_single' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(contigs), path(trimmed_reads) diff --git a/modules/local/seqtk_fasta_fastq.nf b/modules/local/seqtk_fasta_fastq.nf index e7a146ea..07ec2b8b 100644 --- a/modules/local/seqtk_fasta_fastq.nf +++ b/modules/local/seqtk_fasta_fastq.nf @@ -5,7 +5,7 @@ process SEQTK_FASTA_FASTQ{ tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/seqtk_sample.nf b/modules/local/seqtk_sample.nf index 5c3e811c..61628b94 100644 --- a/modules/local/seqtk_sample.nf +++ b/modules/local/seqtk_sample.nf @@ -4,7 +4,7 @@ process SEQTK_SAMPLE{ tag "${meta.id}" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads), val(sample_fraction) diff --git a/modules/local/seqtk_size.nf b/modules/local/seqtk_size.nf index b7366bbd..a720ab95 100644 --- a/modules/local/seqtk_size.nf +++ b/modules/local/seqtk_size.nf @@ -1,7 +1,7 @@ process SEQTK_SIZE{ tag "${meta.id}" label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) @@ -17,7 +17,7 @@ process SEQTK_SIZE{ def prefix = task.ext.prefix ?: "${meta.id}" output = "${meta.id}_basecounts.txt" """ - seqtk size ${reads.join(" ")} > ${output} + zcat ${reads.join(" ")} | seqtk size - > ${output} cat <<-END_VERSIONS > versions.yml\n"${task.process}":\n seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//')\nEND_VERSIONS """ diff --git a/modules/local/shigatyper.nf b/modules/local/shigatyper.nf index f45358e2..2f020f6c 100644 --- a/modules/local/shigatyper.nf +++ b/modules/local/shigatyper.nf @@ -6,7 +6,7 @@ process SHIGATYPER{ tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) // in mkkondo it will only work with assemblies diff --git a/modules/local/shigeifinder.nf b/modules/local/shigeifinder.nf index 282e3ef8..a77ce468 100644 --- a/modules/local/shigeifinder.nf +++ b/modules/local/shigeifinder.nf @@ -5,7 +5,7 @@ process SHIGEIFINDER { tag "$meta.id" label 'process_low' afterScript "rm $tmp_file" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(seqs) diff --git a/modules/local/sistr.nf b/modules/local/sistr.nf index c9d74f10..e709a2ee 100644 --- a/modules/local/sistr.nf +++ b/modules/local/sistr.nf @@ -1,10 +1,10 @@ -// SISTR on the cluster! borrowing from https://github.com/nf-core/modules/blob/master/modules/nf-core/sistr/main.nf agina +// SISTR on the cluster! taking from https://github.com/nf-core/modules/blob/master/modules/nf-core/sistr/main.nf agina process SISTR { tag "$meta.id" label 'process_medium' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) @@ -17,7 +17,10 @@ process SISTR { path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' + def args = "" + if(params.sistr.full_cgmlst){ + args << "--use-full-cgmlst-db" + } def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getName().endsWith(".gz") ? 
true : false def fasta_name = fasta.getName().replace(".gz", "") @@ -30,9 +33,9 @@ process SISTR { --qc \\ $args \\ --threads $task.cpus \\ - --alleles-output ${prefix}-allele.json \\ - --novel-alleles ${prefix}-allele.fasta \\ - --cgmlst-profiles ${prefix}-cgmlst.csv \\ + --alleles-output ${prefix}.allele.json \\ + --novel-alleles ${prefix}.allele.fasta \\ + --cgmlst-profiles ${prefix}.cgmlst.csv \\ --output-prediction ${prefix} \\ --output-format tab \\ $fasta_name diff --git a/modules/local/spades_assemble.nf b/modules/local/spades_assemble.nf index b3443364..f97aa1a5 100644 --- a/modules/local/spades_assemble.nf +++ b/modules/local/spades_assemble.nf @@ -4,7 +4,7 @@ process SPADES_ASSEMBLE { tag "$meta.id" label 'process_high' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(reads) diff --git a/modules/local/spatyper.nf b/modules/local/spatyper.nf index 8d6e1a51..daf3b4df 100644 --- a/modules/local/spatyper.nf +++ b/modules/local/spatyper.nf @@ -6,7 +6,7 @@ process SPATYPER { tag "$meta.id" label 'process_low' - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta) diff --git a/modules/local/staramr.nf b/modules/local/staramr.nf index e7f3553c..bd0dd9c4 100644 --- a/modules/local/staramr.nf +++ b/modules/local/staramr.nf @@ -3,7 +3,7 @@ process STARAMR { tag "${meta.id}" label "process_medium" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(fasta), val(point_finder_db) @@ -18,6 +18,7 @@ process STARAMR { tuple val(meta), path("$prefix/mlst${params.staramr.tsv_ext}"), emit: mlst tuple val(meta), path("$prefix/settings${params.staramr.txt_ext}"), emit: settings tuple val(meta), path("$prefix/results${params.staramr.xlsx_ext}"), emit: results_xlsx + tuple val(meta), path("$prefix/hits/*"), emit: hits, optional: true path "versions.yml", emit: versions script: diff --git a/modules/local/staramr_version.nf b/modules/local/staramr_version.nf deleted file mode 100644 index 3b1b0fd1..00000000 --- a/modules/local/staramr_version.nf +++ /dev/null @@ -1,37 +0,0 @@ -// Dump database versions for StarAMR - - -process STARAMR_DUMP_DB_VERSIONS { - tag "StarAMR DB Versions" - label "process_low" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" - cache false - - input: - path(db) - - output: - path("StarAMRDBVersions.txt"), emit: db_versions - path "versions.yml", emit: versions - - - script: - def args = "" - if(db){ - args = args + " $db" - } - """ - staramr db info $args > StarAMRDBVersions.txt - cat <<-END_VERSIONS > versions.yml - "${task.process}": - staramr: \$(echo \$(staramr -V 2>&1) | sed 's/^.*staramr //; s/ .*\$//') - END_VERSIONS - """ - - stub: - """ - touch StarAMRDBVersions.txt - touch versions.yml - """ - -} diff --git a/modules/local/unicycler_assemble.nf b/modules/local/unicycler_assemble.nf index 1aa5c7e6..bf95f961 100644 --- a/modules/local/unicycler_assemble.nf +++ b/modules/local/unicycler_assemble.nf @@ -4,7 +4,7 @@ process UNICYCLER_ASSEMBLE { tag "$meta.id" label "process_high" - container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" input: tuple val(meta), path(shortreads), path(longreads) diff --git a/nextflow.config b/nextflow.config index 9106e349..529ac098 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,7 +43,7 @@ params { show_hidden_params = false validationS3PathCheck = true validationShowHiddenParams = false - validationSchemaIgnoreParams = 'abricate,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' + validationSchemaIgnoreParams = 'abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' validationFailUnrecognisedParams = false // for the qcreport fields // SKIP options @@ -64,6 +64,8 @@ params { skip_mobrecon = false skip_staramr = false skip_metagenomic_detection = false // Skip classifying if sample is metagenomic + skip_allele_calling = false + skip_length_filtering_contigs = false metagenomic_run = false // Label all samples as @@ -82,7 +84,7 @@ params { fp_cut_tail_window_size = 4 fp_complexity_threshold = 20 fp_qualified_phred = 15 - fp_unqualified_precent_limit = 
40 + fp_unqualified_percent_limit = 40 fp_polyg_min_len = 10 fp_polyx_min_len = 10 fp_illumina_length_min = 35 @@ -91,7 +93,7 @@ params { fp_dedup_reads = false // Bakta Parmeters - ba_min_conting_length = 200 + ba_min_contig_length = 200 // Quast parameters @@ -100,6 +102,37 @@ params { // Mash mh_min_kmer = 10 + // ECTyper Parameters + ec_opid = 90 // Minimum percent identity to determine O antigens prescence + ec_opcov = 90 // Minimum percent coverage of O antigen + ec_hpid = 95 // Miniumum percent identity to determine H antigens prescence + ec_hpcov = 50 // Minimum percent coverage of H antigen + ec_enable_verification = true // Enable species verification in ECTyper + + // SISTR parameters + sr_full_cgmlst = true // Use full set of cgMLST alleles which can include highly similar alleles + + // Locidex Options + lx_min_evalue = 0.0001 + lx_min_dna_len = 1 + lx_min_aa_len = 1 + lx_max_dna_len = 10000000 + lx_max_aa_len = 10000000 + lx_min_dna_ident = 80.0 + lx_min_aa_ident = 80.0 + lx_min_dna_match_cov = 80.0 + lx_min_aa_match_cov = 80.0 + lx_max_target_seqs = 10 + lx_extraction_mode = "raw" + lx_report_mode = "normal" + lx_report_prop = "locus_name" + lx_report_max_ambig = 0 + lx_report_max_stop = 0 + + // Overide an allele calling scheme, this will be applied globally if auto selection is not opted for + allele_scheme = null + + // Boilerplate options outdir = null tracedir = "${params.outdir}/pipeline_info" @@ -148,7 +181,7 @@ params { raw_reads { - high_precision = false // Makes things realllyyy slow + high_precision = false // Makes things really slow report_tag = "RawReadSummary" } @@ -164,26 +197,11 @@ params { docker = "docker.io/python:3.11.6" } - //KAT options - kat { - singularity = 'https://depot.galaxyproject.org/singularity/kat:2.4.2--py38hfc5f9d8_2' - docker = 'quay.io/biocontainers/kat:2.4.2--py38hfc5f9d8_2' - hist_ext = ".hist" - json_ext = ".hist.dist_analysis.json" - png_ext = ".png" - postscript_ext = ".ps" - pdf_ext = ".pdf" - jfhash_ext = 
".jf" - output_type = "png" - report_tag = "KatHist" - - } - seqtk { singularity = 'https://depot.galaxyproject.org/singularity/seqtk%3A1.4--he4a0461_1' docker = 'quay.io/biocontainers/seqtk:1.4--he4a0461_1' seed = 42 - reads_ext = "_sampled.fastq.gz" + reads_ext = ".sampled.fastq.gz" assembly_fastq = ".fastq.gz" report_tag = "Seqtk" } @@ -194,6 +212,39 @@ params { report_tag = "SeqtkBaseCount" } + locidex { + // awaiting singluarity image build + //singularity = "https://depot.galaxyproject.org/singularity/locidex%3A0.1.1--pyhdfd78af_1" + singularity = "quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_1" + docker = "quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_1" + min_evalue = params.lx_min_evalue + min_dna_len = params.lx_min_dna_len + min_aa_len = params.lx_min_aa_len + max_dna_len = params.lx_max_dna_len + max_aa_len = params.lx_max_aa_len + min_dna_ident = params.lx_min_dna_ident + min_aa_ident = params.lx_min_aa_ident + min_dna_match_cov = params.lx_min_dna_match_cov + min_aa_match_cov = params.lx_min_aa_match_cov + max_target_seqs = params.lx_max_target_seqs + extraction_mode = params.lx_extraction_mode + report_mode = params.lx_report_mode + report_prop = params.lx_report_prop + report_max_ambig = params.lx_report_max_ambig + report_max_stop = params.lx_report_max_stop + extracted_seqs_suffix = ".extracted.seqs.fasta.gz" + seq_store_suffix = ".seq_store.json.gz" + gbk_suffix = ".gbk.gz" + extraction_dir = "extracted" + report_suffix = ".profile.mlst.json.gz" + schemes { + salmonella { + search = params.QCReport.salmonella + db = null + } + } + } + // FASTP options fastp { fastq_ext = ".trimmed.fastq.gz" @@ -207,7 +258,7 @@ params { cut_tail_window_size = params.fp_cut_tail_window_size // default is 4 complexity_threshold = params.fp_complexity_threshold // FastP default is 30 not 20 qualified_quality_phred = params.fp_qualified_phred // min quality for a read to contain - unqualified_percent_limit = params.fp_unqualified_precent_limit // if the minimum 
quality of a read is below 10 that read is discarded + unqualified_percent_limit = params.fp_unqualified_percent_limit // if the minimum quality of a read is below 10 that read is discarded polyg_min_len = params.fp_polyg_min_len polyx_min_len = params.fp_polyx_min_len illumina_length_min = params.fp_illumina_length_min @@ -260,13 +311,13 @@ params { spades { singularity = 'https://depot.galaxyproject.org/singularity/spades:3.15.5--h95f258a_1' docker = 'quay.io/biocontainers/spades:3.15.5--h95f258a_1' + outdir = "assembly" scaffolds_ext = ".scaffolds.fasta.gz" contigs_ext = ".contigs.fasta.gz" transcripts_ext = ".transcripts.fasta.gz" gene_clusters_ext = ".gene_clusters.fasta.gz" assembly_graphs_ext = ".assembly.gfa.gz" log_ext = ".log" - outdir = "assembly" } // Fastqc options @@ -280,7 +331,7 @@ params { singularity = 'https://depot.galaxyproject.org/singularity/seqkit:2.2.0--h9ee0642_0' docker = 'quay.io/biocontainers/seqkit:2.2.0--h9ee0642_0' report_ext = ".tsv" - fasta_ext = "_filtered.fasta.gz" + fasta_ext = ".filtered.fasta.gz" filter_field = "max_len" report_tag = "Seqkit_stats" header_p = true @@ -309,8 +360,8 @@ params { // TODO add to trouble shooting if checkm fails and provides EOF errors, to try changing the container singularity = 'https://depot.galaxyproject.org/singularity/checkm-genome%3A1.2.2--pyhdfd78af_1' docker = 'quay.io/biocontainers/checkm-genome:1.2.2--pyhdfd78af_1' - alignment_ext = "-genes.aln" - results_ext = "-results.txt" + alignment_ext = ".genes.aln" + results_ext = ".results.txt" tsv_ext = ".tsv" folder_name = "checkm" gzip_ext = ".gz" @@ -487,9 +538,9 @@ params { } staramr { - singularity = "https://depot.galaxyproject.org/singularity/staramr%3A0.9.1--pyhdfd78af_0" - docker = "quay.io/biocontainers/staramr:0.9.1--pyhdfd78af_0" - //db = "${projectDir}/databases/staramr_databases" + singularity = "quay.io/biocontainers/staramr:0.10.0--pyhdfd78af_0" + docker = "quay.io/biocontainers/staramr:0.10.0--pyhdfd78af_0" + 
point_finder_db_default = null db = null tsv_ext = ".tsv" txt_ext = ".txt" @@ -523,7 +574,7 @@ params { hypotheticals_faa_ext = ".hypotheticals.faa" tsv_ext = ".tsv" txt_ext = ".txt" - min_contig_length = params.ba_min_conting_length + min_contig_length = params.ba_min_contig_length args = { "" } } @@ -545,7 +596,12 @@ params { log_ext = ".log" tsv_ext = ".tsv" txt_ext = ".txt" - args = { "--verify" } + opid = params.ec_opid + opcov = params.ec_opcov + hpid = params.ec_hpid + hpcov = params.ec_hpcov + verify = params.ec_enable_verification ? "--verify" : "" + args = { "${params.ectyper.verify} -opid ${params.ectyper.opid} -opcov ${params.ectyper.opcov} -hpid ${params.ectyper.hpcov} -hpcov ${params.ectyper.hpcov}" } report_tag = "ECTyper${params.subtyping_report.report_tag}" header_p = true } @@ -573,9 +629,10 @@ params { singularity = "https://depot.galaxyproject.org/singularity/sistr_cmd:1.1.1--pyh864c0ab_2" docker = 'quay.io/biocontainers/sistr_cmd:1.1.1--pyh864c0ab_2' tsv_ext = ".tab" - allele_fasta_ext = "-allele.fasta" - allele_json_ext = "-allele.json" - cgmlst_ext = "-cgmlst.csv" + allele_fasta_ext = ".allele.fasta" + allele_json_ext = ".allele.json" + cgmlst_ext = ".cgmlst.csv" + full_cgmlst = params.sr_full_cgmlst report_tag = "SISTR${params.subtyping_report.report_tag}" header_p = true } @@ -629,11 +686,11 @@ params { kraken_bin { // Python only taxonomic_level = "G" - fasta_ext = "_binned.fasta.gz" + fasta_ext = ".fasta.gz" } report_aggregate { - sample_flat_suffix = "_flat_sample.json" + sample_flat_suffix = ".flat_sample.json" } @@ -1026,7 +1083,7 @@ manifest { description = """Mikrokondo beta""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.1.2' + version = '0.2.0' defaultBranch = 'main' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index d4a1a6ab..722de4a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -110,6 +110,13 @@ "exists": true, "format": "directory-path", "hidden": true + }, + 
"allele_scheme": { + "type": "string", + "pattern": "^\\S+$", + "exists": true, + "format": "directory-path", + "description": "An allele scheme to to use for Locidex" } }, "required": [ @@ -402,6 +409,68 @@ "skip_staramr": { "type": "boolean", "description": "Skip running StarAMR" + }, + "skip_allele_calling": { + "type": "boolean", + "description": "Skip allele calling with Locidex" + }, + "skip_length_filtering_contigs": { + "type": "boolean", + "description": "Skip filtering contigs by length." + } + } + }, + "ectyper": { + "title": "ECTyper", + "type": "object", + "description": "Options for ECTyper (E.coli serotyping)", + "default": "", + "properties": { + "ec_opid": { + "type": "number", + "default": 90, + "description": "Minimum percent identity to determine O antigens prescence", + "minimum": 0, + "maximum": 100 + }, + "ec_opcov": { + "type": "number", + "default": 90, + "description": "Minimum percent coverage of O antigen", + "minimum": 0, + "maximum": 100 + }, + "ec_hpid": { + "type": "number", + "default": 95, + "description": "Miniumum percent identity to determine H antigens prescence", + "minimum": 0, + "maximum": 100 + }, + "ec_hpcov": { + "type": "number", + "default": 50, + "description": "Minimum percent coverage of H antigen", + "minimum": 0, + "maximum": 100 + }, + "ec_enable_verification": { + "type": "boolean", + "default": true, + "description": "Enable species verification in ECTyper" + } + } + }, + "sistr": { + "title": "SISTR", + "type": "object", + "description": "Options for SISTR (Salmonella serotyping)", + "default": "", + "properties": { + "sr_full_cgmlst": { + "type": "boolean", + "default": true, + "description": "Run SISTR using the full set of cgMLST alleles which can include highly similar alleles." } } }, @@ -444,7 +513,7 @@ "minimum": 0, "description": "the quality value that a base is qualified." 
}, - "fp_unqualified_precent_limit": { + "fp_unqualified_percent_limit": { "type": "integer", "default": 40, "minimum": 0, @@ -487,6 +556,117 @@ } } }, + "locidex": { + "title": "Locidex", + "type": "object", + "description": "Options for allele calling with locidex.", + "default": "", + "properties": { + "lx_min_evalue": { + "type": "number", + "default": 0.0001, + "description": "Minimum e-value required for match.", + "minimum": 0, + "maximum": 100 + }, + "lx_min_dna_len": { + "type": "integer", + "default": 1, + "description": "Global minimum query length of DNA strand.", + "minimum": 1 + }, + "lx_min_aa_len": { + "type": "integer", + "default": 1, + "description": "Global minimum query length of an AminoAcid strand.", + "minimum": 1 + }, + "lx_max_dna_len": { + "type": "integer", + "default": 10000000, + "description": "Global maximum query length of DNA strand.", + "minimum": 10 + }, + "lx_max_aa_len": { + "type": "integer", + "default": 10000000, + "description": "Gloval maximum query length of AminoAcid strand.", + "minimum": 10 + }, + "lx_min_dna_ident": { + "type": "number", + "default": 80, + "description": "Global minimum DNA percent identity required for match. (Float value)", + "minimum": 0, + "maximum": 100 + }, + "lx_min_aa_ident": { + "type": "number", + "default": 80, + "description": "Global minimum minimum AminoAcid percent identiy required for match. (Float value)", + "minimum": 0, + "maximum": 100 + }, + "lx_min_dna_match_cov": { + "type": "number", + "default": 80, + "description": "Gloval minimum DNA percent hit coverage identity required for match. (float)", + "minimum": 0.001, + "maximum": 100 + }, + "lx_min_aa_match_cov": { + "type": "number", + "default": 80, + "description": "Global minimum AminoAcid hit coverage identity required for match. 
(float)", + "minimum": 0, + "maximum": 100 + }, + "lx_max_target_seqs": { + "type": "integer", + "default": 10, + "description": "Maximum number of sequence hits per a query", + "minimum": 1 + }, + "lx_extraction_mode": { + "type": "string", + "default": "raw", + "description": "Different ways to run locidex.", + "enum": [ + "snps", + "trim", + "raw", + "extend" + ] + }, + "lx_report_mode": { + "type": "string", + "default": "normal", + "description": "Allele profile assignment.", + "enum": [ + "normal", + "conservative" + ] + }, + "lx_report_prop": { + "type": "string", + "default": "locus_name", + "description": "Metadata label to use for aggregation. Only alphanumeric characters, underscores and dashes are allowed in names", + "pattern": "^[A-Za-z0-9_-]*$" + }, + "lx_report_max_ambig": { + "type": "integer", + "default": 0, + "description": "Maximum number of ambiguous characters allowed in a sequence.", + "minimum": 0 + }, + "lx_report_max_stop": { + "type": "integer", + "default": 0, + "description": "Maximum number of internal stop codons allowed in a sequence.", + "minimum": 0 + } + } + }, "data_processing_thresholds": { "title": "Data processing thresholds", "type": "object", @@ -505,7 +685,7 @@ "description": "Minimum number of reads a sample requires to move forward for assembly", "minimum": 1 }, - "ba_min_conting_length": { + "ba_min_contig_length": { "type": "integer", "default": 200, "description": "Minimum contig length for processing in Bakta", @@ -533,7 +713,8 @@ "properties": { "nanopore_chemistry": { "type": "string", - "description": "The guppy base calling model. See the docs for a link of valid options" + "description": "The guppy base calling model. 
See the docs for a link of valid options", + "pattern": "^[A-Za-z0-9_-]*$" }, "flye_read_type": { "type": "string", @@ -567,9 +748,18 @@ { "$ref": "#/definitions/control_flow_options" }, + { + "$ref": "#/definitions/ectyper" + }, + { + "$ref": "#/definitions/sistr" + }, { "$ref": "#/definitions/fastp_options" }, + { + "$ref": "#/definitions/locidex" + }, { "$ref": "#/definitions/data_processing_thresholds" }, @@ -577,4 +767,4 @@ "$ref": "#/definitions/other" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/annotate_genomes.nf b/subworkflows/local/annotate_genomes.nf index 89afc111..2b86ad17 100644 --- a/subworkflows/local/annotate_genomes.nf +++ b/subworkflows/local/annotate_genomes.nf @@ -3,14 +3,12 @@ include { BAKTA_ANNOTATE } from '../../modules/local/bakta_annotate.nf' include { ABRICATE } from "../../modules/local/abricate.nf" include { MOBSUITE_RECON } from "../../modules/local/mob_recon.nf" include { STARAMR } from "../../modules/local/staramr.nf" -include { STARAMR_DUMP_DB_VERSIONS } from "../../modules/local/staramr_version.nf" include { IDENTIFY_POINTDB } from "../../modules/local/select_pointfinder.nf" workflow ANNOTATE_GENOMES { take: contig_data // val(meta), path(assembly) top_hit // val(meta), val(species) - // TODO add in species so that point finder can run main: versions = channel.empty() @@ -63,15 +61,24 @@ workflow ANNOTATE_GENOMES { } if(!params.skip_staramr){ - // TODO test and verify + def db_star = [] // set default value for database if(params.staramr.db){ db_star = Channel.value("${params.staramr.db}") } + // Dump db versions - STARAMR_DUMP_DB_VERSIONS(db_star) + // Removed as data is in results + + point_finder_organism = channel.empty() + if(params.skip_species_classification){ + point_finder_organism = contig_data.map{ meta, assembly -> + tuple(meta, params.staramr.point_finder_db_default) + } // Add in default null value for StarAMR + }else{ + point_finder_organism = IDENTIFY_POINTDB(top_hit).pointfinder_db + } - 
point_finder_organism = IDENTIFY_POINTDB(top_hit).pointfinder_db // Report point finder databases used reports = reports.mix(point_finder_organism.map{ diff --git a/subworkflows/local/clean_reads.nf b/subworkflows/local/clean_reads.nf index b2de50f0..f8fc73d6 100644 --- a/subworkflows/local/clean_reads.nf +++ b/subworkflows/local/clean_reads.nf @@ -15,7 +15,37 @@ include { SEQTK_SAMPLE } from '../../modules/local/seqtk_sample.nf' +process PUBLISH_FINAL_READS { + tag "$meta.id" + label "process_low" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*/*"), emit: final_reads + path "versions.yml", emit: versions + + script: + """ + mkdir ${meta.sample} + for i in ${reads.join(" ")} + do + mv \$i ${meta.sample}/ + done + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mkdir: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + mv: \$(echo \$(touch --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ +} + + workflow QC_READS { + // TODO add in nanoplot for nanopore data take: reads // channel [[meta etc], [Read paths], opt: long reads] @@ -187,11 +217,13 @@ workflow QC_READS { ch_cleaned_reads = CHECK_ONT(ch_cleaned_reads) } + published_reads = PUBLISH_FINAL_READS(ch_cleaned_reads) + versions = versions.mix(published_reads.versions) + + emit: trimmed_reads = ch_cleaned_reads // channel: [val(meta), [ reads ]] - //genome_size = PARSE_KAT.out.genome_size genome_size = genome_sizes - //heterozygozity = PARSE_KAT.out.heterozygozity reports = reports versions = versions } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 676b5dcc..2792d88b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -20,6 +20,7 @@ workflow INPUT_CHECK { meta -> 
tuple(meta.id[0], meta[0]) } + if(params.opt_platforms.ont == params.platform && params.nanopore_chemistry == null){ exit 1, "ERROR: Nanopore data was selected without a model being specified." } diff --git a/subworkflows/local/locidex.nf b/subworkflows/local/locidex.nf new file mode 100644 index 00000000..d8657de0 --- /dev/null +++ b/subworkflows/local/locidex.nf @@ -0,0 +1,63 @@ +include { LOCIDEX_EXTRACT } from "../../modules/local/locidex_extract.nf" +include { LOCIDEX_SEARCH } from "../../modules/local/locidex_search.nf" +include { LOCIDEX_REPORT } from "../../modules/local/locidex_report.nf" + + + +workflow LOCIDEX { + take: + contigs // val(meta), path(contigs) + top_hit // val(meta), top_hit + + main: + reports = Channel.empty() + versions = Channel.empty() + + paired_species = top_hit.join(contigs) + + paired_dbs = paired_species.map{ + meta, top_hit, contigs -> tuple(meta, contigs, id_scheme(top_hit)) + }.branch{ + paired: it[2] + fallthrough: true + } + + // TODO add to reports the database for allele calls to report + paired_dbs.fallthrough.subscribe { + log.info "No allele scheme identified for ${it[0].id}." + } + + extracted_lx = LOCIDEX_EXTRACT(paired_dbs.paired) + versions = versions.mix(extracted_lx.versions) + + allele_calls = LOCIDEX_SEARCH(extracted_lx.extracted_seqs) + versions = versions.mix(allele_calls.versions) + + report_lx = LOCIDEX_REPORT(allele_calls.allele_calls) + versions = versions.mix(report_lx.versions) + + emit: + versions + + +} + +def id_scheme(top_hit){ + /* Pick the correct allele scheme based off of the species top-hit + */ + + def selected_scheme = params.allele_scheme + if(selected_scheme){ + return selected_scheme + } + + for( scheme in params.locidex.schemes){ + search_param = scheme.value.search.search + if(top_hit.contains(search_param)){ + selected_scheme = scheme.value.db ? 
file(scheme.value.db) : null // Need a value present that will send the scheme to the fallthrough case + break + } + } + + return selected_scheme +} diff --git a/subworkflows/local/qc_assemblies.nf b/subworkflows/local/qc_assemblies.nf index 3c8d60c6..05283acc 100644 --- a/subworkflows/local/qc_assemblies.nf +++ b/subworkflows/local/qc_assemblies.nf @@ -5,6 +5,36 @@ include { SEQKIT_FILTER } from "../../modules/local/seqkit_filter.nf" include { CHECKM_LINEAGEWF } from "../../modules/local/checkm_lineagewf.nf" include { MLST } from "../../modules/local/mlst.nf" + +process PUBLISH_FINAL_ASSEMBLIES { + tag "$meta.id" + label "process_low" + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + + input: + tuple val(meta), path(contigs), path(reads) + + output: + tuple val(meta), path("*/*"), emit: final_assembly + path "versions.yml", emit: versions + + script: + """ + mkdir ${meta.sample} + for i in ${contigs.join(" ")} + do + mv \$i ${meta.sample}/ + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mkdir: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + mv: \$(echo \$(touch --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ +} + workflow QC_ASSEMBLIES { take: assembled_reads // tuple val(meta), path(contigs), path(reads) @@ -51,10 +81,17 @@ workflow QC_ASSEMBLIES { }) min_length = Channel.value(params.quast.min_contig_length) - filterd_contigs = SEQKIT_FILTER(pre_checked_data, min_length) - versions = versions.mix(filterd_contigs.versions) + + if(!params.skip_length_filtering_contigs){ + filterd_contigs = SEQKIT_FILTER(pre_checked_data, min_length) + versions = versions.mix(filterd_contigs.versions) + assembled_reads = filterd_contigs.filtered_sequences + } + + pub_final_assembly = PUBLISH_FINAL_ASSEMBLIES(assembled_reads) + versions = versions.mix(pub_final_assembly.versions) 
if(!params.skip_checkm){ CHECKM_LINEAGEWF(assembled_reads.map{ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.fasta b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.fasta new file mode 100755 index 00000000..a03cb89d --- /dev/null +++ b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.fasta @@ -0,0 +1,106 @@ +>0 +AAATTCCGTCCCGGACATGCGGACTACACCTATCACCAAAAATACGGTGTGCGAGATTACCGTGGCGGCGGCCGTTCATCGGCACGTGAAACCGCCATGCGTGTTGCTGCGGGAGCGATTGCCAAAAAATATCTGCAGCAAGAGTTTGGCATTGAAGTGCGTGCTTACTTGTCGCAAATGGGGGATGTCGCGATTGATAAAGTGGATTGGAATGAGATTGAAAACAACGATTTCTTCTGTCCTGATGTCGATAAAGTGGCTGCGTTTGACGAGCTGATCCGCGAGCTGAAAAAAGAAGGCGATTCGATCGGCGCGAAAATCCAAGTGGTCGCTACAGGCGTGCCGGTTGGACTGGGTGAGCCTGTGTTTGATCGCTTAGATGCGGATATTGCCCATGCCTTGATGAGCATCAACGCCGTGAAAGGAGTCGAGATTGGTGATGGCTTTGATGTGGTGCGCCAAAAAGGCAGCCAACACCGTGACCCGCTCACTCCACAAGGT +>1 +GTTTTCCGCCCGGGCCATGCCGACTATACCTACGAGCAGAAATACGGTCTGCGCGATTACCGTGGCGGCGGTCGTTCTTCCGCCCGTGAAACGGCGATGCGCGTCGCGGCTGGCGCGATTGCTAAAAAATATCTGGCGGAGAAACACGGCATCGTCATTCAGGGGTGTCTGACCCAGATGGGCGATATTCCGCTTGAAATCAAAGACTGGCAGCAGGTTGAACAAAACCCGTTTTTCTGTCCTGATCCAGATAAAATCGACGCGCTGGATGAACTGATGCGCGCCCTGAAGAAAGAGGGCGATTCGATTGGGGCAAAAGTGACCGTCGTGGCAAACGGCGTTCCGGCCGGGCTTGGCGAACCGGTCTTTGACCGTCTGGATGCGGACATCGCTCATGCGCTGATGAGCATCAACGCGGTAAAAGGCGTGGAGATTGGCGATGGGTTTGATGTGGTCGCGTTGCGAGGCAGCCAGAATCGCGATGAAATTACCAAAGAGGGC +>2 +GTTTTCCGTCCAGGACACGCTGACTATACCTATGAGCAGAAATATGGCCTGCGCGACTACCGTGGCGGCGGACGTTCATCCGCGCGTGAAACGGCGATGCGCGTTGCGGCTGGCGCGATTGCCAAAAAATATCTGGCGGAAAAATTCGGCGTTGAAATTCGCGGCTGTCTGACGCAGATGGGGGATATTCCGCTGGAGATCAAAGACTGGTCTCAGGTGGAGCTTAACCCGTTCTTTTGTCCAGACCCGGATAAAATCGAAGTGCTGGACGAACTGATGCGCGGGCTGAAGAAAGAGGGCGACTCCATCGGGGCAAAAGTGACCGTTGTTGCAAGCGGCGTACCGGCGGGTCTCGGCGAACCTGTATTCGACCGTCTGGATGCCGACATCGCCCATGCGCTGATGAGCATTAACGCCGTTAAGGGCGTTGAGATTGGCGACGGTTTTGACGTTGTTGCGCTGCGCGGCAGTCAGAACCGCGATGAGATCACCAAAGAAGGT +>3 
+GTTTTCCGCCCAGGGCATGCTGATTATACCTATGAACAAAAATATGGTTTGCGTGATTATCGTGGTGGTGGACGTTCTTCTGCTCGTGAAACGGCAATGCGTGTCGCCGCAGGTGCGATTGCTAAAAAATATCTAAAAGAGAAATTAGGCATCGAAGTTCGAGGATATCTTTCTCAGCTAGGACCTATTACATGTGATCTTGTTGATTGGTCTATTGTTGAAAGCAATCCATTTTTCTGTCCTGATCCTTCACGTTTAGATGCGCTTGATGAATACATGCGTGCACTTAAAAAAGAAGGTAATTCTATTGGTGCAAAAGTCACTGTGGTTGCACAGGGTGTACCTGCTGGATTTGGTGAACCTGTCTTTGATCGATTAGATGCTGATTTAGCGCATGCTTTGATGAGTATCAATGCTGTCAAAGGTATAGAAATTGGTGATGGATTTGGTGTTGTAACATTAAAAGGTACAGAAAACCGAGATGAAATCACTAAAAAGGGA +>4 +GTTTTCCGTCCAGGCCATGCCGATTACACCTACGAACAAAAATACGGTCTGCGCGATTATCGCGGCGGCGGGCGCTCTTCCGCCCGCGAAACCGCCATGCGCGTGGCGGCAGGGGCGATTGCAAAAAAATATCTCGCCGAGAAATTTGGCATTGAGATTCGCGGCTGCCTGACCCAGATGGGTGACATTCCGCTGGAAATCAAAGACTGGTCGCAGGTCGAGCAAAATCCGTTTTTCTGCCCGGACCCGGACAAAATCGACGCGTTAGATGAACTGATGCGCGCGCTGAAAAAAGAGGGCGACTCCATCGGCGCGAAAGTCACCGTTGTTGCCAGTGGCGTCCCCGCCGGACTTGGCGAGCCGGTCTTTGACCGCCTGGATGCCGACATCGCCCATGCGCTGATGAGCATCAACGCGGTGAAAGGCGTAGAAATTGGTGATGGTTTTGACGTGGTGGCGCTGCGTGGCAGCCAGAACCGCGACGAAATCACCAAAGACGGT +>5 +GTTTTCCGTCCTGGTCACGCCGACTATACCTACGAACAAAAATATGGCTTTCGCGACTATCGCGGCGGCGGGCGTTCTTCCGCGCGTGAAACCGCGATGCGCGTGGCGGCAGGGGCAATTGCCAAAAAATATCTCCAGCAGAAATTCGGCATCGTTATCCGCGGCTGTCTGTCCCAGATGGGCGACATTCCGCTGGCAATCAAAGACTGGGATCAGGTAGAGCTCAACCCGTTCTTCTGCGCCGATGCCGACAAGCTGGACGCGCTGGATGAGCTGATGCGTGGCCTGAAAAAAGAGGGCGACTCCATTGGTGCGAAAGTCACCGTGGTGGCCGACGGCGTGCCGGCTGGCTGGGGCGAGCCGGTATTTGACCGCCTTGACGCCGACATCGCCCACGCGCTGATGAGCATCAACGCGGTGAAAGGCGTCGAAATCGGCGACGGTTTTGACGTGGTCAAGCTTCGCGGCAGCCAGAACCGCGACGAAATCACGAAGGCGGGT +>6 
+GTGTTCCGTCCGGGGCACGCGGATTACACCTACGAACAAAAATACGGCCTGCGCGACTATCGCGGCGGCGGGCGTTCATCCGCCCGTGAAACCGCCATGCGCGTCGCGGCAGGCGCTATCGCCAAAAAATATCTGGCGCAGAAATTCGGCGTGGTGATTCGCGGCTGCCTGACCCAGATGGGTGATATTCCGCTGGAAATCAAAGACTGGGATCAGGTAGAGCAAAACCCGTTCTTCTGCCCGGACCCGGATAAAATCGAGGCGCTGGATGAGCTGATGCGCGCTCTGAAAAAAGAGGGCGATTCCATCGGCGCGAAAGTCACCGTGGTGGCCGACAGCGTGCCCGCCGGGCTTGGCGAGCCGGTATTTGACCGCCTGGACGCCGATATCGCCCACGCGCTGATGAGCATTAACGCCGTGAAGGGCGTGGAAATCGGCGACGGTTTCGGCGTGGTGCAACTGCGCGGCAGCCAGAACCGCGACGAAATCACCACTGCCGGT +>7 +ATGGAGATGGTCGCGCGCGTTACGCTTTCTCAGCCGCATGAGCCAGGCGCCACTACCGTGCCGGCGCGGAAATTCTTTGATATCTGCCGCGGCCTGCCGGAGGGCGCGGAGATTGCCGTTCAGTTGGAAGGCGATCGGATGCTGGTGCGTTCTGGCCGTAGCCGCTTCTCGCTGTCTACGCTGCCTGCCGCCGATTTCCCGAATCTTGACGACTGGCAAAGCGAAGTTGAATTTACGCTGCCGCAGGCCACGATGAAGCGCCTGATTGAAGCGACCCAGTTTTCGATGGCCCATCAGGATGTGCGCTACTACTTAAACGGTATGCTGTTTGAAACGGAAGGTAGCGAACTGCGCACTGTTGCGACCGACGGCCACCGTCTGGCGGTGTGCTCAATGCCGCTGGAGGCGTCTTTACCTAGCCACTCGGTGATTGTGCCGCGTAAAGGCGTGATTGAACTGATGCGTATGCTCGACGGTGGCGAAAACCCGCTGCGCGTGCAG +>8 +ATGGAGATGGTCGCGCGCGTTACGCTTTCTCAGCCGCATGAACCCGGCGCTACTACCGTGCCGGCGCGGAAATTCTTTGATATCTGCCGTGGCCTGCCGGAAGGGGCGGAAATCGCCGTTCAGCTGGAGGGCGATCGGATGCTGGTGCGTTCTGGCCGTAGTCGCTTTTCGCTGTCTACCTTACCGGCAGCAGACTTCCCGAATCTGGATGACTGGCAAAGCGAAGTGGAATTCACGCTGCCTCAGGCGACGATGAAACGCTTGATTGAGGCCACCCAGTTTTCGATGGCCCATCAGGACGTGCGCTACTACCTGAACGGTATGTTGTTTGAAACGGAAGGAAGCGAACTGCGCACCGTCGCGACCGACGGCCACCGTCTGGCGGTCTGTTCAATGCCGCTGGAGGCCTCTTTACCGAGCCATTCAGTGATCGTACCGCGTAAAGGCGTGATTGAACTGATGCGTATGCTTGACGGCGGTGAAAATCCACTGCGTGTACAG +>9 
+ATGGAAATGGTGGCGCGCGTTGCGTTGATTCAGCCTCATGAACCAGGCGCAACTACCGTCCCGGCGCGGAAATTCTTTGATATCTGCCGTGGCTTGCCGGAAGGGGCTGAAATTGCCGTCCAGCTGGAAGGCGATCGGATGCTGGTGCGCTCCGGGCGTAGCCGTTTCTCGCTTTCCACGCTGCCTGCCGCCGATTTCCCTAATCTGGATGACTGGCAGAGCGAAGTCGAATTCACCCTGCCGCAGGCAACGATGAAGCGCCTGATTGAAGCCACCCAGTTCTCAATGGCGCATCAGGACGTGCGTTACTACTTAAACGGCATGCTGTTTGAGACTGAAGGTGAAGAGTTGCGTACCGTCGCGACCGACGGTCACCGTCTGGCGGTCTGCTCTATGCCGGTCGGGCAATCTCTGCCTAACCATTCGGTGATTGTGCCGCGTAAAGGCGTGATTGAGCTGATGCGTATGCTCGACGGCGGCGAAACCCCGCTGCGCGTACAG +>10 +ATGGAGATGGTGGCGCGCGTGGCGCTGATCCAGCCTCATGAACCTGGTGCGACCACCGTTCCGGCGCGTAAATTCTTCGATATTTGCCGTGGATTACCAGAAGGGGCGGAAATTGCCGTTCAACTGGAAGGCGACCGTATGCTGGTGCGTTCTGGCCGCAGCCGTTTCTCGCTGTCTACGCTGCCTGCCGCCGACTTCCCGAATCTGGACGACTGGCAGAGCGAAGTCGAATTCACCCTGCCACAGGCGACAATGAAGCGCCTGATTGAAGCCACGCAGTTTTCGATGGCGCATCAGGACGTGCGTTACTACTTAAACGGCATGCTGTTTGAAACCGAAGGGGAAGAGTTGCGTACCGTGGCGACCGACGGTCACCGCCTGGCGGTCTGTTCAATGCCTGTCGGTCAGCCGTTGCCTAGCCATTCGGTGATCGTACCGCGTAAAGGTGTGATTGAACTGATGCGTATGCTCGACGGCGGCGATAACCCGCTGCGCGTGCAG +>11 +ATGGAAATGGTGGCACGCGTTGCGCTGGTTCAGCCGCACGAACCAGGGGCGACGACCGTTCCAGCGCGCAAATTCTTTGATATCTGCCGTGGTCTGCCTGAAGGCGCGGAAATTGCCGTGCAGCTGGAAGGTGAGCGGATGCTGGTGCGCTCCGGGCGTAGCCGTTTTTCGCTGTCTACCCTGCCAGCGGCGGATTTCCCGAATCTCGATGACTGGCAGAGCGAAGTCGAATTTACCCTGCCGCAGGCGACGATGAAGCGTCTGATTGAAGCGACCCAGTTTTCTATGGCGCATCAGGACGTTCGCTATTACTTAAACGGTATGCTGTTTGAAACCGAAGGTGAAGAACTGCGCACCGTGGCGACCGACGGCCACCGTCTGGCAGTCTGTTCAATGCCAATTGGTCAATCTTTGCCAAGCCATTCGGTGATCGTGCCGCGTAAAGGCGTGATTGAACTGATGCGTATGCTCGACGGCGGCGACAATCCGCTGCGCGTGCAG +>12 
+ATGGAAATGATCGCGCGCGTTACGCTGACTCAGCCGCACGACGCGGGCGCGACCACGGTTCCGGCACGTAAATTCTTTGATATTTGCCGTGGGCTGCCGGAAGGCGCTGAAATCGCAGTGCAGCTGGAGGGCGACCGCATGCTGGTGCGCTCTGGCCGCAGCCGTTTCTCCCTCTCCACGTTGCCCGCTGCGGACTTCCCGAACCTGGATGACTGGCAGAGCGAAGTTGAATTTACCCTGCCGCAGGCGACGATGAAGCGTCTGATTGAAGCCACGCAGTTCTCCATGGCGCATCAGGACGTTCGTTACTACTTAAACGGCATGCTGTTCGAAACCGAAGGTGAAGAGCTGCGTACCGTGGCGACCGACGGTCACCGTCTGGCGGTTTGTTCCATGCCGATTGGCGATTCACTGCCAAACCATTCGGTGATCGTACCGCGTAAAGGCGTAATTGAACTGATGCGTATGCTCGACGGCGGTGAAACGCCGCTGCGCGTGCAG +>13 +ATGGAGATGATCGCGCGTGTGGCGCTGTCGCTACCGCACCAGGCGGGCGCGACCACCGTGCCGGCGCGCAAATTCTTCGATATCTGCCGTGGCTTGCCGGAAGGGGCGGAAATCGCCGTTACGCTGGAAGGCGACAGAATGCTGGTGCGCTCCGGGCGCAGCCGCTTCTCGCTGTCTACGTTACCGGCGGCAGACTTCCCGAATCTGGACGACTGGCAGAGCGAAGTGGAGTTCACGCTCCCGCAGGCCACCATGAAGCGCCTGATCGAAGCGACCCAGTTCTCCATGGCCCATCAGGACGTGCGGTATTACCTGAACGGGATGCTGTTTGAAACCGAAGGCGAAGAGCTGCGCACCGTGGCGACTGACGGCCACCGTCTGGCGGTATGCGCGATGCCGGTAGGCCAACCGCTGCCAAACCATTCGGTGATTGTACCGCGTAAAGGCGTGCTGGAGCTGATGCGTATGCTCGATGGCGGCGACAGCCCGCTGCGCATTCAG +>14 +TCGGCGCTGACGGAAAACGATCTGGTCTTCGCCCTCTCGCAGCACGCCGTCACCTTTGCAGATGCCGAGCTTCAGCAACAAGGGAAAAGCTGGCCCTCCCTTCCGCGTTATTTTGCCATTGGTCGCACAACGGCGCTGGCGCTGCATACCGTTAGCGGTTTCAATATTCACTACCCTCTGGATCGGGAAATTAGCGAAGTCTTGCTACAATTACCTGAATTACAAAATATTGCGGGAAAACGCGCGCTTATATTACGCGGCAATGGTGGCCGTGAGCTGATAGGTGAAACCCTGACAGCACGCGGAGCTGATGTCGATTTTTGTGAATGTTATCAACGCAGTGCAAAATATTACGATGGTGCAGAAGAAGCGATGCGCTGGCAATCTCGTGGTGTGACCACGGTGGTTGTCACCAGCGGAGAGATGCTACAA +>15 +GCGGCGCTGGGGGAGAGCGATCTGTTGTTTGCCCTCTCGCAACACGCGGTTGCTTTTGCCCAATCACAGCTGCATCAGCAAGATCGTAAATGGCCCCGACTACCTACTTATTTCGCCATTGGACGCACCACCGCACTGGCGCTACATACCGTAAGCGGACAGAAGATTCTCTACCCGCAGGATCGGGAAATCAGCGAAGTCTTGCTACAATTACCTGAATTACAAAATATTGCGGGCAAACGTGCGCTGATATTACGTGGCAATGGCGGTCGTGAGCTAATTGGGGATACCCTGACGGCGCGCGGTGCTGAGGTCACTTTTTGTGAATGTTATCAACGATGCGCAATCCATTACGATGGTGCAGAAGAAGCGATGCGCTGGCAATCCCGCGAGGTGACGACGGTCGTTGTTACCAGCGGTGAAATGTTGCAG +>16 
+GCGACGTTGACGGAAAACGATCTGGTTTTTGCCCTTTCACAGCACGCCGTCGCCTTTGCCCACGCCCAACTCCAGCGAGATGGTCGAAACTGGCCTGCGTCGCCGCGCTATTTCGCGATTGGTCGCACCACGGCGCTCGCCCTTCATACCGTTAGCGGGTTCGATATTCGTTATCCATTGGATCGGGAAATCAGCGAAGTCTTGCTACAATTACCTGAATTACAAAATATTGCGGGCAAACGCGCGCTGATTTTGCGTGGCAATGGCGGTCGCGGTCGCGAACTGCTGGGCGAAACCCTGACAGCTCGCGGAGCCGAAGTCAGTTTTTGTGAATGTTATCAACGAAGTGCGAAACATTACGATGGCGCAGAAGAGGCGATGCGCTGGCACACTCGCGGCGTAACGACGCTTGTTGTCACCAGCGGCGAGATGTTGCAA +>17 +GCGGCGCTCACGGACAACGATCTGGTGTTCGCCCTCTCGCAACACGCCGTCGCCTTTGCCCACGCCCAACTGCAACAGCAGGAGCTGGACTGGCCTGTGCAACCACGCTACTTCGCCATCGGGCGCACAACGGCGCTGGCGCTGCATACCGTTAACGGATGCGATATTCGCTATCCTCTGGATCGGGAAATCAGCGAAGTCTTGCTACAATTACCTGAATTACAAAATATTGCGGGAAAACGAGCGCTTATTTTACGGGGCAACGGCGGGCGTGAACTGTTAGGCAAAACCCTCACAGAACGCGGCGCTGAAGTCACCTTTTGTGAATGTTATCAACGCAGTGCAAAACATTACGATGGCGCGGAAGAGGCGATGCGCTGGCACTCTCGCGGCGTGACGACGATTGTTGTCACCAGCGGCGAAATGCTGCAA +>18 +GAAACACTTGGCGATAACGATCTGCTCTTTGCACTTTCTCAACATGCAGTGTCATTCGCCCATGCGCAGTTGCAACAGCAGGGGCTAAACTGGCCATCACTTCCGCATTATTTCGCTATTGGCCGTACTACCGCTCTCGCCCTGCACACCGTAAGCGGACATAAGATTCGCTATCCACAAGATCGGGAAATCAGCGAAGTCTTGCTACAATTACCGGAATTACAAAGTATTGCGGGAAAACGCGCACTTATTTTGCGCGGTAACGGCGGCCGTGAATTGATCGGTCAGACGCTGACATCACGTGGTGCCGACGTTACTTTTTGTGAATGTTATCAACGCAGTGCGAAGCATTACGATGGTGCGGAAGAAGCTATGCGCTGGCAGTCTCGCGGCGTAACAACCGTCGTTGTAACCAGCGGTGAAATGCTGCAA +>19 +CGTCTCTTGCAGGAAGGCGATCTGCTCTTTGCGCTGTCGCAGCATGCCGTGGAGTTTGCCCATGCGCAGCTGCAACAGCATGCCGTTAGCTGGCCTCACGCCCCCCGCTATTTCGCCATCGGGCGCACCACGGCGCTGGCCTTACATACCGCGAGCGGAATCGATGTTCGTTACCCGTTAGATCGGGAAATCAGCGAAGTCTTGCTACAATTACCTGAATTACAAACCATTGCCGGAAAGCGCGCGCTCATTTTGCGCGGCAACGGTGGCCGCGAACTGCTGGGCGAAACGCTGCGCGAACGCGGCGCAGACGTGACGTTTGTGGAGTGCTATCAGCGCTGTGCGAAACACTATGATGGCGCGGAAGAAGCAATGCGCTGGCACGCCCGCGGTATTAATACGCTGGTGGTCACCAGCGGTGAAATGTTACAA +>20 
+ATTGCGGGATGCCAGAAGGTGGTTCTGTGCTCGCCGCCACCCATCGCTGATGAAATCCTCTATGCGGCGCAACTGTGTGGCGTGCAGGAAATCTTTAACGTCGGCGGCGCGCAGGCGATTGCCGCTCTGGCCTTCGGCAGCGAGTCCGTACCGAAAGTGGATAAAATTTTTGGCCCCGGCAACGCCTTTGTAACCGAAGCCAAGCGTCAGGTCAGCCAGCGTCTCGACGGCGCGGCTATCGATATGCCAGCCGGGCCGTCTGAAGTGCTGGTGATCGCCGACAGCGGCGCAACACCGGATTTCGTCGCTTCTGACCTGCTCTCCCAGGCTGAGCACGGCCCGGATTCCCAGGTGATCCTGCTGACGCCGGATGCTGACATTGCCCGCAAGGTGGCGGAGGCGGTAGAACGTCAACTGGCGGAACTGCCGCGCGCGGGCACCGCCCGGCAGGCCCTGAGCGCCAGTCGTCTGATTGTGACCAAAGATTTAGCGCAGTGCGTC +>21 +ATTGCCGGATGCAAAAAAGTGGTGTTGTGCTCGCCACCGCCTATCGCGGATGAAATCCTTTACGCTGCGCAGCTGTGCGGCGTGCAGGAAATCTTCAACGTCGGCGGCGCCCAGGCCATTGCCGCTCTGGCGTTCGGCAGCGAATCCGTGCCAAAAGTGGACAAAATTTTTGGCCCCGGCAACGCGTTTGTCACCGAGGCGAAACGCCAGGTCAGCCAGCGTCTCGACGGCGCGGCAATTGATATGCCTGCCGGCCCTTCTGAAGTGCTGGTGATCGCCGACAGCGGCGCCACGCCAGATTTCGTGGCGTCTGACCTGCTCTCTCAGGCGGAACACGGCCCGGATTCTCAGGTCATCCTGCTGACCCCGGATGCCGGTATTGCGCAGAACGTCGCAGAGGCCGTCGAACGCCAGTTAGCGGAGTTACCGCGTGCAGAAACGGCGCGTCAGGCATTAAGCGCCAGCCGTCTGATCGTGACGAAAGACTTAGCCCAGTGCGTC +>22 +ATTGCAGGCTGTAAAAAAGTGGTGTTGTGCTCTCCCCCACCTATCGCCGATGAAATTCTGTATGCTGCGCAGCTCTGCGGCGTACAGGATGTGTTTAACGTTGGGGGCGCACAAGCTATTGCCGCGCTGGCATTTGGCAGTGAATCCGTGCCGAAAGTGGACAAAATTTTTGGCCCCGGTAATGCCTTTGTGACCGAAGCCAAACGTCAGGTGAGTCAGCGTCTGGACGGCGCCGCCATCGATATGCCAGCAGGTCCGTCTGAAGTGCTGGTGATTGCCGACAGCGGCGCCACGCCGGATTTCGTTGCCTCTGACTTACTCTCGCAGGCCGAACACGGCCCCGATTCCCAAGTGATCCTGCTGACGCCGGATGCCGGTATGGCCAGCCGGGTTGCTGAAGCAGTAGAACGCCAGCTTGCAGCGCTGCCACGCGCTGAAACCGCGCGGCAGGCGTTAAGCGCCAGTCGTCTGATTGTCACCCGCTCCCTTGCGCAATGCGTA +>23 
+ATTGCGGGCTGTAAAAAAGTGGTGCTGTGCTCACCGCCGCCGATTGCCGATGAGATCCTTTACGCGGCGCAGCTGTGCGGTGTGCAGGACGTGTTTAACGTCGGCGGCGCACAGGCCATTGCCGCGCTGGCGTTTGGTACAGAATCCGTGCCGAAAGTGGACAAAATCTTCGGGCCAGGTAACGCCTTTGTCACCGAGGCAAAACGTCAGGTGAGCCAGCGTCTGGACGGTGCGGCGATCGATATGCCCGCAGGCCCGTCGGAAGTGCTGGTGATTGCTGACAGCGGCGCAACGCCGGATTTCGTGGCTTCTGATTTGCTCTCCCAGGCTGAACACGGCCCGGACTCTCAGGTGATTTTACTGACGCCCGCTGCTGATATGGCGCGTCGCGTAGCCGAAGCTGTCGAACGCCAGCTGGCAGAACTGCCGCGAGCTGAAACCGCCCGCCAGGCACTGAACGCCAGCCGCCTGATCGTGACTAAAGATTTAGCGCAGTGCGTG +>24 +ATTGCCGGTTGTCAGAAGGTGGTGCTCTGCTCTCCTCCACCGATCGCCGATGAGATCCTGTACGCGGCGAAGCTGTGCGGCGTGCAGGCGATCTATAAAGTGGGCGGTGCGCAGGCGATTTCTGCCCTGGCGTTCGGAACAGTATCCATTCCTAAGGTCGACAAAATCTTTGGCCCGGGCAATGCCTACGTGACCGAGGCGAAGCGCCAGGTCAGCCAGCGTCTGGACGGCGCGGCGATTGATATGCCTGCCGGTCCGTCTGAAGTGCTGGTGATTGCCGACAGCGGCGCTACACCGGATTTCGTGGCCTCTGACCTGCTCTCGCAGGCCGAGCACGGCCCTGACTCGCAGGTGATTTTACTGACGCCAGATGCCGACATGGCAAAACGCGTGGGCGACGCCGTTGAGCGTCAGCTGGCTGACCTGCCGCGGGCGGAAACGGCGCGTCAGGCGCTATCCGCCAGCCGCCTGATTGTGGCCCGCGATCTTGACCAGTGCATC +>25 +ATCGCCGGCTGTAAAAAAGTGGTGCTGTGCTCGCCGCCGCCGATTGCCGATGAAATCCTCTACGCCGCGCAACTCTGTGGCGTGAAAGAAGTGTTTAACGTGGGTGGCGCACAGGCCATTGCCGCGCTGGCGCTGGGCACGGAGTCTATTCCAAAAGTCGATAAAATCTTTGGGCCGGGCAACGCCTATGTGACCGAAGCCAAGCGCCAGGTCAGCCAGCGTCTTGACGGCGCGGCAATCGATATGCCCGCCGGACCGTCCGAAGTATTGGTTATCGCCGACAGCGGCGCAACGCCGGATTTTGTCGCCTCCGACCTGCTTTCTCAGGCCGAGCACGGCCCAGACTCGCAGGTGATCCTGCTGACGCCGGACGCTAAGCTTGCCGAGGGCGTGGCCGAAGCCGTTGAACGCCAGCTCGCCGAGCTGTCCCGCGCCGACACCGCGCGTCAGGCGCTCTCCGCCAGCCGTTTAATCGTAGCGAAAGATCTGGCGCAGTGCGTG +>26 
+ATCGCGGGCTGTAAAAAAGTGGTGCTGTGCTCGCCGCCGCCGATTGCCGATGAAATCCTCTATGCGGCGCGTTTGTGCGGGGTACAGCAGGTCTATCAGGTGGGCGGCGCTCAGGCCATCGCGGCGCTGGCGTTTGGCACCGAGACCGTACCCAAAGTGGACAAAATCTTCGGGCCGGGCAATGCGTTTGTCACCGAAGCCAAACGTCAGGTCAGCCAGCGGCTGGATGGCGCGGCGATTGATATGCCTGCCGGGCCGTCTGAAGTGCTGGTGATCGCCGATAGCGGCGCGACCACGGATTTCGTGGCCTCGGATTTGCTGTCCCAGGCGGAACACGGCCCGGATTCGCAGGTGATCCTGCTGACACCGGACAGCGCCATGGCGCAGGCGGTGGCCGACGCGGTTGAGCGTCAACTCGCCGAACTGCCGCGCGCGGAAACAGCTCGCCAGGCGCTGGCGGAAAGCCGCCTGATTGTGGCGCGCGATTTAGCGCAGTGCGTG +>27 +AGCGACTGGGCTACCATGCAATTCGCCGCCGAAATTTTTGACATTCTGGATATTCCGCACCATGTCGAAGTGGTTTCTGCTCACCGTACCCCCGATAAACTGTTCAGCTTTGCCGAAAATGCTGAAGAAAACGGCTTTCAGGTAATTATTGCCGGCGCGGGCGGCGCGGCGCATCTGCCAGGAATGATTGCGGCAAAAACGCTGGTGCCGGTACTTGGCGTTCCGGTACAAAGCGCTGCGCTAAGCGGTGTGGACAGTCTCTATTCTATTGTACAGATGCCGCGCGGTATTCCGGTTGGCACACTGGCCATCGGCAAAGCTGGCGCCGCTAACGCGGCGCTGCTGGCGGCGCAAATTCTGGCCACCCACGATAACGCACTGCATCAGCGCCTTCGCGAC +>28 +AGCGACTGGACTACCATGCAATTCGCCGCCGAAATTTTTGAAATTCTGGATGTTCCGCACCATGTAGAAGTGGTTTCCGCCCATCGAACCCCTGATAAACTGTTCAGCTTCGCCGAAACGGCGGAAGAGAACGGATATCACGTGATTATTGCCGGCGCGGGCGGCGCGGCGCATCTGCCGGGAATGATTGCGGCAAAAACATTGGTGCCGGTACTCGGCGTTCCGGTACAAAGCGCAGCATTAAGCGGTGTGGATAGCCTTTACTCCATTGTTCAGATGCCGCGTGGCATTCCGGTGGGTACACTGGCTATCGGCAAAGCCGGGGCTGCGAACGCCGCGCTGCTGGCAGCGCAAATTTTGGCCACACACGATAATGCGCTGCACCAGCGCCTGAGCAAC +>29 +AGCGACTGGGCTACCATGCAGTTCGCCGCAGAAATCCTCGATATTCTGAACGTACCTCACCATGTTGAAGTGGTTTCCGCCCACCGCACGCCCGATAAACTGTTCAGCTTCGCCGAAGACGCCGAAAGCAACGGTTATCAGGTGATTATTGCCGGTGCCGGCGGCGCTGCGCACTTACCCGGAATGATTGCCGCCAAAACGCTGGTCCCGGTATTAGGTGTACCCGTCCAGAGCGCCGCATTAAGCGGTGTCGATAGCCTCTACTCCATCGTGCAGATGCCGCGCGGCATTCCGGTCGGTACGCTGGCGATCGGTAAAGCCGGTGCCGCTAACGCCGCCCTGCTCGCCGCGCAGATTCTGGCGCAACACGACGCGGAACTGCATCAGCGCATCGCCGAC +>30 
+AGCGACTGGGCTACCATGCAGTTCGCCGTCGAAATCTTCGAAATCCTGAATGTCCCGCACCACGTTGAAGTGGTTTCTGCTCACCGCACCCCCGATAAACTGTTCAGCTTCGCCGAAAGCGCCGAAGAGAACGGTTATCAGGTGATTATTGCGGGCGCAGGCGGCGCAGCGCACCTGCCAGGCATGATTGCCGCCAAAACGCTGGTGCCGGTGCTGGGCGTGCCAGTACAGAGCGCCGCACTGAGCGGTGTCGATAGCCTCTACTCCATCGTACAAATGCCGCGCGGCATTCCGGTGGGTACGCTGGCGATTGGTAAAGCTGGCGCGGCAAACGCGGCATTACTGGCAGCACAAATTCTCGCGACTCACGATAAAGAGCTACACCAGCGTCTGAATGGC +>31 +AGCGACTGGGCTACCATGCAGTTTGCCGCCGAAATCTTCGATATCCTGAACGTTCCACACCACGTTGAAGTGGTTTCCGCACACCGCACCCCCGATAAGCTGTTCAGCTTCGCCGAAAGCGCCGAAGAGAAGGGTTATCAGGTGATTATTGCCGGTGCTGGCGGCGCGGCGCATCTGCCGGGAATGATTGCGGCAAAAACGCTGGTGCCGGTACTGGGCGTGCCGGTGCAAAGCGCTGCGCTGAGCGGCGTGGACAGCCTCTACTCTATCGTCCAGATGCCGCGCGGCATTCCGGTCGGCACGCTGGCGATCGGCAAAGCGGGCGCGGCGAACGCGGCGTTACTGGCAGCGCAAATTCTGGCGACACACGATAAAGACCTGCGCCAACGTCTGGCGGAC +>32 +AGCGACTGGGCTACCATGCAGTTCGCCGCCGAAATCTTCGAAATGCTGGACGTTCCGCACCATGTTGAAGTCGTCTCAGCCCACCGTACCCCTGATAAACTGTTCAGCTTCGCCGAAAGCGCTGAAGAAAACGGTTATCAGGTTATTATTGCGGGTGCTGGCGGTGCAGCGCATCTGCCGGGCATGATTGCAGCGAAAACGCTGGTCCCCGTGTTAGGCGTTCCGGTACAAAGCGCAGCGTTGAGCGGCGTAGATAGCCTCTACTCAATCGTGCAGATGCCACGCGGCATCCCCGTGGGTACGCTGGCGATTGGGAAAGCGGGTGCGGCAAATGCGGCCCTGCTGGCAGCACAAATTCTGGCAACACACGACAAAGCATTACATCAGCGTCTGAGCGAC +>33 +AGTGACTGGGCAACCATGTCTCATGCCGCAGATGTATTAGATACACTACAAATTCCTTACCATGTTGAGATTGTCTCTGCACACCGAACCCCTGATAAGTTATTTAGTTTTGCTGAAAAAGCAAAAAGTAATGGCTTTGATGTCATTATTGCTGGTGCAGGAGGAGCTGCCCATTTACCAGGAATGCTTGCAGCTAAAACGTTAGTACCCGTATTTGGTGTTCCTGTTCAAAGTGCGACATTAAGCGGTGTTGATAGCCTCTATTCAATCGTACAAATGCCAAAAGGTATCCCTGTAGGAACCTTAGCGATTGGTAAAGCAGGGGCTGCCAATGCGGCTTTATTAGCGGCTCAAGTTTTAGCGTTACATTCTCCTGCTATTTTAGATGCATTGACTGCA +>34 
+AGCGACTGGGCTACCATGCAGTTCGCCGCCGAAATCTTTGAAATCCTGAATGTTCCGCACCACGTCGAAGTGGTTTCCGCACACCGTACCCCGGACAAACTGTTCAGCTTCGCCGAAAGCGCCGAAGAGAACGGTTACGAGGTGATCATTGCCGGTGCGGGCGGCGCAGCACATCTGCCGGGCATGATTGCCGCCAAAACGCTGGTGCCGGTACTGGGTGTTCCCGTGCAAAGCGCCGCGTTAAGCGGGGTGGATAGCCTTTACTCTATTGTCCAGATGCCGCGCGGTATTCCTGTCGGTACCCTGGCGATTGGTAAAGCAGGTGCGGCAAATGCCGCCCTGCTGGCCGCGCAGATCCTGGCGACGCATGATAAAGATTTGCACCAGCGTCTGGCGGAG +>35 +AGCGACTGGGCTACCATGCAATTCGCCGCCGAAACGGCGGAAGAGAACGGATATCAAGTGATTATTGCCGGCGCGGGCGGCGCGGCGCACCTGCCGGGAATGATTGCGGCAAAAACGCTGGTCCCGGTACTCGGCGTGCCGGTACAAAGCGCTGCGCTAAGCGGCGTGGATAGCCTTTACTCCATTGTGCAGATGCCGCGCGGCATTCCGGTGGGTACGCTGGCGATCGGTAAAGCCGGTGCGGCTAATGCCGCCCTGCTCGCCGCGCAGATTCTGGCGCAACACGACGCGGAACTGCATCAGCGCATCGCCGAC +>36 +AGCGACTGGGCCACCATGCAGCATGCCGCTGAAATTCTTGATGCCCTTGATGTTCCTTACCATGTTGAAGTGGTTTCCGCTCACCGCACGCCTGATAAGCTTTTCAGCTTTGCTGAATCCGCGCAGCACAACGGTTATCAGGTGATTATTGCTGGCGCAGGCGGTGCGGCGCATCTGCCGGGCATGATCGCCGCGAAAACCCTGGTGCCGGTATTAGGCGTGCCGGTGCAAAGCGCGGCCCTGAGCGGCGTGGACAGCCTCTACTCTATCGTGCAAATGCCGCGCGGCATTCCGGTAGGGACGCTGGCGATCGGCAAAGCGGGTGCTGCAAACGCCGCACTGCTGGCGGCGCAGATCCTCGCCCAGCATGACGATGCGCTACTGGCGCGTCTGGCGGCA +>37 +AAACGCTTCCTGAACGAACTGACCGCCGCTGAAGGGCTGGAACGTTATCTGGGCGCCAAATTCCCGGGTGCGAAACGTTTCTCGCTCGAGGGGGGAGATGCGCTGATACCTATGCTGAAAGAGATGGTTCGCCATGCGGGTAACAGCGGCACTCGCGAAGTGGTGCTGGGGATGGCGCACCGCGGTCGTCTGAACGTGCTGATCAACGTACTGGGTAAAAAACCGCAGGATCTGTTCGACGAGTTTGCCGGTAAACATAAAGAACATCTGGGTACCGGCGACGTGAAGTATCACATGGGCTTCTCGTCAGATATCGAAACTGAAGGCGGTCTGGTTCACCTGGCGCTGGCGTTTAACCCATCGCATCTGGAAATTGTGAGCCCGGTGGTGATGGGCTCCGTGCGCGCCCGTCTGGACCGACTGGACGAACCGAGCAGTAATAAAGTGCTGCCGATCACTATTCACGGCGACGCCGCGGTGACCGGCCAGGGCGTGGTTCAG +>38 
+AAACGCTTCCTGAACGAACTGACCGCTGCAGAAGGGCTGGAACGTTATCTGGGGGCAAAATTCCCTGGCGCGAAACGTTTTTCGCTGGAAGGCGGCGATGCGTTAATTCCGATGCTCAAAGAGATGGTCCGCCATGCGGGCAACAGCGGCACCCGCGAAGTGGTGTTGGGAATGGCGCACCGTGGTCGCCTGAACGTACTGGTCAACGTGCTGGGTAAAAAACCTCAGGATCTGTTTGACGAGTTTGCCGGTAAACATAAAGAACATTTGGGCACCGGCGACGTGAAGTACCATATGGGTTTCTCGTCGGATATCGAAACCGAAGGCGGACTGGTTCACCTGGCGCTGGCGTTTAACCCGTCGCACCTGGAAATCGTCAGCCCGGTAGTGATGGGGTCTGTGCGCGCACGTCTCGACCGGCTCGACGAACCGAGCAGCAACAAAGTGTTGCCAATCACCATTCATGGTGATGCAGCAGTTACCGGGCAGGGCGTGGTTCAG +>39 +AAACGCTTCTTAAGCGAACTGACCGCCGCTGAAGGCCTTGAACGTTACCTCGGCGCAAAATTCCCTGGCGCAAAACGCTTCTCGCTGGAAGGCGGTGACGCGTTAATCCCGATGCTTAAAGAGATGATCCGCCACGCTGGCAACAGCGGCACCCGCGAAGTGGTTCTCGGGATGGCGCACCGTGGTCGTCTGAACGTGCTGGTGAACGTGCTGGGTAAAAAACCGCAAGACTTGTTCGACGAGTTCGCCGGTAAACATAAAGAACACCTCGGCACGGGTGACGTGAAATACCACATGGGCTTCTCGTCTGACTTCCAGACCGATGGCGGCCTGGTGCACCTGGCGCTGGCGTTTAACCCGTCTCACCTTGAGATTGTAAGCCCGGTAGTTATCGGTTCTGTTCGTGCCCGTCTGGACAGACTTGATGAGCCGAGCAGCAACAAAGTGCTGCCAATCACCATCCACGGTGACGCCGCAGTGACCGGGCAGGGTGTGGTTCAG +>40 +AAACGCTTCCTCAGCGAACTGACTGCAGCGGAAGGTCTGGAACGCTACCTGGGCGCGAAATTCCCGGGCGCGAAACGCTTCTCGCTGGAAGGCGGTGATGCGTTAATCCCAATGCTCAAAGAGATGATCCGCCACGCCGGTAACAGCGGTACCCGTGAAGTGGTACTGGGTATGGCGCACCGTGGTCGTCTGAACGTCCTGGTTAACGTGCTGGGTAAAAAGCCGCAGGATCTATTCGACGAATTTGCGGGCAAACATAAAGAACACCTCGGTACCGGTGACGTGAAGTACCACATGGGCTTCTCATCGGATATCGAAACCGAAGGCGGTCTGGTGCATCTGGCGCTGGCGTTTAACCCGTCGCACCTGGAAATCGTTAGCCCGGTGGTTATCGGTTCCGTACGTGCACGCTTGGATCGTCTGGACGAGCCGAGCAGCAATAAAGTGCTGCCAATCACTATTCATGGTGATGCGGCAGTAACCGGGCAAGGCGTGGTTCAG +>41 
+CGTACTTTCCTTGAAGAGCTGACTGCCGCTGAAGGTTTAGAGCGCTATCTTGGTGCGAAATTCCCTGGTGCTAAACGTTTCTCTCTCGAAGGGGGGGATGCCTTAGTTCCGATGACCAAAGAGATGATCCGTCACGCGGGTGCCAGTGGCATGCGTGAAGTGGTGATTGGGATGGCGCACCGCGGTCGCTTGAACATGCTGGTCAACGTTCTGGGTAAAAAACCGCAAGATCTGTTTGATGAGTTTGCCGGTAAACATGGCGAAGGCTGGGGCACAGGTGATGTGAAATATCACCAAGGTTTCTCCGCTGACTTTGCGACACCGGGCGGTGATGTTCACTTAGCACTGGCTTTCAACCCATCGCATCTTGAGATTGTGAACCCTGTTGTGATGGGTTCAGTTCGCGCGCGTCAAGACCGCCTAGGTGATGAAGATGGCAGTAAAGTGCTACCTATCACTATCCATGGTGACTCTGCGATTGCCGGACAAGGTGTGGTGGCT +>42 +AAACGCTTCCTGAGCGAGCTGACCGCAGCCGAAGGCCTTGAGCGCTACCTGGGCGCGAAGTTCCCGGGCGCGAAACGCTTCTCGCTGGAAGGCGGCGACGCGCTGATCCCGATGCTGAAAGAGATGATTCGCCACGCGGGCAACAGCGGCACGCGTGAAGTGGTGCTGGGTATGGCGCACCGCGGTCGTCTTAACGTGCTGGTTAACGTGCTGGGTAAAAAACCGCAGGACCTGTTCGACGAGTTCGCGGGCAAACACAAAGAACACCTTGGCACCGGCGACGTGAAGTACCACATGGGCTTCTCGTCAGATATCGAAACTGAAGGCGGCCTGGTTCACCTGGCGCTGGCGTTTAACCCGTCGCACCTGGAAATCGTTAGCCCGGTGGTAATTGGTTCGGTACGTGCCCGTCTGGATCGGCTGGACGAGCCGAGCAGCAACAAAGTACTGCCGATCACCATTCACGGCGACGCCGCGGTGACCGGTCAGGGCGTGGTTCAG +>43 +GTGCTGGGCCGTAATGGTTCCGACTATTCCGCCGCCGTGCTGGCCGCCTGTTTACGCGCTGACTGCTGTGAAATCTGGACTGACGTCGATGGCGTGTATACCTGTGACCCGCGCCAGGTGCCGGACGCCAGACTGCTGAAATCGATGTCCTACCAGGAAGCGATGGAACTCTCTTACTTCGGCGCCAAAGTCCTTCACCCTCGCACCATAACGCCTATCGCCCAGTTCCAGATCCCCTGTCTGATTAAAAATACCGGTAATCCGCAGGCGCCAGGAACGCTGATCGGCGCGTCCAGCGACGATGATAATCTGCCGGTTAAAGGGATCTCTAACCTTAACAACATGGCGATGTTTAGCGTCTCCGGCCCGGGAATGAAAGGGATGATTGGGATGGCGGCGCGTGTTTTCGCCGCCATGTCTCGCGCCGGGATCTCGGTGGTGCTCATTACCCAGTCCTCCTCTGAGTACAGCATCAGCTTCTGTGTGCCGCAGAGTGACTGC +>44 
+GTGCTGGGGCGTAACGGTTCCGACTATTCCGCTGCGGTACTGGCCGCCTGTTTACGCGCCGACTGTTGCGAAATCTGGACGGACGTTGACGGTGTGTATACCTGCGACCCGCGCCAGGTGCCGGATGCCAGACTGCTGAAGTCAATGTCCTATCAGGAAGCGATGGAACTTTCCTACTTCGGCGCCAAAGTGCTTCACCCGCGTACCATTACTCCCATCGCTCAATTCCAGATCCCATGTCTGATAAAAAATACCGGTAATCCGCAAGCGCCGGGCACGCTGATTGGCGCCAACAGCGATGAAGACGGGCTACCGGTAAAAGGCATCTCGAACCTCAATAATATGGCGATGTTTAGCGTCTCCGGCCCGGGAATGAAAGGCATGGTCGGGATGGCGGCGCGCGTGTTCGCCACCATGTCGCGTGCCGGGATTTCGGTAGTGCTGATCACCCAATCCTCTTCGGAGTACAGCATCAGCTTCTGCGTGCCGCCAAAGCGATGC +>45 +GTGCTGGGCCGTAACGGCTCCGATTATTCCGCCGCCGTACTGGCCGCCTGTTTACGCGCTGACTGTTGTGAAATCTGGACTGACGTCGACGGCGTGTATACCTGCGACCCGCGTCAGGTGCCAGACGCCAGGCTGCTGAAGTCGATGTCTTATCAGGAAGCAATGGAGCTTTCTTACTTCGGCGCTAAAGTACTACATCCGCGCACTATTACTCCTATTGCCCAGTTCCAGATCCCTTGTCTGATTAAAAATACCGGCAATCCACAAGCGCCCGGTACGCTGATCGGCGCTGCCAGCGACGATGATGCTCTGCCGGTTAAAGGGATTTCTCACCTTAACAACATGGCGATGTTTAGTGTCTCCGGTCCGGGGATGAAAGGCATGGTGGGTATGGCGGCGCGCGTTTTTGCCGCTATGTCACGTGCGGGAATCTCGGTGGTGTTGATCACGCAATCTTCATCTGAATACAGCATCAGCTTCTGCGTGCCGCAGAGCGACTGC +>46 +GTGCTGGGCCGCAACGGTTCTGATTACTCCGCTGCGGTGTTGGCTGCCTGCTTACGCGCCGACTGTTGTGAGATCTGGACTGACGTTGACGGCGTGTATACCTGTGACCCGCGCCAGGTGCCGGACGCCAGGTTGCTGAAGTCGATGTCCTATCAGGAGGCGATGGAGCTTTCTTACTTCGGCGCCAAAGTCCTTCATCCTCGCACCATCACCCCCATTGCCCAGTTCCAAATCCCATGCCTGATTAAAAACACCGGAAACCCGCAGGCCCCTGGTACGCTGATCGGCGCCAGCGTGGATGAAGACGAACTGCCGGTGAAAGGGATCTCGAACCTGAACAATATGGCGATGTTCAGCGTTTCCGGCCCAGGAATGAAAGGGATGATCGGGATGGCGGCGCGCGTCTTCGCGGCAATGTCCCGCGCGGGGATCTCCGTGGTGCTGATCACGCAATCCTCTTCTGAATACAGCATCAGTTTCTGCGTACCGCAGGGCGACTGC +>47 
+GTGTTGGGGCGCAATGGCTCTGACTACTCTGCCGCTGTGCTGGCTGCCTGTTTACGCGCGGACTGTTGTGAGATCTGGACCGATGTCGACGGCGTATATACCTGCGATCCGCGCCAGGTACCCGATGCCCGACTGCTGAAGTCGATGTCTTATCAGGAAGCGATGGAGCTTTCTTACTTCGGCGCCAAAGTTCTGCATCCGCGCACCATTACCCCAATTGCCCAGTTCCAGATCCCGTGCCTGATTAAAAATACCGGCAATCCACAAGCGCCTGGCACGTTGATCGGCGCCAGCAGTGATGAAGACGATTTGCCGGTAAAAGGTATTTCTAACCTCAATAACATGGCGATGTTTAGCGTCTCCGGCCCTGGAATGAAAGGCATGGTAGGCATGGCGGCGCGCGTTTTTGCCGCGATGTCGCGTGCGGGCATCTCGGTGGTGCTGATCACGCAGTCTTCTTCTGAATACAGCATCAGCTTCTGCGTTCCGCAGGGCGACTGC +>48 +GTATTAGGTCGCAATGGTTCAGACTACTCAGCTGCAGTATTAGCAGCCTGTTTACGTGCTAAATGCTGTGAAATTTGGACTGATGTTGACGGTGTTTATACTTGTGATCCACGTTTAGTGCCTGATGCACGTTTGTTAAAAGGCATGTCATATCAAGAGGCAATGGAACTGTCTTACTTTGGTGCCAAGGTACTTCATCCTCGTACAATTGCGCCTATTGCCCAATTCCAAATACCTTGTTTAATTAAAAATACGGGCAATCCAGATGCGCCGGGTACCTTGATTGGTGATGGTCAAAAAGATGAGAGCACACCTGTTAAAGGAATAACTAACCTTAATAATATGGCAATGATCAACGTATCTGGGCCTGGAATGAAAGGAATGGTAGGAATGGCGGCTCGCGTGTTCTCGGTAATGTCGAGAGCGGGGATTTCAGTTGTTCTAATCACACAGTCTTCTTCTGAATACAGCATTAGTTTTTGTGTGCCACAAAAAGAGCTG +>49 +GTGCTTGGACGCAACGGTTCCGACTACTCTGCTGCGGTGCTGGCTGCCTGTTTACGCGCCGATTGTTGCGAGATTTGGACAGACGTTGACGGGGTCTATACCTGCGACCCGCGTCAGGTGCCCGATGCGAGGTTGTTGAAGTCGATGTCCTACCAGGAAGCGATGGAGCTTTCCTACTTCGGCGCTAAAGTTCTTCACCCCCGCACCATTACCCCCATCGCCCAGTTCCAGATCCCTTGCCTGATTAAAAATACCGGAAATCCTCAAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATGAAGACGAATTACCGGTCAAGGGCATTTCCAATCTGAATAACATGGCAATGTTCAGCGTTTCCGGCCCGGGGATGAAAGGAATGGTTGGCATGGCGGCGCGCGTCTTTGCAGCGATGTCACGCGCCCGTATTTCCGTGGTGCTGATTACGCAATCATCTTCCGAATACAGTATCAGTTTCTGCGTTCCACAAAGCGACTGT +>50 
+GTGCTCGGGCGCAACGGCTCCGATTATTCCGCAGCGGTACTGGCAGCGTGTTTACGCGCCGATTGTTGCGAGATCTGGACTGATGTCGATGGTGTCTATACCTGCGACCCACGTCAGGTACCGGATGCCCGATTACTTAAGTCGATGTCGTACCAGGAGGCTATGGAACTCTCCTATTTCGGCGCCAAAGTCCTCCATCCTCGAACCATCACTCCCATCGCCCAGTTCCAGATTCCCTGCCTGATAAAAAATACCGGAAACCCGCAAGCACCAGGAACGCTGATTGGCGCCAGCCGCGACGAAGATGATCTGCCGGTGAAGGGCATTTCAAATCTCAATAATATGGCGATGTTCAGCGTCTCCGGGCCGGGGATGAAGGGAATGGTCGGCATGGCTGCTCGCGTGTTTGCGGCAATGTCTCGCTCAGGAATTTCGGTAGTCCTGATTACGCAATCCTCCTCTGAGTACAGCATTAGCTTCTGTGTACCGCAGGCTGACTGT +>51 +GTGCTGGGGCGTAACGGCTCTGACTACTCCGCCGCCGTGCTGGCGGCCTGCTTACGCGCGGACTGCTGTGAGATCTGGACTGACGTCGACGGCGTTTATACCTGCGATCCGCGCCAGGTACCGGACGCCAGGCTGCTGAAGTCGATGTCGTACCAGGAAGCGATGGAGCTTTCCTACTTCGGCGCTAAAGTTCTTCACCCGCGTACCATCTCCCCGATTGCCCAGTTCCAAATCCCTTGCCTGATTAAGAATACCGGTAACCCTCAGGCGCCGGGCACGCTGATTGGCGCCAGCGCGGATGAAGATGAACTGCCGGTGAAAGGCATTTCTAACCTCAATAACATGGCGATGTTCAGCGTCTCCGGCCCGGGGATGAAGGGCATGGTCGGCATGGCGGCACGCGTATTTGCCGCTATGTCCCGCAACGGGATCTCCGTGGTGCTGATCACGCAGTCTTCTTCCGAATACAGCATCAGCTTCTGCGTTCCGCAGGGTGATTGC +>52 +GTATTAGGCCGTAACGGTTCCGACTACTCCGCCGCCGTGCTGGCCGCGTGTTTGCGCGCCGACTGTTGTGAGATCTGGACTGACGTCGACGGCGTCTATACCTGCGACCCGCGCCAGGTGCCGGACGCCAGGCTGCTGAAGTCGATGTCGTATCAGGAAGCCATGGAACTCTCCTACTTCGGCGCTAAAGTTCTCCACCCCCGCACCATTGCCCCCATCGCCCAGTTCCAAATCCCCTGTCTGATCAAAAACACTGGTAACCCGCAAGCGCCAGGCACCCTGATCGGTGCCAGCAGCGATGAAGACGGCCTGCCGGTGAAGGGCATCAGTAACCTGAATAATATGGCGATGTTCAGCGTCTCTGGTCCGGGCATGAAAGGCATGGTGGGAATGGCGGCGCGCGTGTTCGCGGCGATGTCCCGTGCGGGCATCTCGGTGGTGCTGATCACCCAATCGTCTTCTGAATACAGCATCAGCTTCTGCGTGCCGCAGGCCGACAGC diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ndb b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ndb new file mode 100755 index 00000000..dfa7d2e2 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ndb differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nhr b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nhr new file mode 100755 index 
00000000..974e6ed6 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nhr differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nin b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nin new file mode 100755 index 00000000..db46f531 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nin differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.njs b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.njs new file mode 100755 index 00000000..f86c829e --- /dev/null +++ b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.njs @@ -0,0 +1,22 @@ +{ + "version": "1.2", + "dbname": "nucleotide", + "dbtype": "Nucleotide", + "db-version": 5, + "description": "/Users/jrobertson/PycharmProjects/locidex/locidex/example/mlst_locidex_db/blast/nucleotide/nucleotide.fasta", + "number-of-letters": 25041, + "number-of-sequences": 53, + "last-updated": "2024-02-26T15:55:00", + "number-of-volumes": 1, + "bytes-total": 48288, + "bytes-to-cache": 7129, + "files": [ + "nucleotide.ndb", + "nucleotide.nhr", + "nucleotide.nin", + "nucleotide.not", + "nucleotide.nsq", + "nucleotide.ntf", + "nucleotide.nto" + ] +} diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.not b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.not new file mode 100755 index 00000000..7c2dda10 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.not differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nsq b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nsq new file mode 100755 index 00000000..7fc5e3b8 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nsq differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ntf 
b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ntf new file mode 100755 index 00000000..005ac416 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.ntf differ diff --git a/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nto b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nto new file mode 100755 index 00000000..ad19396e Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/nucleotide/nucleotide.nto differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.fasta b/tests/data/databases/locidex_salm/blast/protein/protein.fasta new file mode 100755 index 00000000..47910ff3 --- /dev/null +++ b/tests/data/databases/locidex_salm/blast/protein/protein.fasta @@ -0,0 +1,106 @@ +>0 +KFRPGHADYTYHQKYGVRDYRGGGRSSARETAMRVAAGAIAKKYLQQEFGIEVRAYLSQMGDVAIDKVDWNEIENNDFFCPDVDKVAAFDELIRELKKEGDSIGAKIQVVATGVPVGLGEPVFDRLDADIAHALMSINAVKGVEIGDGFDVVRQKGSQHRDPLTPQG +>1 +VFRPGHADYTYEQKYGLRDYRGGGRSSARETAMRVAAGAIAKKYLAEKHGIVIQGCLTQMGDIPLEIKDWQQVEQNPFFCPDPDKIDALDELMRALKKEGDSIGAKVTVVANGVPAGLGEPVFDRLDADIAHALMSINAVKGVEIGDGFDVVALRGSQNRDEITKEG +>2 +VFRPGHADYTYEQKYGLRDYRGGGRSSARETAMRVAAGAIAKKYLAEKFGVEIRGCLTQMGDIPLEIKDWSQVELNPFFCPDPDKIEVLDELMRGLKKEGDSIGAKVTVVASGVPAGLGEPVFDRLDADIAHALMSINAVKGVEIGDGFDVVALRGSQNRDEITKEG +>3 +VFRPGHADYTYEQKYGLRDYRGGGRSSARETAMRVAAGAIAKKYLKEKLGIEVRGYLSQLGPITCDLVDWSIVESNPFFCPDPSRLDALDEYMRALKKEGNSIGAKVTVVAQGVPAGFGEPVFDRLDADLAHALMSINAVKGIEIGDGFGVVTLKGTENRDEITKKG +>4 +VFRPGHADYTYEQKYGLRDYRGGGRSSARETAMRVAAGAIAKKYLAEKFGIEIRGCLTQMGDIPLEIKDWSQVEQNPFFCPDPDKIDALDELMRALKKEGDSIGAKVTVVASGVPAGLGEPVFDRLDADIAHALMSINAVKGVEIGDGFDVVALRGSQNRDEITKDG +>5 +VFRPGHADYTYEQKYGFRDYRGGGRSSARETAMRVAAGAIAKKYLQQKFGIVIRGCLSQMGDIPLAIKDWDQVELNPFFCADADKLDALDELMRGLKKEGDSIGAKVTVVADGVPAGWGEPVFDRLDADIAHALMSINAVKGVEIGDGFDVVKLRGSQNRDEITKAG +>6 
+VFRPGHADYTYEQKYGLRDYRGGGRSSARETAMRVAAGAIAKKYLAQKFGVVIRGCLTQMGDIPLEIKDWDQVEQNPFFCPDPDKIEALDELMRALKKEGDSIGAKVTVVADSVPAGLGEPVFDRLDADIAHALMSINAVKGVEIGDGFGVVQLRGSQNRDEITTAG +>7 +MEMVARVTLSQPHEPGATTVPARKFFDICRGLPEGAEIAVQLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGSELRTVATDGHRLAVCSMPLEASLPSHSVIVPRKGVIELMRMLDGGENPLRVQ +>8 +MEMVARVTLSQPHEPGATTVPARKFFDICRGLPEGAEIAVQLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGSELRTVATDGHRLAVCSMPLEASLPSHSVIVPRKGVIELMRMLDGGENPLRVQ +>9 +MEMVARVALIQPHEPGATTVPARKFFDICRGLPEGAEIAVQLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGEELRTVATDGHRLAVCSMPVGQSLPNHSVIVPRKGVIELMRMLDGGETPLRVQ +>10 +MEMVARVALIQPHEPGATTVPARKFFDICRGLPEGAEIAVQLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGEELRTVATDGHRLAVCSMPVGQPLPSHSVIVPRKGVIELMRMLDGGDNPLRVQ +>11 +MEMVARVALVQPHEPGATTVPARKFFDICRGLPEGAEIAVQLEGERMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGEELRTVATDGHRLAVCSMPIGQSLPSHSVIVPRKGVIELMRMLDGGDNPLRVQ +>12 +MEMIARVTLTQPHDAGATTVPARKFFDICRGLPEGAEIAVQLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGEELRTVATDGHRLAVCSMPIGDSLPNHSVIVPRKGVIELMRMLDGGETPLRVQ +>13 +MEMIARVALSLPHQAGATTVPARKFFDICRGLPEGAEIAVTLEGDRMLVRSGRSRFSLSTLPAADFPNLDDWQSEVEFTLPQATMKRLIEATQFSMAHQDVRYYLNGMLFETEGEELRTVATDGHRLAVCAMPVGQPLPNHSVIVPRKGVLELMRMLDGGDSPLRIQ +>14 +SALTENDLVFALSQHAVTFADAELQQQGKSWPSLPRYFAIGRTTALALHTVSGFNIHYPLDREISEVLLQLPELQNIAGKRALILRGNGGRELIGETLTARGADVDFCECYQRSAKYYDGAEEAMRWQSRGVTTVVVTSGEMLQ +>15 +AALGESDLLFALSQHAVAFAQSQLHQQDRKWPRLPTYFAIGRTTALALHTVSGQKILYPQDREISEVLLQLPELQNIAGKRALILRGNGGRELIGDTLTARGAEVTFCECYQRCAIHYDGAEEAMRWQSREVTTVVVTSGEMLQ +>16 +ATLTENDLVFALSQHAVAFAHAQLQRDGRNWPASPRYFAIGRTTALALHTVSGFDIRYPLDREISEVLLQLPELQNIAGKRALILRGNGGRGRELLGETLTARGAEVSFCECYQRSAKHYDGAEEAMRWHTRGVTTLVVTSGEMLQ +>17 +AALTDNDLVFALSQHAVAFAHAQLQQQELDWPVQPRYFAIGRTTALALHTVNGCDIRYPLDREISEVLLQLPELQNIAGKRALILRGNGGRELLGKTLTERGAEVTFCECYQRSAKHYDGAEEAMRWHSRGVTTIVVTSGEMLQ +>18 
+ETLGDNDLLFALSQHAVSFAHAQLQQQGLNWPSLPHYFAIGRTTALALHTVSGHKIRYPQDREISEVLLQLPELQSIAGKRALILRGNGGRELIGQTLTSRGADVTFCECYQRSAKHYDGAEEAMRWQSRGVTTVVVTSGEMLQ +>19 +RLLQEGDLLFALSQHAVEFAHAQLQQHAVSWPHAPRYFAIGRTTALALHTASGIDVRYPLDREISEVLLQLPELQTIAGKRALILRGNGGRELLGETLRERGADVTFVECYQRCAKHYDGAEEAMRWHARGINTLVVTSGEMLQ +>20 +IAGCQKVVLCSPPPIADEILYAAQLCGVQEIFNVGGAQAIAALAFGSESVPKVDKIFGPGNAFVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPDADIARKVAEAVERQLAELPRAGTARQALSASRLIVTKDLAQCV +>21 +IAGCKKVVLCSPPPIADEILYAAQLCGVQEIFNVGGAQAIAALAFGSESVPKVDKIFGPGNAFVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPDAGIAQNVAEAVERQLAELPRAETARQALSASRLIVTKDLAQCV +>22 +IAGCKKVVLCSPPPIADEILYAAQLCGVQDVFNVGGAQAIAALAFGSESVPKVDKIFGPGNAFVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPDAGMASRVAEAVERQLAALPRAETARQALSASRLIVTRSLAQCV +>23 +IAGCKKVVLCSPPPIADEILYAAQLCGVQDVFNVGGAQAIAALAFGTESVPKVDKIFGPGNAFVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPAADMARRVAEAVERQLAELPRAETARQALNASRLIVTKDLAQCV +>24 +IAGCQKVVLCSPPPIADEILYAAKLCGVQAIYKVGGAQAISALAFGTVSIPKVDKIFGPGNAYVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPDADMAKRVGDAVERQLADLPRAETARQALSASRLIVARDLDQCI +>25 +IAGCKKVVLCSPPPIADEILYAAQLCGVKEVFNVGGAQAIAALALGTESIPKVDKIFGPGNAYVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATPDFVASDLLSQAEHGPDSQVILLTPDAKLAEGVAEAVERQLAELSRADTARQALSASRLIVAKDLAQCV +>26 +IAGCKKVVLCSPPPIADEILYAARLCGVQQVYQVGGAQAIAALAFGTETVPKVDKIFGPGNAFVTEAKRQVSQRLDGAAIDMPAGPSEVLVIADSGATTDFVASDLLSQAEHGPDSQVILLTPDSAMAQAVADAVERQLAELPRAETARQALAESRLIVARDLAQCV +>27 +SDWATMQFAAEIFDILDIPHHVEVVSAHRTPDKLFSFAENAEENGFQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDNALHQRLRD +>28 +SDWTTMQFAAEIFEILDVPHHVEVVSAHRTPDKLFSFAETAEENGYHVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDNALHQRLSN +>29 +SDWATMQFAAEILDILNVPHHVEVVSAHRTPDKLFSFAEDAESNGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILAQHDAELHQRIAD +>30 
+SDWATMQFAVEIFEILNVPHHVEVVSAHRTPDKLFSFAESAEENGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDKELHQRLNG +>31 +SDWATMQFAAEIFDILNVPHHVEVVSAHRTPDKLFSFAESAEEKGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDKDLRQRLAD +>32 +SDWATMQFAAEIFEMLDVPHHVEVVSAHRTPDKLFSFAESAEENGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDKALHQRLSD +>33 +SDWATMSHAADVLDTLQIPYHVEIVSAHRTPDKLFSFAEKAKSNGFDVIIAGAGGAAHLPGMLAAKTLVPVFGVPVQSATLSGVDSLYSIVQMPKGIPVGTLAIGKAGAANAALLAAQVLALHSPAILDALTA +>34 +SDWATMQFAAEIFEILNVPHHVEVVSAHRTPDKLFSFAESAEENGYEVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILATHDKDLHQRLAE +>35 +SDWATMQFAAETAEENGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILAQHDAELHQRIAD +>36 +SDWATMQHAAEILDALDVPYHVEVVSAHRTPDKLFSFAESAQHNGYQVIIAGAGGAAHLPGMIAAKTLVPVLGVPVQSAALSGVDSLYSIVQMPRGIPVGTLAIGKAGAANAALLAAQILAQHDDALLARLAA +>37 +KRFLNELTAAEGLERYLGAKFPGAKRFSLEGGDALIPMLKEMVRHAGNSGTREVVLGMAHRGRLNVLINVLGKKPQDLFDEFAGKHKEHLGTGDVKYHMGFSSDIETEGGLVHLALAFNPSHLEIVSPVVMGSVRARLDRLDEPSSNKVLPITIHGDAAVTGQGVVQ +>38 +KRFLNELTAAEGLERYLGAKFPGAKRFSLEGGDALIPMLKEMVRHAGNSGTREVVLGMAHRGRLNVLVNVLGKKPQDLFDEFAGKHKEHLGTGDVKYHMGFSSDIETEGGLVHLALAFNPSHLEIVSPVVMGSVRARLDRLDEPSSNKVLPITIHGDAAVTGQGVVQ +>39 +KRFLSELTAAEGLERYLGAKFPGAKRFSLEGGDALIPMLKEMIRHAGNSGTREVVLGMAHRGRLNVLVNVLGKKPQDLFDEFAGKHKEHLGTGDVKYHMGFSSDFQTDGGLVHLALAFNPSHLEIVSPVVIGSVRARLDRLDEPSSNKVLPITIHGDAAVTGQGVVQ +>40 +KRFLSELTAAEGLERYLGAKFPGAKRFSLEGGDALIPMLKEMIRHAGNSGTREVVLGMAHRGRLNVLVNVLGKKPQDLFDEFAGKHKEHLGTGDVKYHMGFSSDIETEGGLVHLALAFNPSHLEIVSPVVIGSVRARLDRLDEPSSNKVLPITIHGDAAVTGQGVVQ +>41 +RTFLEELTAAEGLERYLGAKFPGAKRFSLEGGDALVPMTKEMIRHAGASGMREVVIGMAHRGRLNMLVNVLGKKPQDLFDEFAGKHGEGWGTGDVKYHQGFSADFATPGGDVHLALAFNPSHLEIVNPVVMGSVRARQDRLGDEDGSKVLPITIHGDSAIAGQGVVA +>42 +KRFLSELTAAEGLERYLGAKFPGAKRFSLEGGDALIPMLKEMIRHAGNSGTREVVLGMAHRGRLNVLVNVLGKKPQDLFDEFAGKHKEHLGTGDVKYHMGFSSDIETEGGLVHLALAFNPSHLEIVSPVVIGSVRARLDRLDEPSSNKVLPITIHGDAAVTGQGVVQ +>43 
+VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASSDDDNLPVKGISNLNNMAMFSVSGPGMKGMIGMAARVFAAMSRAGISVVLITQSSSEYSISFCVPQSDC +>44 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGANSDEDGLPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFATMSRAGISVVLITQSSSEYSISFCVPPKRC +>45 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGAASDDDALPVKGISHLNNMAMFSVSGPGMKGMVGMAARVFAAMSRAGISVVLITQSSSEYSISFCVPQSDC +>46 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASVDEDELPVKGISNLNNMAMFSVSGPGMKGMIGMAARVFAAMSRAGISVVLITQSSSEYSISFCVPQGDC +>47 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASSDEDDLPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRAGISVVLITQSSSEYSISFCVPQGDC +>48 +VLGRNGSDYSAAVLAACLRAKCCEIWTDVDGVYTCDPRLVPDARLLKGMSYQEAMELSYFGAKVLHPRTIAPIAQFQIPCLIKNTGNPDAPGTLIGDGQKDESTPVKGITNLNNMAMINVSGPGMKGMVGMAARVFSVMSRAGISVVLITQSSSEYSISFCVPQKEL +>49 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDC +>50 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDDLPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRSGISVVLITQSSSEYSISFCVPQADC +>51 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTISPIAQFQIPCLIKNTGNPQAPGTLIGASADEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRNGISVVLITQSSSEYSISFCVPQGDC +>52 +VLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTIAPIAQFQIPCLIKNTGNPQAPGTLIGASSDEDGLPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRAGISVVLITQSSSEYSISFCVPQADS diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.ndb b/tests/data/databases/locidex_salm/blast/protein/protein.ndb new file mode 100755 index 00000000..707749d1 Binary files /dev/null and 
b/tests/data/databases/locidex_salm/blast/protein/protein.ndb differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.nhr b/tests/data/databases/locidex_salm/blast/protein/protein.nhr new file mode 100755 index 00000000..974e6ed6 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.nhr differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.nin b/tests/data/databases/locidex_salm/blast/protein/protein.nin new file mode 100755 index 00000000..13bc3563 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.nin differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.njs b/tests/data/databases/locidex_salm/blast/protein/protein.njs new file mode 100755 index 00000000..75ee56f8 --- /dev/null +++ b/tests/data/databases/locidex_salm/blast/protein/protein.njs @@ -0,0 +1,22 @@ +{ + "version": "1.2", + "dbname": "protein", + "dbtype": "Nucleotide", + "db-version": 5, + "description": "/Users/jrobertson/PycharmProjects/locidex/locidex/example/mlst_locidex_db/blast/protein/protein.fasta", + "number-of-letters": 5570, + "number-of-sequences": 53, + "last-updated": "2024-02-26T15:55:00", + "number-of-volumes": 1, + "bytes-total": 55785, + "bytes-to-cache": 14626, + "files": [ + "protein.ndb", + "protein.nhr", + "protein.nin", + "protein.not", + "protein.nsq", + "protein.ntf", + "protein.nto" + ] +} diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.not b/tests/data/databases/locidex_salm/blast/protein/protein.not new file mode 100755 index 00000000..7c2dda10 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.not differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.nsq b/tests/data/databases/locidex_salm/blast/protein/protein.nsq new file mode 100755 index 00000000..ce5ee856 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.nsq differ diff --git 
a/tests/data/databases/locidex_salm/blast/protein/protein.ntf b/tests/data/databases/locidex_salm/blast/protein/protein.ntf new file mode 100755 index 00000000..005ac416 Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.ntf differ diff --git a/tests/data/databases/locidex_salm/blast/protein/protein.nto b/tests/data/databases/locidex_salm/blast/protein/protein.nto new file mode 100755 index 00000000..ad19396e Binary files /dev/null and b/tests/data/databases/locidex_salm/blast/protein/protein.nto differ diff --git a/tests/data/databases/locidex_salm/config.json b/tests/data/databases/locidex_salm/config.json new file mode 100755 index 00000000..4deb4371 --- /dev/null +++ b/tests/data/databases/locidex_salm/config.json @@ -0,0 +1,12 @@ +{ + "db_name": "Locidex Database", + "db_version": "1.0.0", + "db_date": "26/02/2024", + "db_author": "", + "db_desc": "", + "db_num_seqs": 53, + "is_nucl": true, + "is_prot": false, + "nucleotide_db_name": "nucleotide", + "protein_db_name": "" +} \ No newline at end of file diff --git a/tests/data/databases/locidex_salm/meta.json b/tests/data/databases/locidex_salm/meta.json new file mode 100755 index 00000000..f3b88fa9 --- /dev/null +++ b/tests/data/databases/locidex_salm/meta.json @@ -0,0 +1,1181 @@ +{ + "info": { + "num_seqs": 53, + "is_cds": "True", + "trans_table": 11, + "dna_min_len": 220, + "dna_max_len": 350, + "dna_min_ident": 80, + "aa_min_len": 73, + "aa_max_len": 116, + "aa_min_ident": 80 + }, + "meta": { + "0": { + "seq_id": 0, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 609, + "dna_seq_len": 501, + "dna_seq_hash": "4811bc98591c74954ace3cb487330482", + "aa_seq_len": 167, + "aa_seq_hash": "a8fbcf8179d8548f980b7b15f29de1d4", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + 
"min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "1": { + "seq_id": 1, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 614, + "dna_seq_len": 501, + "dna_seq_hash": "b66979eaf680fab872ffe1bde4c092d6", + "aa_seq_len": 167, + "aa_seq_hash": "3e034a4d80ac27352822774abd9319df", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "2": { + "seq_id": 2, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 618, + "dna_seq_len": 501, + "dna_seq_hash": "f02a36ff6df05f9bf38428fa22a035da", + "aa_seq_len": 167, + "aa_seq_hash": "e2d30bb18231528ef65c34880704dd7a", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "3": { + "seq_id": 3, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 619, + "dna_seq_len": 501, + "dna_seq_hash": "bee9d7360aa8e9b840fb29afa1de2c2e", + "aa_seq_len": 167, + "aa_seq_hash": "c3f71f5780b5f1031aaf21697a482ee3", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "4": { + "seq_id": 4, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 620, + "dna_seq_len": 501, + "dna_seq_hash": "5b7956485455fdbc7c86d4834a8f7406", + 
"aa_seq_len": 167, + "aa_seq_hash": "60ce8f3b07f53378580ee528910ee623", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "5": { + "seq_id": 5, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 624, + "dna_seq_len": 501, + "dna_seq_hash": "98ba14aac74444a253123aff3d20c69f", + "aa_seq_len": 167, + "aa_seq_hash": "bab41702c7c209def93f9c9930c27086", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "6": { + "seq_id": 6, + "locus_name": "aroC", + "locus_name_alt": "SALM25359", + "locus_product": "chorismate synthase", + "locus_description": NaN, + "locus_uid": 716, + "dna_seq_len": 501, + "dna_seq_hash": "6b9166d5d996897cae3cc288d7969d78", + "aa_seq_len": 167, + "aa_seq_hash": "5bc86c0a9226224922cbd6219c182622", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "7": { + "seq_id": 7, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 1, + "dna_seq_len": 501, + "dna_seq_hash": "d401763f2df6e5fe87e1e07d3c170fe6", + "aa_seq_len": 167, + "aa_seq_hash": "928ad814483bbffda3e3b3a0aa4ca072", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "8": { + 
"seq_id": 8, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 120, + "dna_seq_len": 501, + "dna_seq_hash": "9c50d73cc4ef8d0a447f07ad150ad8cc", + "aa_seq_len": 167, + "aa_seq_hash": "928ad814483bbffda3e3b3a0aa4ca072", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "9": { + "seq_id": 9, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 555, + "dna_seq_len": 501, + "dna_seq_hash": "fab4f658dfba0cd0174a4a87998cf948", + "aa_seq_len": 167, + "aa_seq_hash": "a081905e659429db1f40e145932ae277", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "10": { + "seq_id": 10, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 557, + "dna_seq_len": 501, + "dna_seq_hash": "acb2ed027124e2a54b7734cd538590f1", + "aa_seq_len": 167, + "aa_seq_hash": "970184ec5ccc9f02ee3c858d2687cc18", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "11": { + "seq_id": 11, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 558, + "dna_seq_len": 501, + "dna_seq_hash": "ad996a122298d55ab3d4b2ea7a4974b0", + "aa_seq_len": 167, + "aa_seq_hash": 
"945455021fffea9b793d16af630db961", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "12": { + "seq_id": 12, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 563, + "dna_seq_len": 501, + "dna_seq_hash": "815242e67f31f4e2968f7f0620565125", + "aa_seq_len": 167, + "aa_seq_hash": "1b117ca76a022ae63d6f7bfe2ead289e", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "13": { + "seq_id": 13, + "locus_name": "dnaN", + "locus_name_alt": "SALM25360", + "locus_product": "DNA polymerase III subunit beta", + "locus_description": NaN, + "locus_uid": 633, + "dna_seq_len": 501, + "dna_seq_hash": "532742ae95c046241789d79e68e30b7a", + "aa_seq_len": 167, + "aa_seq_hash": "fff51d2396f3da88a775416b4c6d14b6", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "14": { + "seq_id": 14, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 316, + "dna_seq_len": 432, + "dna_seq_hash": "3922f6256f2891400db415013eb0b208", + "aa_seq_len": 144, + "aa_seq_hash": "0af9d546dfcaf93373a8919df3e30323", + "dna_min_len": 302, + "dna_max_len": 734, + "aa_min_len": 100, + "aa_max_len": 244, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "15": { + 
"seq_id": 15, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 343, + "dna_seq_len": 432, + "dna_seq_hash": "f76c13e33ad5b502dfe64181dbdf2378", + "aa_seq_len": 144, + "aa_seq_hash": "32484f065f9013aaa5b3c694cc99cdbf", + "dna_min_len": 302, + "dna_max_len": 734, + "aa_min_len": 100, + "aa_max_len": 244, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "16": { + "seq_id": 16, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 472, + "dna_seq_len": 438, + "dna_seq_hash": "80bea3abd165ee14e51bc9e9779fc6a1", + "aa_seq_len": 146, + "aa_seq_hash": "4e9cc2d289f1c946738cc8e6e4ef1186", + "dna_min_len": 306, + "dna_max_len": 744, + "aa_min_len": 102, + "aa_max_len": 248, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "17": { + "seq_id": 17, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 489, + "dna_seq_len": 432, + "dna_seq_hash": "83a314185d9ff0bf7c2953d30979e7eb", + "aa_seq_len": 144, + "aa_seq_hash": "5f9fc3707789543f2f14b0f1a555a05c", + "dna_min_len": 302, + "dna_max_len": 734, + "aa_min_len": 100, + "aa_max_len": 244, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "18": { + "seq_id": 18, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 497, + "dna_seq_len": 432, + "dna_seq_hash": "c70622b317de74bdaf57eb8bb5134537", + "aa_seq_len": 144, + "aa_seq_hash": 
"56b3d46d3e517eb7f83f089f9ed5ae2a", + "dna_min_len": 302, + "dna_max_len": 734, + "aa_min_len": 100, + "aa_max_len": 244, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "19": { + "seq_id": 19, + "locus_name": "hemD", + "locus_name_alt": "SALM25361", + "locus_product": "uroporphyrinogen-III synthase", + "locus_description": NaN, + "locus_uid": 498, + "dna_seq_len": 432, + "dna_seq_hash": "f284b11b34de688e2ef54c1b73936595", + "aa_seq_len": 144, + "aa_seq_hash": "da558cdebd900031d0df8f58ef01454e", + "dna_min_len": 302, + "dna_max_len": 734, + "aa_min_len": 100, + "aa_max_len": 244, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "20": { + "seq_id": 20, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 1, + "dna_seq_len": 501, + "dna_seq_hash": "9f762c246c542c52c94c5022ca62311c", + "aa_seq_len": 167, + "aa_seq_hash": "447381a0d286fa1037b5499e2242819a", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "21": { + "seq_id": 21, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 754, + "dna_seq_len": 501, + "dna_seq_hash": "65b434bea0d1939d2b748dbc5dd6df8b", + "aa_seq_len": 167, + "aa_seq_hash": "2b685aa7892794b69c9faa20c58a9183", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "22": { + "seq_id": 22, + 
"locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 757, + "dna_seq_len": 501, + "dna_seq_hash": "eccfc35078428e44e5dd3e85d9ebf1fe", + "aa_seq_len": 167, + "aa_seq_hash": "35fa89ee4cd8689b89d553157471afe0", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "23": { + "seq_id": 23, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 759, + "dna_seq_len": 501, + "dna_seq_hash": "ce01d780cd0ffe3197f708d7048a473b", + "aa_seq_len": 167, + "aa_seq_hash": "bc0edd26ea6032cc4939e8cbc17a12d3", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "24": { + "seq_id": 24, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 768, + "dna_seq_len": 501, + "dna_seq_hash": "23377e95fe00bf6a16b51fe8929a938a", + "aa_seq_len": 167, + "aa_seq_hash": "9fb34628ef67396ed38c755280e04f7e", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "25": { + "seq_id": 25, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 838, + "dna_seq_len": 501, + "dna_seq_hash": "8478cdd016753651cd73afc4ad20c7df", + "aa_seq_len": 167, + "aa_seq_hash": "6512669779521a6792ecdae3088467f7", + 
"dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "26": { + "seq_id": 26, + "locus_name": "hisD", + "locus_name_alt": "SALM25362", + "locus_product": "histidinol dehydrogenase", + "locus_description": NaN, + "locus_uid": 907, + "dna_seq_len": 501, + "dna_seq_hash": "ab935d39fffeff601d95a8362ba454f3", + "aa_seq_len": 167, + "aa_seq_hash": "1c277aef51e883e29ee8b489c525ea1b", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "27": { + "seq_id": 27, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 24, + "dna_seq_len": 399, + "dna_seq_hash": "a7af783dc7084f1b8bc593aa29f80003", + "aa_seq_len": 133, + "aa_seq_hash": "46a0c532edb92303b1b9d12a80056a60", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "28": { + "seq_id": 28, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 48, + "dna_seq_len": 399, + "dna_seq_hash": "9fb313e6232b0d0e14d2fc4be7c409f7", + "aa_seq_len": 133, + "aa_seq_hash": "0e56efdd1f7fbaf132524616e29d98ca", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "29": { 
+ "seq_id": 29, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 317, + "dna_seq_len": 399, + "dna_seq_hash": "50cd750e2f6860dd489040f1d5f64f9b", + "aa_seq_len": 133, + "aa_seq_hash": "18e887a66ce56a930dbf8db48b406596", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "30": { + "seq_id": 30, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 487, + "dna_seq_len": 399, + "dna_seq_hash": "0e1384e36f3897f65690f9230d2bcd73", + "aa_seq_len": 133, + "aa_seq_hash": "20c9a488aa6542257a151ced866d2f8f", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "31": { + "seq_id": 31, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 608, + "dna_seq_len": 399, + "dna_seq_hash": "e180fd1852382c132851674a9e379c03", + "aa_seq_len": 133, + "aa_seq_hash": "c7da76b50946241fe125348a19a9b6a3", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "32": { + "seq_id": 32, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 611, + 
"dna_seq_len": 399, + "dna_seq_hash": "0ec842f985e93041c928ab7bb137295d", + "aa_seq_len": 133, + "aa_seq_hash": "be3990f2abaa8780b14e62d4fc8cd82a", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "33": { + "seq_id": 33, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 612, + "dna_seq_len": 399, + "dna_seq_hash": "9d42e484ea2936f87312f07abf0ad84a", + "aa_seq_len": 133, + "aa_seq_hash": "7af624e3930c7a5ab7785b08d925081c", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "34": { + "seq_id": 34, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 619, + "dna_seq_len": 399, + "dna_seq_hash": "02949c6f858f3cc5de1b13c9f5a40705", + "aa_seq_len": 133, + "aa_seq_hash": "52d120d4090a22e450633e01e4ccb729", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "35": { + "seq_id": 35, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 631, + "dna_seq_len": 315, + "dna_seq_hash": "c4715d7df9a9eebfe5a334dd55ee469b", + "aa_seq_len": 105, + "aa_seq_hash": "31aa38918b303bf67374188e11413e59", + "dna_min_len": 220, + "dna_max_len": 535, + "aa_min_len": 
73, + "aa_max_len": 178, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "36": { + "seq_id": 36, + "locus_name": "purE", + "locus_name_alt": "SALM25363", + "locus_product": "phosphoribosylaminoimidazole carboxylase catalytic subunit", + "locus_description": NaN, + "locus_uid": 724, + "dna_seq_len": 399, + "dna_seq_hash": "782d08e7ee8a031a1402020e708bfbbc", + "aa_seq_len": 133, + "aa_seq_hash": "b5f9063808b8be839e7f169bf73c88e4", + "dna_min_len": 279, + "dna_max_len": 678, + "aa_min_len": 93, + "aa_max_len": 226, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "37": { + "seq_id": 37, + "locus_name": "sucA", + "locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 1, + "dna_seq_len": 501, + "dna_seq_hash": "481b6454f33fae7875b4978c14094ec3", + "aa_seq_len": 167, + "aa_seq_hash": "fa04457773c66ae015014e915af2516d", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "38": { + "seq_id": 38, + "locus_name": "sucA", + "locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 30, + "dna_seq_len": 501, + "dna_seq_hash": "79048d21794195277a6af839be13e6e1", + "aa_seq_len": 167, + "aa_seq_hash": "186c53cb5c2bf0b7ecac853c6067065d", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "39": { + "seq_id": 39, + "locus_name": "sucA", + 
"locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 281, + "dna_seq_len": 501, + "dna_seq_hash": "f10d273aa97d5556a43b96721d666975", + "aa_seq_len": 167, + "aa_seq_hash": "4172d5e8c8265884fe5479e10527cb02", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "40": { + "seq_id": 40, + "locus_name": "sucA", + "locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 399, + "dna_seq_len": 501, + "dna_seq_hash": "1839775cc7c29412648ec7b004e1a417", + "aa_seq_len": 167, + "aa_seq_hash": "c4cfbbf5c5814829188f4f404f312bd3", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "41": { + "seq_id": 41, + "locus_name": "sucA", + "locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 571, + "dna_seq_len": 501, + "dna_seq_hash": "fce3e68952108e415579b3ad24a3f150", + "aa_seq_len": 167, + "aa_seq_hash": "43372b6526524f5ed4542be83b5b8614", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "42": { + "seq_id": 42, + "locus_name": "sucA", + "locus_name_alt": "SALM25364", + "locus_product": "2-oxoglutarate dehydrogenase E1 component", + "locus_description": NaN, + "locus_uid": 686, + "dna_seq_len": 501, + "dna_seq_hash": "629ea0cbfe0d2e9f34b1ca034a6c55fd", + "aa_seq_len": 167, + "aa_seq_hash": 
"c4cfbbf5c5814829188f4f404f312bd3", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "43": { + "seq_id": 43, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 1, + "dna_seq_len": 501, + "dna_seq_hash": "eaec644b411bd0b3ab1e086fbabd29c9", + "aa_seq_len": 167, + "aa_seq_hash": "bfe756f2f421db752907a171f3a44d69", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "44": { + "seq_id": 44, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 31, + "dna_seq_len": 501, + "dna_seq_hash": "97e4acce4e840b1c48de51f55fccf620", + "aa_seq_len": 167, + "aa_seq_hash": "be9296cb1ea9443fb43c0f967d107988", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "45": { + "seq_id": 45, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 208, + "dna_seq_len": 501, + "dna_seq_hash": "fbc6cb34cddfb1fe6a7806d5f7613259", + "aa_seq_len": 167, + "aa_seq_hash": "b788ec581475c9ba71d997b2db6e1def", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + 
"min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "46": { + "seq_id": 46, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 630, + "dna_seq_len": 501, + "dna_seq_hash": "ce58c0cacd4e8d9fa4867d11f2add864", + "aa_seq_len": 167, + "aa_seq_hash": "c062c5c88bdebdf2883e06fe6823c71c", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "47": { + "seq_id": 47, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 631, + "dna_seq_len": 501, + "dna_seq_hash": "949426df5430f94547459d06c786d77b", + "aa_seq_len": 167, + "aa_seq_hash": "dac50e2b5df83fe87c9826ecf99d568e", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "48": { + "seq_id": 48, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 632, + "dna_seq_len": 501, + "dna_seq_hash": "9a187a6b3e4675fe12ea213c7a23577c", + "aa_seq_len": 167, + "aa_seq_hash": "6536824faaa7880cfb44a6cd1ed057c9", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "49": { + "seq_id": 49, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate 
kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 633, + "dna_seq_len": 501, + "dna_seq_hash": "7be8b9732228c1f82630b547d7011a5e", + "aa_seq_len": 167, + "aa_seq_hash": "1eac2cb94b8f619df1c9b0f3369f4a96", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "50": { + "seq_id": 50, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 637, + "dna_seq_len": 501, + "dna_seq_hash": "1895acdf991b49a885873fe82ce9ca85", + "aa_seq_len": 167, + "aa_seq_hash": "9fe9521d0bf495570a0fd425c0e48764", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "51": { + "seq_id": 51, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 638, + "dna_seq_len": 501, + "dna_seq_hash": "9776bbec78b5214d3dfca0d32b395d4b", + "aa_seq_len": 167, + "aa_seq_hash": "2914d167cc3579348e36d16afc628a39", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + }, + "52": { + "seq_id": 52, + "locus_name": "thrA", + "locus_name_alt": "SALM25365", + "locus_product": "bifunctional aspartate kinase/homoserine dehydrogenase I", + "locus_description": NaN, + "locus_uid": 748, + "dna_seq_len": 501, + "dna_seq_hash": "6cf9d69644c819d9ecd3a0fd090977fc", + "aa_seq_len": 167, + "aa_seq_hash": 
"cf0168a601a4f5792c7326a2da650edb", + "dna_min_len": 350, + "dna_max_len": 851, + "aa_min_len": 116, + "aa_max_len": 283, + "dna_min_ident": 80, + "aa_min_ident": 80, + "min_dna_match_cov": 80, + "min_aa_match_cov": 80, + "count_int_stops": 0, + "locus_type": "mlst" + } + } +} \ No newline at end of file diff --git a/tests/data/databases/locidex_salm/results.json b/tests/data/databases/locidex_salm/results.json new file mode 100755 index 00000000..9b1627e8 --- /dev/null +++ b/tests/data/databases/locidex_salm/results.json @@ -0,0 +1,15 @@ +{ + "analysis_start_time": "2024-02-26 15:55:10", + "parameters": { + "input_file": "/Users/jrobertson/PycharmProjects/locidex/locidex/example/build_db_mlst/senterica.mlst.txt", + "outdir": "/Users/jrobertson/PycharmProjects/locidex/locidex/example/mlst_locidex_db", + "name": "Locidex Database", + "author": "", + "date": "", + "db_ver": "1.0.0", + "db_desc": "", + "force": true + }, + "result_file": "/Users/jrobertson/PycharmProjects/locidex/locidex/example/mlst_locidex_db", + "analysis_end_time": "2024-02-26 15:55:10" +} \ No newline at end of file diff --git a/tests/data/profiles/salmonella_GCA_000008105.expected_profile.json.gz b/tests/data/profiles/salmonella_GCA_000008105.expected_profile.json.gz new file mode 100644 index 00000000..0336bd4d Binary files /dev/null and b/tests/data/profiles/salmonella_GCA_000008105.expected_profile.json.gz differ diff --git a/tests/main.nf.test b/tests/main.nf.test index 8bab62cf..323d99bf 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -28,6 +28,7 @@ nextflow_pipeline { skip_checkm = true skip_raw_read_metrics = true skip_polishing = true + skip_allele_calling = true max_memory = "2.GB" max_cpus = 1 @@ -39,7 +40,7 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // parse output json file - def json = path("$launchDir/results/SummaryReport/final_report.json").json + def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json assert 
json.CSE.CSE.FastP.summary.sequencing.equals("paired end (150 cycles + 150 cycles)") assert json.CSE.CSE.FastP.summary.before_filtering.total_reads.equals(248) @@ -54,10 +55,10 @@ nextflow_pipeline { assert json.CSE.meta.merge.equals(false) assert json.CSE.meta.downsampled.equals(false) - def assembly_path = "$launchDir/results/assembly/length_filtered_contigs/CSE_filtered.fasta.gz" + def assembly_path = "$launchDir/results/Assembly/FinalAssembly/CSE/CSE.final.filtered.assembly.fasta.gz" assert path(assembly_path).exists().equals(false) - def contigs_path = "$launchDir/results/assembly/length_filtered_contigs" + def contigs_path = "$launchDir/results/Assembly/FinalAssembly" assert path(contigs_path).exists().equals(false) // compare IRIDA Next JSON output @@ -67,15 +68,35 @@ nextflow_pipeline { def iridanext_metadata = iridanext_json.metadata.samples // output files - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.json" }.size() == 1 - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.tsv" }.size() == 1 - assert iridanext_samples.CSE.findAll { it.path.contains("assembly/length_filtered_contigs") }.size() == 0 - assert iridanext_samples.CSE.findAll { it.path.contains("assembly/CSE_flat_sample.json.gz") }.size() == 0 - - // output metadata - //assert iridanext_metadata.CSE."AssemblyCompleted" == false + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.CSE.findAll { it.path.contains("Assembly/FinalAssembly") }.size() == 0 + assert iridanext_samples.CSE.findAll { it.path.contains("Assembly/FinalReports/FlattenedReports/CSE.flat_sample.json.gz") }.size() == 0 + + // metadata + assert iridanext_metadata.CSE.containsKey("QC Status") + assert iridanext_metadata.CSE."QC Status" == "FAILED" + assert 
!iridanext_metadata.CSE.containsKey("Checkm Status") + assert !iridanext_metadata.CSE.containsKey("Checkm Value") + assert !iridanext_metadata.CSE.containsKey("Average Coverage Status") + assert !iridanext_metadata.CSE.containsKey("Average Coverage Value") + assert !iridanext_metadata.CSE.containsKey("n50 Status") + assert !iridanext_metadata.CSE.containsKey("n50 Value") + assert !iridanext_metadata.CSE.containsKey("Raw Average Quality Status") + assert !iridanext_metadata.CSE.containsKey("Raw Average Quality Value") + assert !iridanext_metadata.CSE.containsKey("Length Status") + assert !iridanext_metadata.CSE.containsKey("Length Value") + assert !iridanext_metadata.CSE.containsKey("nr contigs Status") + assert !iridanext_metadata.CSE.containsKey("nr contigs Value") + assert iridanext_metadata.CSE."QC Summary" == "[FAILED] Sample is likely contaminated, REISOLATION AND RESEQUENCING RECOMMENDED\nPassed Tests: 0/6\nSpecies ID: null" + + assert iridanext_metadata.CSE."Downsampled" == false + assert !iridanext_metadata.CSE.containsKey("Species") + assert !iridanext_metadata.CSE.containsKey("GC (%)") + //assert iridanext_metadata.CSE."Mean Sequence Length Forward" == 150 + // Base count after decontamination + assert iridanext_metadata.CSE."BaseCount" == 37200 } - } test("Should run without failure.") { @@ -96,6 +117,8 @@ nextflow_pipeline { min_reads = 100 + skip_allele_calling = true + skip_bakta = true skip_staramr = false @@ -114,7 +137,7 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // parse output json file - def json = path("$launchDir/results/SummaryReport/final_report.json").json + def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json assert json.short.short.RawReadSummary.R1."total_bp".equals(118750) assert json.short.short.RawReadSummary.R1."total_reads".equals(475) @@ -142,7 +165,7 @@ nextflow_pipeline { // Below two values should be empty assert json.short.short.StarAMR."0"."Genotype".equals("None") - 
assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Sensitive") + assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible") assert json.short.short.StarAMR."0"."Genome Length".equals("4949") assert json.short.short.CheckM."0"."# genomes".equals("5656") @@ -150,11 +173,11 @@ nextflow_pipeline { assert json.short.short.CheckM."0"."# marker sets".equals("24") assert json.short.short.CheckM."0".Contamination.equals("0.00") - assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short_polished.fasta.gz") + assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz") assert json.short.short.Abricate."0".RESISTANCE.equals("NoData") // All Abricate results for this are "NoData". - def assembly_path = "$launchDir/results/assembly/length_filtered_contigs/short_filtered.fasta.gz" + def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" assert path(assembly_path).exists() // parse assembly file @@ -168,17 +191,41 @@ nextflow_pipeline { def iridanext_metadata = iridanext_json.metadata.samples // output files - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.json" }.size() == 1 - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.tsv" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "assembly/length_filtered_contigs/short_filtered.fasta.gz" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "SummaryReport/short_flat_sample.json.gz" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == 
"Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 // output metadata - assert iridanext_metadata.short."AssemblyCompleted" == true - assert iridanext_metadata.short."QUAST.0.Total length" == "4949" - assert iridanext_metadata.short."QUAST.0.Largest contig" == "4949" - assert iridanext_metadata.short."QUAST.0.# contigs" == "1" - assert iridanext_metadata.short."QUAST.0.N50" == "4949" + assert iridanext_metadata.short."QC Status" == "FAILED" + assert iridanext_metadata.short."Checkm Status" == "PASSED" + assert iridanext_metadata.short."Checkm Value" == 0.0 + assert !iridanext_metadata.short.containsKey("Average Coverage Status") + assert !iridanext_metadata.short.containsKey("Average Coverage Value") + assert iridanext_metadata.short."n50 Status" == "WARNING" + assert iridanext_metadata.short."n50 Value" == 4949 + assert iridanext_metadata.short."Raw Average Quality Status" == "PASSED" + assert iridanext_metadata.short."Raw Average Quality Value" == 40.0 + assert iridanext_metadata.short."Length Status" == "WARNING" + assert iridanext_metadata.short."Length Value" == 4949 + assert iridanext_metadata.short."nr contigs Status" == "WARNING" + 
assert iridanext_metadata.short."nr contigs Value" == 1 + assert iridanext_metadata.short."QC Summary" == "[FAILED] RESEQUENCING IS RECOMMENDED\nPassed Tests: 5/6\nSpecies ID: No Species Identified" + + assert iridanext_metadata.short."Downsampled" == false + assert iridanext_metadata.short."Species" == "No Species Identified" + assert iridanext_metadata.short."GC (%)" == "52.96" + assert iridanext_metadata.short."Mean Sequence Length Forward" == 250 + assert iridanext_metadata.short."BaseCount" == 237500 + + assert iridanext_metadata.short."StarAMR Genotype" == "None" + assert iridanext_metadata.short."StarAMR Predicted Phenotype" == "Susceptible" } } diff --git a/tests/modules/local/bin_kraken2/bin_kraken2.nf.test b/tests/modules/local/bin_kraken2/bin_kraken2.nf.test new file mode 100644 index 00000000..9bcaf995 --- /dev/null +++ b/tests/modules/local/bin_kraken2/bin_kraken2.nf.test @@ -0,0 +1,41 @@ +nextflow_process { + name "Test Process BIN_KRAKEN2" + script "modules/local/bin_kraken2.nf" + process "BIN_KRAKEN2" + tag "modules" + tag "bin_kraken2" + + test("Test contig binning using a kraken2 output"){ + when { + process { + """ + input[0] = Channel.of([ + [id: "SAMPLE1"], + file("$baseDir/tests/data/kraken2/output.k2.cls.fa"), + file("$baseDir/tests/data/kraken2/output.k2.report"), + file("$baseDir/tests/data/kraken2/output.k2.out") + ]) + input[1] = Channel.from("S") + """ + } + + params{ + outdir = "kraken2_bin" + } + } + then { + assert process.success + + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Escherichia_virus_Lambda.kraken2.binned.assembly.fasta.gz").exists() + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Human_immunodeficiency_virus_1.kraken2.binned.assembly.fasta.gz").exists() + assert 
path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Human_immunodeficiency_virus_2.kraken2.binned.assembly.fasta.gz").exists() + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Influenza_A_virus.kraken2.binned.assembly.fasta.gz").exists() + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Influenza_B_virus.kraken2.binned.assembly.fasta.gz").exists() + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Middle_East_respiratory_syndrome-related_coronavirus.kraken2.binned.assembly.fasta.gz").exists() + assert path("$launchDir/kraken2_bin/Assembly/PostProcessing/Metagenomic/BinnedContigs/Kraken2/SAMPLE1.SAMPLE1_Severe_acute_respiratory_syndrome-related_coronavirus.kraken2.binned.assembly.fasta.gz").exists() + + } + } + + +} diff --git a/tests/modules/local/seqkit_stats/seqkit_stats.nf.test b/tests/modules/local/seqkit_stats/seqkit_stats.nf.test index af867b88..0d93181e 100644 --- a/tests/modules/local/seqkit_stats/seqkit_stats.nf.test +++ b/tests/modules/local/seqkit_stats/seqkit_stats.nf.test @@ -25,7 +25,7 @@ nextflow_process { then { assert process.success - def stats_data = path("$launchDir/seqkit_stats_test1/assembly/quality/seqkit_stats/SAMPLE1.tsv").readLines() + def stats_data = path("$launchDir/seqkit_stats_test1/Assembly/Quality/SeqKitStats/SAMPLE1.seqkit.stats.summary.tsv").readLines() assert stats_data.contains("file\tformat\ttype\tnum_seqs\tsum_len\tmin_len\tavg_len\tmax_len") assert stats_data.contains("sample1.fasta\tFASTA\tDNA\t4\t10\t1\t2.5\t4") diff --git a/tests/modules/local/seqtk_size/seqtk_size.nf.test b/tests/modules/local/seqtk_size/seqtk_size.nf.test new file mode 100644 index 00000000..94f71760 --- /dev/null +++ b/tests/modules/local/seqtk_size/seqtk_size.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + name "Test 
Process SEQTK_SIZE" + script "modules/local/seqtk_size.nf" + process "SEQTK_SIZE" + tag "modules" + tag "seqkit_stats" + + test("Test seqtk_size single end") { + tag "single_end" + + when { + process { + """ + input[0] = Channel.of([ + [id: "SAMPLE1", single_end: true], + [file("$baseDir/tests/data/reads/1_R1.fq.gz", checkIfExists: true)] + ]) + """ + } + + params { + outdir = "results" + } + } + + then { + assert process.success + def data = path("$launchDir/results/Reads/FinalReads/BaseCounts/SAMPLE1.SAMPLE1_basecounts.seqtk.size.txt").text + assert data == "475\t118750\n" + } + } + + test("Test seqtk_size paired end") { + tag "paired_end" + + when { + process { + """ + input[0] = Channel.of([ + [id: "SAMPLE1", single_end: false], + [file("$baseDir/tests/data/reads/1_R1.fq.gz", checkIfExists: true), + file("$baseDir/tests/data/reads/1_R2.fq.gz", checkIfExists: true)] + ]) + """ + } + + params { + outdir = "results" + } + } + + then { + assert process.success + def data = path("$launchDir/results/Reads/FinalReads/BaseCounts/SAMPLE1.SAMPLE1_basecounts.seqtk.size.txt").text + assert data == "950\t237500\n" + } + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config index 5c9c86e5..8ab2078a 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -8,12 +8,17 @@ params.max_memory = "2.GB" params.max_cpus = 1 params.fastp.illumina.args = "-Q" -/* This is required for tests to run in WSL/Ubuntu using singularity - Without this, ECTyper was not successfully completing. - More information related to the ECTyper error I encountered - is found at https://github.com/marcelm/cutadapt/issues/583 + +/* This is required for tests to run in WSL/Ubuntu using singularity + Without this, ECTyper was not successfully completing. 
+ More information related to the ECTyper error I encountered + is found at https://github.com/marcelm/cutadapt/issues/583 + + 2024-04-18: Updated from contains to containall in a hope to resolve + an issue with QUAST and gridss after reading through the above issue + sadly it did not work but apparently containall is a good catchall. */ -singularity.runOptions = "--contain" +singularity.runOptions = "--containall" /* Remove gzipping on JSON output for testing/asserts on file contents */ iridanext.output.path = "${params.outdir}/iridanext.output.json" diff --git a/tests/pipelines/main.from_assemblies.nf.test b/tests/pipelines/main.from_assemblies.nf.test index a62bb217..0ca478c8 100644 --- a/tests/pipelines/main.from_assemblies.nf.test +++ b/tests/pipelines/main.from_assemblies.nf.test @@ -27,6 +27,8 @@ nextflow_pipeline { skip_staramr = false skip_subtyping = false + skip_allele_calling = true + max_memory = "2.GB" max_cpus = 1 } @@ -37,12 +39,13 @@ nextflow_pipeline { assert path("$launchDir/results").exists() // parse final report json file - def final_report = path("$launchDir/results/SummaryReport/final_report.json").json + def final_report = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json def ecoli_json = final_report.ecoli_GCA_000947975.ecoli_GCA_000947975 assert ecoli_json.StarAMR."0".Genotype.equals("aph(3'')-Ib, aph(6)-Id, blaCTX-M-15, blaTEM-1B, dfrA7, gyrA (S83A), qacE, sul1, sul2, tet(A)") assert ecoli_json.StarAMR."0"."Predicted Phenotype".equals("streptomycin, kanamycin, ampicillin, ceftriaxone, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline") + assert ecoli_json.StarAMR."0"."CGE Predicted Phenotype".equals("Streptomycin, Amoxicillin, Ampicillin, Aztreonam, Cefepime, Cefotaxime, Ceftazidime, Ceftriaxone, Piperacillin, Ticarcillin, Cephalothin, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium 
Chloride, Sulfamethoxazole, Doxycycline, Tetracycline") assert ecoli_json.StarAMR."0".Plasmid.equals("IncQ1") - assert ecoli_json.StarAMR."0".Scheme.equals("ecoli") + assert ecoli_json.StarAMR."0".Scheme.equals("ecoli_achtman_4") assert ecoli_json.StarAMR."0"."Sequence Type".equals("678") assert ecoli_json.PointfinderDB.equals("escherichia_coli") assert ecoli_json.ECTyperSubtyping."0".Species.equals("Escherichia coli") @@ -52,9 +55,16 @@ nextflow_pipeline { assert ecoli_json.ECTyperSubtyping."0".Database.equals("v1.0 (11-03-2020)") assert ecoli_json.ECTyperSubtyping."0".QC.equals("PASS (REPORTABLE)") + assert final_report.ecoli_GCA_000947975.QualityAnalysis.n50_value.value == 122025 + assert final_report.ecoli_GCA_000947975.QualityAnalysis.n50_value.qc_status == "PASSED" + assert final_report.ecoli_GCA_000947975.QualityAnalysis.nr_contigs.qc_status == "PASSED" + assert final_report.ecoli_GCA_000947975.QualityAnalysis.nr_contigs.value == 187 + assert final_report.ecoli_GCA_000947975.QualityAnalysis.length.value == 5333525 + assert final_report.ecoli_GCA_000947975.QualityAnalysis.length.qc_status == "PASSED" + // parse final report tsv file def final_report_tmap = [:] - def final_report_tsv = path("$launchDir/results/SummaryReport/final_report.tsv").readLines() + def final_report_tsv = path("$launchDir/results/FinalReports/Aggregated/Tables/final_report.tsv").readLines() final_report_tsv.each { def tokens = it.split("\t") final_report_tmap[tokens[0]] = tokens[1] @@ -66,8 +76,9 @@ nextflow_pipeline { assert final_report_tmap."QUAST.0.N50" == "122025" assert final_report_tmap."StarAMR.0.Genotype" == "aph(3'')-Ib, aph(6)-Id, blaCTX-M-15, blaTEM-1B, dfrA7, gyrA (S83A), qacE, sul1, sul2, tet(A)" assert final_report_tmap."StarAMR.0.Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, ceftriaxone, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" + assert final_report_tmap."StarAMR.0.CGE Predicted Phenotype" == 
"Streptomycin, Amoxicillin, Ampicillin, Aztreonam, Cefepime, Cefotaxime, Ceftazidime, Ceftriaxone, Piperacillin, Ticarcillin, Cephalothin, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium Chloride, Sulfamethoxazole, Doxycycline, Tetracycline" assert final_report_tmap."StarAMR.0.Plasmid" == "IncQ1" - assert final_report_tmap."StarAMR.0.Scheme" == "ecoli" + assert final_report_tmap."StarAMR.0.Scheme" == "ecoli_achtman_4" assert final_report_tmap."StarAMR.0.Sequence Type" == "678" assert final_report_tmap."ECTyperSubtyping.0.Species" == "Escherichia coli" assert final_report_tmap."ECTyperSubtyping.0.O-type" == "O104" @@ -76,6 +87,7 @@ nextflow_pipeline { assert final_report_tmap."ECTyperSubtyping.0.Database" == "v1.0 (11-03-2020)" assert final_report_tmap."ECTyperSubtyping.0.QC" == "PASS (REPORTABLE)" + // compare IRIDA Next JSON output def iridanext_json = path("$launchDir/results/iridanext.output.json").json def iridanext_global = iridanext_json.files.global @@ -83,28 +95,223 @@ nextflow_pipeline { def iridanext_metadata = iridanext_json.metadata.samples // output files - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.json" }.size() == 1 - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.tsv" }.size() == 1 - assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "SummaryReport/ecoli_GCA_000947975_flat_sample.json.gz" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Quality/QUAST/ecoli_GCA_000947975/ecoli_GCA_000947975.transposed_ecoli_GCA_000947975.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == 
"Assembly/Quality/SeqKitStats/ecoli_GCA_000947975.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/ecoli_GCA_000947975.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.blast_output_alleles.ectyper.subtyping.txt" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.ectyper.ectyper.subtyping.log" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.output.ectyper.subtyping.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "FinalReports/FlattenedReports/ecoli_GCA_000947975.flat_sample.json.gz" }.size() == 1 // output metadata def ecoli_metadata = iridanext_metadata.ecoli_GCA_000947975 - assert ecoli_metadata."SpeciesTopHit" == "s__Escherichia coli" - assert ecoli_metadata."QUAST.0.Total length" == "5333525" - assert ecoli_metadata."QUAST.0.Largest contig" == "300823" - assert ecoli_metadata."QUAST.0.# contigs" == "187" - assert ecoli_metadata."QUAST.0.N50" == "122025" - assert ecoli_metadata."StarAMR.0.Genotype" == "aph(3'')-Ib, aph(6)-Id, blaCTX-M-15, blaTEM-1B, dfrA7, gyrA (S83A), qacE, sul1, sul2, tet(A)" - assert ecoli_metadata."StarAMR.0.Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, ceftriaxone, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" - assert ecoli_metadata."StarAMR.0.Plasmid" == "IncQ1" - assert ecoli_metadata."StarAMR.0.Scheme" == "ecoli" - assert ecoli_metadata."StarAMR.0.Sequence Type" == "678" - assert ecoli_metadata."ECTyperSubtyping.0.Species" == "Escherichia coli" - assert ecoli_metadata."ECTyperSubtyping.0.O-type" 
== "O104" - assert ecoli_metadata."ECTyperSubtyping.0.H-type" == "H4" - assert ecoli_metadata."ECTyperSubtyping.0.Serotype" == "O104:H4" - assert ecoli_metadata."ECTyperSubtyping.0.Database" == "v1.0 (11-03-2020)" - assert ecoli_metadata."ECTyperSubtyping.0.QC" == "PASS (REPORTABLE)" + + assert ecoli_metadata."QC Status" == "FAILED" + assert !ecoli_metadata.containsKey("Checkm Status") + assert !ecoli_metadata.containsKey("Checkm Value") + assert !ecoli_metadata.containsKey("Average Coverage Status") + assert !ecoli_metadata.containsKey("Average Coverage Value") + assert ecoli_metadata."n50 Status" == "PASSED" + assert ecoli_metadata."n50 Value" == 122025 + assert !ecoli_metadata.containsKey("Raw Average Quality Status") + assert !ecoli_metadata.containsKey("Raw Average Quality Value") + assert ecoli_metadata."Length Status" == "PASSED" + assert ecoli_metadata."Length Value" == 5333525 + assert ecoli_metadata."nr contigs Status" == "PASSED" + assert ecoli_metadata."nr contigs Value" == 187 + assert ecoli_metadata."QC Summary" == "[FAILED] Sample is likely contaminated, REISOLATION AND RESEQUENCING RECOMMENDED\nPassed Tests: 3/4\nSpecies ID: s__Escherichia coli" + + assert ecoli_metadata."Downsampled" == false + assert ecoli_metadata."Species" == "s__Escherichia coli" + + assert ecoli_metadata."ECTyper Database" == "v1.0 (11-03-2020)" + assert ecoli_metadata."ECTyper Evidence" == "Based on 3 allele(s)" + assert ecoli_metadata."ECTyper GeneCoverages (%)" == "100;100;100;" + assert ecoli_metadata."ECTyper GeneIdentities (%)" == "100;100;100;" + assert ecoli_metadata."ECTyper GeneScores" == "wzx:1;wzy:1;fliC:1;" + assert ecoli_metadata."ECTyper H-Antigen" == "H4" + assert ecoli_metadata."ECTyper O-Antigen" == "O104" + assert ecoli_metadata."ECTyper QCFlag" == "PASS (REPORTABLE)" + assert ecoli_metadata."ECTyper Serotype" == "O104:H4" + assert ecoli_metadata."ECTyper Subtyping" == "Escherichia coli" + assert ecoli_metadata."ECTyper Warnings" == "-" + + assert 
ecoli_metadata."GC (%)" == "50.58" + assert !ecoli_metadata.containsKey("Mean Sequence Length Forward") + assert !ecoli_metadata.containsKey("BaseCount") + + assert ecoli_metadata."icd" == "136" + assert ecoli_metadata."recA" == "7" + assert ecoli_metadata."fumC" == "6" + assert ecoli_metadata."adk" == "6" + assert ecoli_metadata."gyrB" == "5" + assert ecoli_metadata."purA" == "7" + assert ecoli_metadata."mdh" == "9" + assert ecoli_metadata."7 Gene ST" == "678" + assert ecoli_metadata."7 Gene Scheme" == "ecoli" + + assert ecoli_metadata."StarAMR Genotype" == "aph(3'')-Ib, aph(6)-Id, blaCTX-M-15, blaTEM-1B, dfrA7, gyrA (S83A), qacE, sul1, sul2, tet(A)" + assert ecoli_metadata."StarAMR Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, ceftriaxone, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" + assert ecoli_metadata."StarAMR CGE Predicted Phenotype" == "Streptomycin, Amoxicillin, Ampicillin, Aztreonam, Cefepime, Cefotaxime, Ceftazidime, Ceftriaxone, Piperacillin, Ticarcillin, Cephalothin, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium Chloride, Sulfamethoxazole, Doxycycline, Tetracycline" + assert ecoli_metadata."StarAMR Plasmid" == "IncQ1" + + // Read in filtered assembly fasta to verify number of contigs + def assemblyLines = path("$launchDir/results/Assembly/FinalAssembly/ecoli_GCA_000947975/ecoli_GCA_000947975.final.filtered.assembly.fasta.gz").readLinesGzip() + def actualNumberContigs = assemblyLines.findAll { it.startsWith(">") }.size() + def actualLength = assemblyLines.findAll { !it.startsWith(">") }.collect { it.strip().size() }.sum() + assert actualNumberContigs == 187 + assert actualLength == 5333525 + } + } + + test("Test starting from assemblies ecoli with default min contig without speciation enabled for StarAMR") { + tag "from_assemblies_ecoli_default_min_contig_no_speciation" + + when { + params { + input = 
"$baseDir/tests/data/samplesheets/samplesheet-test-from-assemblies.csv" + outdir = "results" + + platform = "illumina" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + kraken2_db = "$baseDir/tests/data/kraken2/test" + + // Use default min contig, should be 1000 + //qt_min_contig_length = 1000 + + skip_bakta = true + skip_mobrecon = true + skip_checkm = true + skip_raw_read_metrics = true + skip_polishing = true + skip_species_classification = true + + skip_staramr = false + skip_subtyping = false + + skip_allele_calling = true + + max_memory = "2.GB" + max_cpus = 1 + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + // output files + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Quality/QUAST/ecoli_GCA_000947975/ecoli_GCA_000947975.transposed_ecoli_GCA_000947975.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Quality/SeqKitStats/ecoli_GCA_000947975.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "FinalReports/FlattenedReports/ecoli_GCA_000947975.flat_sample.json.gz" }.size() == 1 + + // output metadata + def ecoli_metadata = iridanext_metadata.ecoli_GCA_000947975 + assert ecoli_metadata."QC Status" == "FAILED" + assert 
ecoli_metadata."n50 Value" == 122025 + assert ecoli_metadata."Length Value" == 5299656 + assert ecoli_metadata."nr contigs Value" == 123 + assert ecoli_metadata."StarAMR Genotype" == "aph(3'')-Ib, aph(6)-Id, blaCTX-M-15, blaTEM-1B, dfrA7, qacE, sul1, sul2, tet(A)" + + // Read in filtered assembly fasta to verify number of contigs + def assemblyLines = path("$launchDir/results/Assembly/FinalAssembly/ecoli_GCA_000947975/ecoli_GCA_000947975.final.filtered.assembly.fasta.gz").readLinesGzip() + def actualNumberContigs = assemblyLines.findAll { it.startsWith(">") }.size() + def actualLength = assemblyLines.findAll { !it.startsWith(">") }.collect { it.strip().size() }.sum() + assert actualNumberContigs == 123 + assert actualLength == 5299656 + } + } + + test("Test starting from assemblies ecoli with default min contig") { + tag "from_assemblies_ecoli_default_min_contig" + + when { + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-test-from-assemblies.csv" + outdir = "results" + + platform = "illumina" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + kraken2_db = "$baseDir/tests/data/kraken2/test" + + // Use default min contig, should be 1000 + //qt_min_contig_length = 1000 + + skip_bakta = true + skip_mobrecon = true + skip_checkm = true + skip_raw_read_metrics = true + skip_polishing = true + + skip_staramr = false + skip_subtyping = false + + skip_allele_calling = true + + max_memory = "2.GB" + max_cpus = 1 + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + // output files + 
assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Quality/QUAST/ecoli_GCA_000947975/ecoli_GCA_000947975.transposed_ecoli_GCA_000947975.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Quality/SeqKitStats/ecoli_GCA_000947975.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/ecoli_GCA_000947975.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.blast_output_alleles.ectyper.subtyping.txt" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.ectyper.ectyper.subtyping.log" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "Assembly/Subtyping/ECTyper/ecoli_GCA_000947975/ecoli_GCA_000947975.output.ectyper.subtyping.tsv" }.size() == 1 + assert iridanext_samples.ecoli_GCA_000947975.findAll { it.path == "FinalReports/FlattenedReports/ecoli_GCA_000947975.flat_sample.json.gz" }.size() == 1 + + // output metadata + def ecoli_metadata = iridanext_metadata.ecoli_GCA_000947975 + + assert ecoli_metadata."QC Status" == "FAILED" + assert !ecoli_metadata.containsKey("Checkm Status") + assert !ecoli_metadata.containsKey("Checkm Value") + assert !ecoli_metadata.containsKey("Average Coverage Status") + assert !ecoli_metadata.containsKey("Average Coverage Value") + assert ecoli_metadata."n50 Status" == "PASSED" + assert ecoli_metadata."n50 Value" == 122025 + assert !ecoli_metadata.containsKey("Raw Average Quality Status") + 
assert !ecoli_metadata.containsKey("Raw Average Quality Value") + assert ecoli_metadata."Length Status" == "PASSED" + assert ecoli_metadata."Length Value" == 5299656 + assert ecoli_metadata."nr contigs Status" == "PASSED" + assert ecoli_metadata."nr contigs Value" == 123 + assert ecoli_metadata."QC Summary" == "[FAILED] Sample is likely contaminated, REISOLATION AND RESEQUENCING RECOMMENDED\nPassed Tests: 3/4\nSpecies ID: s__Escherichia coli" + + // Read in filtered assembly fasta to verify number of contigs + def assemblyLines = path("$launchDir/results/Assembly/FinalAssembly/ecoli_GCA_000947975/ecoli_GCA_000947975.final.filtered.assembly.fasta.gz").readLinesGzip() + def actualNumberContigs = assemblyLines.findAll { it.startsWith(">") }.size() + def actualLength = assemblyLines.findAll { !it.startsWith(">") }.collect { it.strip().size() }.sum() + assert actualNumberContigs == 123 + assert actualLength == 5299656 } } @@ -131,6 +338,9 @@ nextflow_pipeline { skip_staramr = false skip_subtyping = false + skip_allele_calling = false + + allele_scheme = "$baseDir/tests/data/databases/locidex_salm" max_memory = "2.GB" max_cpus = 1 @@ -141,15 +351,26 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/subtyping/sistr/salmonella_GCA_000008105-allele.fasta").exists() - assert path("$launchDir/results/subtyping/sistr/salmonella_GCA_000008105-allele.json").exists() - assert path("$launchDir/results/subtyping/sistr/salmonella_GCA_000008105-cgmlst.csv").exists() - assert path("$launchDir/results/subtyping/sistr/salmonella_GCA_000008105.tab").exists() + assert path("$launchDir/results/Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.allele.subtyping.fasta").exists() + assert path("$launchDir/results/Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.allele.subtyping.json").exists() + assert 
path("$launchDir/results/Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.cgmlst.subtyping.csv").exists() + assert path("$launchDir/results/Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.subtyping.tab").exists() - assert path("$launchDir/results/annotations/StarAMR/salmonella_GCA_000008105/plasmidfinder.tsv").exists() - assert path("$launchDir/results/annotations/StarAMR/salmonella_GCA_000008105/pointfinder.tsv").exists() - assert path("$launchDir/results/annotations/StarAMR/salmonella_GCA_000008105/resfinder.tsv").exists() - assert path("$launchDir/results/annotations/StarAMR/salmonella_GCA_000008105/detailed_summary.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/salmonella_GCA_000008105/salmonella_GCA_000008105.plasmidfinder.staramr.annotation.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/salmonella_GCA_000008105/salmonella_GCA_000008105.pointfinder.staramr.annotation.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/salmonella_GCA_000008105/salmonella_GCA_000008105.resfinder.staramr.annotation.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/salmonella_GCA_000008105/salmonella_GCA_000008105.detailed_summary.staramr.annotation.tsv").exists() + + // Check Locidex outputs + def acutal_mlst_profile = path("$launchDir/results/Assembly/Subtyping/Locidex/Report/salmonella_GCA_000008105.locidex.report.profile.mlst.subtyping.json.gz") + assert acutal_mlst_profile.exists() + assert path("$launchDir/results/Assembly/Subtyping/Locidex/Extract/salmonella_GCA_000008105/salmonella_GCA_000008105.raw.locidex.extract.extracted.seqs.subtyping.fasta.gz").exists() + assert path("$launchDir/results/Assembly/Subtyping/Locidex/Search/salmonella_GCA_000008105.locidex.seq_store.json.gz").exists() + + // comparing bytes, as both files are in gzipped + def actual_mlst_profile_bytes = 
path("$launchDir/results/Assembly/Subtyping/Locidex/Report/salmonella_GCA_000008105.locidex.report.profile.mlst.subtyping.json.gz").getBytes() + def expected_mlst_profile_bytes = path("$baseDir/tests/data/profiles/salmonella_GCA_000008105.expected_profile.json.gz").getBytes() + assert actual_mlst_profile_bytes.md5() == expected_mlst_profile_bytes.md5() // compare IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").exists() @@ -157,80 +378,123 @@ nextflow_pipeline { def iridanext_global = iridanext_json.files.global def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - - assert iridanext_metadata.salmonella_GCA_000008105."meta.assembly" == true - assert iridanext_metadata.salmonella_GCA_000008105."meta.hybrid" == false - assert iridanext_metadata.salmonella_GCA_000008105."meta.single_end" == false + def salmonella_metadata = iridanext_metadata.salmonella_GCA_000008105 + + assert salmonella_metadata."QC Status" == "FAILED" + assert !salmonella_metadata.containsKey("Checkm Status") + assert !salmonella_metadata.containsKey("Checkm Value") + assert !salmonella_metadata.containsKey("Average Coverage Status") + assert !salmonella_metadata.containsKey("Average Coverage Value") + assert salmonella_metadata."n50 Status" == "PASSED" + assert salmonella_metadata."n50 Value" == 4755700 + assert !salmonella_metadata.containsKey("Raw Average Quality Status") + assert !salmonella_metadata.containsKey("Raw Average Quality Value") + assert salmonella_metadata."Length Status" == "PASSED" + assert salmonella_metadata."Length Value" == 4944000 + assert salmonella_metadata."nr contigs Status" == "PASSED" + assert salmonella_metadata."nr contigs Value" == 3 + assert salmonella_metadata."QC Summary" == "[FAILED] Sample is likely contaminated, REISOLATION AND RESEQUENCING RECOMMENDED\nPassed Tests: 3/4\nSpecies ID: s__Salmonella enterica" + + assert salmonella_metadata."Downsampled" == false // parse 
final report tsv file def final_report_tmap = [:] - def final_report_tsv = path("$launchDir/results/SummaryReport/final_report.tsv").readLines() + def final_report_tsv = path("$launchDir/results/FinalReports/Aggregated/Tables/final_report.tsv").readLines() final_report_tsv.each { def tokens = it.split("\t") final_report_tmap[tokens[0]] = tokens[1] } // output files - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.json" }.size() == 1 - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.tsv" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Quality/QUAST/salmonella_GCA_000008105/salmonella_GCA_000008105.transposed_salmonella_GCA_000008105.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Quality/SeqKitStats/salmonella_GCA_000008105.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/salmonella_GCA_000008105.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.subtyping.tab" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.allele.subtyping.fasta" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.allele.subtyping.json" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Subtyping/SISTR/salmonella_GCA_000008105.sistr.cgmlst.subtyping.csv" 
}.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "Assembly/Subtyping/Locidex/Report/salmonella_GCA_000008105.locidex.report.profile.mlst.subtyping.json.gz" }.size() == 1 + assert iridanext_samples.salmonella_GCA_000008105.findAll { it.path == "FinalReports/FlattenedReports/salmonella_GCA_000008105.flat_sample.json.gz" }.size() == 1 // parse final report json file - def final_report = path("$launchDir/results/SummaryReport/final_report.json").json + def final_report = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json def salmonella_json = final_report.salmonella_GCA_000008105.salmonella_GCA_000008105 + assert final_report.salmonella_GCA_000008105.QualityAnalysis.n50_value.value == 4755700 + assert final_report.salmonella_GCA_000008105.QualityAnalysis.n50_value.qc_status == "PASSED" + assert final_report.salmonella_GCA_000008105.QualityAnalysis.nr_contigs.qc_status == "PASSED" + assert final_report.salmonella_GCA_000008105.QualityAnalysis.nr_contigs.value == 3 + assert final_report.salmonella_GCA_000008105.QualityAnalysis.length.value == 4944000 + assert final_report.salmonella_GCA_000008105.QualityAnalysis.length.qc_status == "PASSED" + // Tests assert salmonella_json.SpeciesTopHit == "s__Salmonella enterica" - assert iridanext_metadata.salmonella_GCA_000008105."SpeciesTopHit" == "s__Salmonella enterica" + assert iridanext_metadata.salmonella_GCA_000008105."Species" == "s__Salmonella enterica" assert final_report_tmap.SpeciesTopHit == "s__Salmonella enterica" assert salmonella_json.QUAST."0"."Total length" == "4944000" - assert iridanext_metadata.salmonella_GCA_000008105."QUAST.0.Total length" == "4944000" assert final_report_tmap."QUAST.0.Total length" == "4944000" assert salmonella_json.QUAST."0"."Largest contig" == "4755700" - assert iridanext_metadata.salmonella_GCA_000008105."QUAST.0.Largest contig" == "4755700" assert final_report_tmap."QUAST.0.Largest contig" == "4755700" assert 
salmonella_json.QUAST."0"."# contigs" == "3" - assert iridanext_metadata.salmonella_GCA_000008105."QUAST.0.# contigs" == "3" assert final_report_tmap."QUAST.0.# contigs" == "3" assert salmonella_json.QUAST."0"."N50" == "4755700" - assert iridanext_metadata.salmonella_GCA_000008105."QUAST.0.N50" == "4755700" assert final_report_tmap."QUAST.0.N50" == "4755700" assert salmonella_json.StarAMR."0".Genotype == "aadA1, aadA2, aadA2, aph(3'')-Ib, aph(3')-Ia, blaCMY-2, blaTEM-1B, cmlA1, dfrA12, gyrA (D87N), gyrA (S83F), qacE, sul1, sul3, tet(A)" - assert iridanext_metadata.salmonella_GCA_000008105."StarAMR.0.Genotype" == "aadA1, aadA2, aadA2, aph(3'')-Ib, aph(3')-Ia, blaCMY-2, blaTEM-1B, cmlA1, dfrA12, gyrA (D87N), gyrA (S83F), qacE, sul1, sul3, tet(A)" + assert iridanext_metadata.salmonella_GCA_000008105."StarAMR Genotype" == "aadA1, aadA2, aadA2, aph(3'')-Ib, aph(3')-Ia, blaCMY-2, blaTEM-1B, cmlA1, dfrA12, gyrA (D87N), gyrA (S83F), qacE, sul1, sul3, tet(A)" assert final_report_tmap."StarAMR.0.Genotype" == "aadA1, aadA2, aadA2, aph(3'')-Ib, aph(3')-Ia, blaCMY-2, blaTEM-1B, cmlA1, dfrA12, gyrA (D87N), gyrA (S83F), qacE, sul1, sul3, tet(A)" assert salmonella_json.StarAMR."0"."Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, chloramphenicol, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" - assert iridanext_metadata.salmonella_GCA_000008105."StarAMR.0.Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, chloramphenicol, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" + assert iridanext_metadata.salmonella_GCA_000008105."StarAMR Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, chloramphenicol, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, 
tetracycline" assert final_report_tmap."StarAMR.0.Predicted Phenotype" == "streptomycin, kanamycin, ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, chloramphenicol, trimethoprim, ciprofloxacin I/R, nalidixic acid, unknown[qacE_1_X68232], sulfisoxazole, tetracycline" + assert salmonella_json.StarAMR."0"."CGE Predicted Phenotype" == "Spectinomycin, Streptomycin, Neomycin, Kanamycin, Lividomycin, Paromomycin, Ribostamycin, Amoxicillin, Amoxicillin+Clavulanic acid, Ampicillin, Ampicillin+Clavulanic acid, Cefotaxime, Cefoxitin, Ceftazidime, Piperacillin, Piperacillin+Tazobactam, Ticarcillin, Ticarcillin+Clavulanic acid, Cephalothin, Chloramphenicol, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium Chloride, Sulfamethoxazole, Doxycycline, Tetracycline" + assert iridanext_metadata.salmonella_GCA_000008105."StarAMR CGE Predicted Phenotype" == "Spectinomycin, Streptomycin, Neomycin, Kanamycin, Lividomycin, Paromomycin, Ribostamycin, Amoxicillin, Amoxicillin+Clavulanic acid, Ampicillin, Ampicillin+Clavulanic acid, Cefotaxime, Cefoxitin, Ceftazidime, Piperacillin, Piperacillin+Tazobactam, Ticarcillin, Ticarcillin+Clavulanic acid, Cephalothin, Chloramphenicol, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium Chloride, Sulfamethoxazole, Doxycycline, Tetracycline" + assert final_report_tmap."StarAMR.0.CGE Predicted Phenotype" == "Spectinomycin, Streptomycin, Neomycin, Kanamycin, Lividomycin, Paromomycin, Ribostamycin, Amoxicillin, Amoxicillin+Clavulanic acid, Ampicillin, Ampicillin+Clavulanic acid, Cefotaxime, Cefoxitin, Ceftazidime, Piperacillin, Piperacillin+Tazobactam, Ticarcillin, Ticarcillin+Clavulanic acid, Cephalothin, Chloramphenicol, Trimethoprim, Nalidixic acid, Ciprofloxacin, Benzylkonium Chloride, Ethidium Bromide, Chlorhexidine, Cetylpyridinium Chloride, Sulfamethoxazole, Doxycycline, Tetracycline" + assert 
salmonella_json.StarAMR."0".Plasmid == "IncFIB(K), IncFIB(S), IncFII(S)" - assert iridanext_metadata.salmonella_GCA_000008105."StarAMR.0.Plasmid" == "IncFIB(K), IncFIB(S), IncFII(S)" + assert iridanext_metadata.salmonella_GCA_000008105."StarAMR Plasmid" == "IncFIB(K), IncFIB(S), IncFII(S)" assert final_report_tmap."StarAMR.0.Plasmid" == "IncFIB(K), IncFIB(S), IncFII(S)" - assert salmonella_json.StarAMR."0".Scheme == "senterica" - assert iridanext_metadata.salmonella_GCA_000008105."StarAMR.0.Scheme" == "senterica" - assert final_report_tmap."StarAMR.0.Scheme" == "senterica" + assert salmonella_json.StarAMR."0".Scheme == "senterica_achtman_2" + assert final_report_tmap."StarAMR.0.Scheme" == "senterica_achtman_2" assert salmonella_json.StarAMR."0"."Sequence Type" == "66" - assert iridanext_metadata.salmonella_GCA_000008105."StarAMR.0.Sequence Type" == "66" assert final_report_tmap."StarAMR.0.Sequence Type" == "66" assert salmonella_json.SISTRSubtyping."0".serovar == "Choleraesuis" - assert iridanext_metadata.salmonella_GCA_000008105."SISTRSubtyping.0.serovar" == "Choleraesuis" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR Serovar" == "Choleraesuis" assert final_report_tmap."SISTRSubtyping.0.serovar" == "Choleraesuis" assert salmonella_json.SISTRSubtyping."0".serogroup == "C1" - assert iridanext_metadata.salmonella_GCA_000008105."SISTRSubtyping.0.serogroup" == "C1" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR Serogroup" == "C1" assert final_report_tmap."SISTRSubtyping.0.serogroup" == "C1" assert salmonella_json.SISTRSubtyping."0".serovar_antigen == "Hissar|Choleraesuis|Paratyphi C|Typhisuis|Chiredzi" - assert iridanext_metadata.salmonella_GCA_000008105."SISTRSubtyping.0.serovar_antigen" == "Hissar|Choleraesuis|Paratyphi C|Typhisuis|Chiredzi" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR Serovar Antigen" == "Hissar|Choleraesuis|Paratyphi C|Typhisuis|Chiredzi" assert final_report_tmap."SISTRSubtyping.0.serovar_antigen" == 
"Hissar|Choleraesuis|Paratyphi C|Typhisuis|Chiredzi" + // other metadata for IRIDA Next + assert iridanext_metadata.salmonella_GCA_000008105."GC (%)" == "52.13" + assert !iridanext_metadata.salmonella_GCA_000008105.containsKey("Mean Sequence Length Forward") + assert !iridanext_metadata.salmonella_GCA_000008105.containsKey("BaseCount") + assert iridanext_metadata.salmonella_GCA_000008105."SISTR cgMLST ST" == "2572600197" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR cgMLST Found Loci" == "330" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR cgMLST Genome Match" == "ERR351246" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR cgMLST Matching Alleles" == "320" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR cgMLST Subspecies" == "enterica" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR H1" == "c" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR H2" == "1,5" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR Antigen" == "6,7" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR QC Message" == "INFO: Number of cgMLST330 loci found (n=330)" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR QC Status" == "PASS" + assert iridanext_metadata.salmonella_GCA_000008105."SISTR Serovar cgMLST" == "Choleraesuis" } } @@ -257,6 +521,7 @@ nextflow_pipeline { skip_staramr = false skip_subtyping = false + skip_allele_calling = true max_memory = "2.GB" max_cpus = 1 @@ -267,11 +532,11 @@ nextflow_pipeline { assert workflow.success assert path("$launchDir/results").exists() - assert path("$launchDir/results/subtyping/lissero/listeria_GCF_000196035.tsv").exists() + assert path("$launchDir/results/Assembly/Subtyping/Lissero/listeria_GCF_000196035.lissero.subtyping.tsv").exists() - assert path("$launchDir/results/annotations/StarAMR/listeria_GCF_000196035/plasmidfinder.tsv").exists() - assert path("$launchDir/results/annotations/StarAMR/listeria_GCF_000196035/resfinder.tsv").exists() - assert 
path("$launchDir/results/annotations/StarAMR/listeria_GCF_000196035/detailed_summary.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/listeria_GCF_000196035/listeria_GCF_000196035.plasmidfinder.staramr.annotation.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/listeria_GCF_000196035/listeria_GCF_000196035.resfinder.staramr.annotation.tsv").exists() + assert path("$launchDir/results/Assembly/Annotation/StarAMR/listeria_GCF_000196035/listeria_GCF_000196035.detailed_summary.staramr.annotation.tsv").exists() // compare IRIDA Next JSON output assert path("$launchDir/results/iridanext.output.json").exists() @@ -279,79 +544,115 @@ nextflow_pipeline { def iridanext_global = iridanext_json.files.global def iridanext_samples = iridanext_json.files.samples def iridanext_metadata = iridanext_json.metadata.samples - - assert iridanext_metadata.listeria_GCF_000196035."meta.assembly" == true - assert iridanext_metadata.listeria_GCF_000196035."meta.hybrid" == false - assert iridanext_metadata.listeria_GCF_000196035."meta.single_end" == false + def listeria_metadata = iridanext_metadata.listeria_GCF_000196035 + + assert listeria_metadata."QC Status" == "FAILED" + assert !listeria_metadata.containsKey("Checkm Status") + assert !listeria_metadata.containsKey("Checkm Value") + assert !listeria_metadata.containsKey("Average Coverage Status") + assert !listeria_metadata.containsKey("Average Coverage Value") + assert listeria_metadata."n50 Status" == "PASSED" + assert listeria_metadata."n50 Value" == 2944528 + assert !listeria_metadata.containsKey("Raw Average Quality Status") + assert !listeria_metadata.containsKey("Raw Average Quality Value") + assert listeria_metadata."Length Status" == "PASSED" + assert listeria_metadata."Length Value" == 2944528 + assert listeria_metadata."nr contigs Status" == "PASSED" + assert listeria_metadata."nr contigs Value" == 1 + assert listeria_metadata."QC Summary" == "[FAILED] Sample is likely 
contaminated, REISOLATION AND RESEQUENCING RECOMMENDED\nPassed Tests: 3/4\nSpecies ID: s__Listeria monocytogenes" + + assert listeria_metadata."Downsampled" == false // parse final report tsv file def final_report_tmap = [:] - def final_report_tsv = path("$launchDir/results/SummaryReport/final_report.tsv").readLines() + def final_report_tsv = path("$launchDir/results/FinalReports/Aggregated/Tables/final_report.tsv").readLines() final_report_tsv.each { def tokens = it.split("\t") final_report_tmap[tokens[0]] = tokens[1] } // output files - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.json" }.size() == 1 - assert iridanext_global.findAll { it.path == "SummaryReport/final_report.tsv" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.listeria_GCF_000196035.findAll { it.path == "Assembly/Quality/QUAST/listeria_GCF_000196035/listeria_GCF_000196035.transposed_listeria_GCF_000196035.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.listeria_GCF_000196035.findAll { it.path == "Assembly/Quality/SeqKitStats/listeria_GCF_000196035.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.listeria_GCF_000196035.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/listeria_GCF_000196035.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.listeria_GCF_000196035.findAll { it.path == "Assembly/Subtyping/Lissero/listeria_GCF_000196035.lissero.subtyping.tsv" }.size() == 1 + assert iridanext_samples.listeria_GCF_000196035.findAll { it.path == "FinalReports/FlattenedReports/listeria_GCF_000196035.flat_sample.json.gz" }.size() == 1 // parse final report json file - def final_report = path("$launchDir/results/SummaryReport/final_report.json").json + def final_report = 
path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json def listeria_json = final_report.listeria_GCF_000196035.listeria_GCF_000196035 + assert final_report.listeria_GCF_000196035.QualityAnalysis.n50_value.value == 2944528 + assert final_report.listeria_GCF_000196035.QualityAnalysis.n50_value.qc_status == "PASSED" + assert final_report.listeria_GCF_000196035.QualityAnalysis.nr_contigs.qc_status == "PASSED" + assert final_report.listeria_GCF_000196035.QualityAnalysis.nr_contigs.value == 1 + assert final_report.listeria_GCF_000196035.QualityAnalysis.length.value == 2944528 + assert final_report.listeria_GCF_000196035.QualityAnalysis.length.qc_status == "PASSED" + // Tests assert listeria_json.SpeciesTopHit == "s__Listeria monocytogenes" - assert iridanext_metadata.listeria_GCF_000196035."SpeciesTopHit" == "s__Listeria monocytogenes" + assert iridanext_metadata.listeria_GCF_000196035."Species" == "s__Listeria monocytogenes" assert final_report_tmap.SpeciesTopHit == "s__Listeria monocytogenes" assert listeria_json.QUAST."0"."Total length" == "2944528" - assert iridanext_metadata.listeria_GCF_000196035."QUAST.0.Total length" == "2944528" assert final_report_tmap."QUAST.0.Total length" == "2944528" assert listeria_json.QUAST."0"."Largest contig" == "2944528" - assert iridanext_metadata.listeria_GCF_000196035."QUAST.0.Largest contig" == "2944528" assert final_report_tmap."QUAST.0.Largest contig" == "2944528" assert listeria_json.QUAST."0"."# contigs" == "1" - assert iridanext_metadata.listeria_GCF_000196035."QUAST.0.# contigs" == "1" assert final_report_tmap."QUAST.0.# contigs" == "1" assert listeria_json.QUAST."0"."N50" == "2944528" - assert iridanext_metadata.listeria_GCF_000196035."QUAST.0.N50" == "2944528" assert final_report_tmap."QUAST.0.N50" == "2944528" assert listeria_json.StarAMR."0".Genotype == "fosX" - assert iridanext_metadata.listeria_GCF_000196035."StarAMR.0.Genotype" == "fosX" + assert 
iridanext_metadata.listeria_GCF_000196035."StarAMR Genotype" == "fosX" assert final_report_tmap."StarAMR.0.Genotype" == "fosX" assert listeria_json.StarAMR."0"."Predicted Phenotype" == "fosfomycin" - assert iridanext_metadata.listeria_GCF_000196035."StarAMR.0.Predicted Phenotype" == "fosfomycin" + assert iridanext_metadata.listeria_GCF_000196035."StarAMR Predicted Phenotype" == "fosfomycin" assert final_report_tmap."StarAMR.0.Predicted Phenotype" == "fosfomycin" + assert listeria_json.StarAMR."0"."CGE Predicted Phenotype" == "Fosfomycin" + assert iridanext_metadata.listeria_GCF_000196035."StarAMR CGE Predicted Phenotype" == "Fosfomycin" + assert final_report_tmap."StarAMR.0.CGE Predicted Phenotype" == "Fosfomycin" + assert listeria_json.StarAMR."0".Plasmid == "None" - assert iridanext_metadata.listeria_GCF_000196035."StarAMR.0.Plasmid" == "None" + assert iridanext_metadata.listeria_GCF_000196035."StarAMR Plasmid" == "None" assert final_report_tmap."StarAMR.0.Plasmid" == "None" - assert listeria_json.StarAMR."0".Scheme == "lmonocytogenes" - assert iridanext_metadata.listeria_GCF_000196035."StarAMR.0.Scheme" == "lmonocytogenes" - assert final_report_tmap."StarAMR.0.Scheme" == "lmonocytogenes" + assert listeria_json.StarAMR."0".Scheme == "listeria_2" + assert final_report_tmap."StarAMR.0.Scheme" == "listeria_2" assert listeria_json.StarAMR."0"."Sequence Type" == "35" - assert iridanext_metadata.listeria_GCF_000196035."StarAMR.0.Sequence Type" == "35" assert final_report_tmap."StarAMR.0.Sequence Type" == "35" assert listeria_json.LISSEROSubtyping."0".SEROTYPE == "1/2c, 3c" - assert iridanext_metadata.listeria_GCF_000196035."LISSEROSubtyping.0.SEROTYPE" == "1/2c, 3c" + assert iridanext_metadata.listeria_GCF_000196035."LISSERO Serotype" == "1/2c, 3c" assert final_report_tmap."LISSEROSubtyping.0.SEROTYPE" == "1/2c, 3c" assert listeria_json.LISSEROSubtyping."0".PRS == "FULL" - assert iridanext_metadata.listeria_GCF_000196035."LISSEROSubtyping.0.PRS" == "FULL" assert 
final_report_tmap."LISSEROSubtyping.0.PRS" == "FULL" assert listeria_json.LISSEROSubtyping."0".ORF2110 == "NONE" - assert iridanext_metadata.listeria_GCF_000196035."LISSEROSubtyping.0.ORF2110" == "NONE" assert final_report_tmap."LISSEROSubtyping.0.ORF2110" == "NONE" + + assert iridanext_metadata.listeria_GCF_000196035."GC (%)" == "37.98" + assert !iridanext_metadata.listeria_GCF_000196035.containsKey("Mean Sequence Length Forward") + assert !iridanext_metadata.listeria_GCF_000196035.containsKey("BaseCount") + + assert iridanext_metadata.listeria_GCF_000196035."dat" == "1" + assert iridanext_metadata.listeria_GCF_000196035."ldh" == "4" + assert iridanext_metadata.listeria_GCF_000196035."cat" == "6" + assert iridanext_metadata.listeria_GCF_000196035."abcZ" == "6" + assert iridanext_metadata.listeria_GCF_000196035."bglA" == "5" + assert iridanext_metadata.listeria_GCF_000196035."dapE" == "20" + assert iridanext_metadata.listeria_GCF_000196035."lhkA" == "1" + assert iridanext_metadata.listeria_GCF_000196035."7 Gene ST" == "35" + assert iridanext_metadata.listeria_GCF_000196035."7 Gene Scheme" == "lmonocytogenes" } } diff --git a/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test b/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test index ea3136dd..03760b45 100644 --- a/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test +++ b/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test @@ -48,9 +48,9 @@ nextflow_workflow { assert contains([["id": "SAMPLE2"], ["report_tag": "MaxContigToShort"], false]) } - assert !file("$launchDir/results/assembly/quality/quast/SAMPLE1/SAMPLE1.tsv").exists() + assert !file("$launchDir/results/Assembly/Quality/QUAST/SAMPLE1/SAMPLE1.quast.quality.tsv").exists() - with(path("$launchDir/results/assembly/quality/quast/SAMPLE2/SAMPLE2.tsv").readLines()) { + with(path("$launchDir/results/Assembly/Quality/QUAST/SAMPLE2/SAMPLE2.quast.quality.tsv").readLines()) { assert contains("Assembly\tsample2") assert contains("# 
contigs (>= 0 bp)\t3") assert contains("Total length (>= 0 bp)\t60") @@ -104,8 +104,8 @@ nextflow_workflow { assert contains([["id": "SAMPLE2"], ["report_tag": "MaxContigToShort"], true]) } - assert !file("$launchDir/results/assembly/quality/quast/SAMPLE1/SAMPLE1.tsv").exists() - assert !file("$launchDir/results/assembly/quality/quast/SAMPLE2/SAMPLE2.tsv").exists() + assert !file("$launchDir/results/Assembly/Quality/QUAST/SAMPLE1/SAMPLE1.quast.quality.tsv").exists() + assert !file("$launchDir/results/Assembly/Quality/QUAST/SAMPLE2/SAMPLE2.quast.quality.tsv").exists() } } @@ -150,14 +150,14 @@ nextflow_workflow { assert contains([["id": "SAMPLE2"], ["report_tag": "MaxContigToShort"], false]) } - with(path("$launchDir/results/assembly/quality/quast/SAMPLE1/SAMPLE1.tsv").readLines()) { + with(path("$launchDir/results/Assembly/Quality/QUAST/SAMPLE1/SAMPLE1.quast.quality.tsv").readLines()) { assert contains("Assembly\tsample1") assert contains("# contigs (>= 0 bp)\t4") assert contains("Total length (>= 0 bp)\t10") assert contains("N50\t4") } - with(path("$launchDir/results/assembly/quality/quast/SAMPLE2/SAMPLE2.tsv").readLines()) { + with(path("$launchDir/results/Assembly/Quality/QUAST/SAMPLE2/SAMPLE2.quast.quality.tsv").readLines()) { assert contains("Assembly\tsample2") assert contains("# contigs (>= 0 bp)\t3") assert contains("Total length (>= 0 bp)\t60") diff --git a/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test.snap b/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test.snap index b6e4e7a8..878ef970 100644 --- a/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test.snap +++ b/tests/subworkflows/local/qc_assemblies/qc_assemblies.nf.test.snap @@ -29,7 +29,7 @@ "singularity": "https://depot.galaxyproject.org/singularity/seqkit:2.2.0--h9ee0642_0", "docker": "quay.io/biocontainers/seqkit:2.2.0--h9ee0642_0", "report_ext": ".tsv", - "fasta_ext": "_filtered.fasta.gz", + "fasta_ext": ".filtered.fasta.gz", "filter_field": "max_len", 
"report_tag": "Seqkit_stats", "header_p": true @@ -74,7 +74,7 @@ "singularity": "https://depot.galaxyproject.org/singularity/seqkit:2.2.0--h9ee0642_0", "docker": "quay.io/biocontainers/seqkit:2.2.0--h9ee0642_0", "report_ext": ".tsv", - "fasta_ext": "_filtered.fasta.gz", + "fasta_ext": ".filtered.fasta.gz", "filter_field": "max_len", "report_tag": "Seqkit_stats", "header_p": true @@ -86,7 +86,8 @@ "versions.yml:md5,7531e2c7e3f82f40f432af6b47f2879b", "versions.yml:md5,af079a3d8ae9ebd777f169eb697dcfd1", "versions.yml:md5,af079a3d8ae9ebd777f169eb697dcfd1", - "versions.yml:md5,b08afc3129cf0e1dddee612ed2eaceb8" + "versions.yml:md5,b08afc3129cf0e1dddee612ed2eaceb8", + "versions.yml:md5,fa9af6c042ba90adf4edabce42cbb6e5" ], "quast_data": [ [ @@ -115,7 +116,7 @@ "singularity": "https://depot.galaxyproject.org/singularity/seqkit:2.2.0--h9ee0642_0", "docker": "quay.io/biocontainers/seqkit:2.2.0--h9ee0642_0", "report_ext": ".tsv", - "fasta_ext": "_filtered.fasta.gz", + "fasta_ext": ".filtered.fasta.gz", "filter_field": "max_len", "report_tag": "Seqkit_stats", "header_p": true @@ -160,7 +161,7 @@ "singularity": "https://depot.galaxyproject.org/singularity/seqkit:2.2.0--h9ee0642_0", "docker": "quay.io/biocontainers/seqkit:2.2.0--h9ee0642_0", "report_ext": ".tsv", - "fasta_ext": "_filtered.fasta.gz", + "fasta_ext": ".filtered.fasta.gz", "filter_field": "max_len", "report_tag": "Seqkit_stats", "header_p": true @@ -172,14 +173,15 @@ "versions.yml:md5,7531e2c7e3f82f40f432af6b47f2879b", "versions.yml:md5,af079a3d8ae9ebd777f169eb697dcfd1", "versions.yml:md5,af079a3d8ae9ebd777f169eb697dcfd1", - "versions.yml:md5,b08afc3129cf0e1dddee612ed2eaceb8" + "versions.yml:md5,b08afc3129cf0e1dddee612ed2eaceb8", + "versions.yml:md5,fa9af6c042ba90adf4edabce42cbb6e5" ] } ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "23.04.1" }, - "timestamp": "2024-02-21T09:36:25.2768567" + "timestamp": "2024-04-18T13:53:27.760811189" } } \ No newline at end of file diff --git 
a/workflows/CleanAssemble.nf b/workflows/CleanAssemble.nf index 73fbf89e..88386f1a 100644 --- a/workflows/CleanAssemble.nf +++ b/workflows/CleanAssemble.nf @@ -60,7 +60,6 @@ workflow CLEAN_ASSEMBLE_READS { // Get quality metrics on Raw Reads (requirement) if(!params.skip_raw_read_metrics){ - raw_quality_info = READ_SCAN(prepped_input.map{ val -> val[0].hybrid ? tuple(val[0], val[1], val[2]) : tuple(val[0], val[1], []) }) diff --git a/workflows/PostAssembly.nf b/workflows/PostAssembly.nf index fc288752..80a28e93 100644 --- a/workflows/PostAssembly.nf +++ b/workflows/PostAssembly.nf @@ -19,6 +19,7 @@ include { HYBRID_ASSEMBLY } from '../subworkflows/local/hybrid_assembly' include { ANNOTATE_GENOMES } from '../subworkflows/local/annotate_genomes.nf' include { SUBTYPE_GENOME } from '../subworkflows/local/subtype_genome.nf' include { SPLIT_METAGENOMIC } from '../subworkflows/local/split_metagenomic.nf' +include { LOCIDEX } from '../subworkflows/local/locidex.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,8 +70,10 @@ workflow POST_ASSEMBLY { } ch_speciation = Channel.empty() + top_hit = channel.empty() if(!params.skip_species_classification){ ch_speciation = DETERMINE_SPECIES(ch_filtered_contigs) + top_hit = ch_speciation.top_hit ch_versions = ch_versions.mix(ch_speciation.versions) ch_reports = ch_reports.mix(ch_speciation.reports) @@ -80,8 +83,7 @@ workflow POST_ASSEMBLY { //if(!params.skip_subtyping && !params.run_kraken && !params.skip_species_classification){ if(!params.skip_subtyping && !params.skip_species_classification){ - //SUBTYPE_GENOME(ch_filtered_contigs, ch_speciation.results) - SUBTYPE_GENOME(ch_filtered_contigs, ch_speciation.top_hit) + SUBTYPE_GENOME(ch_filtered_contigs, top_hit) ch_reports = ch_reports.mix(SUBTYPE_GENOME.out.reports) ch_versions = ch_versions.mix(SUBTYPE_GENOME.out.versions) @@ -91,12 +93,19 @@ workflow POST_ASSEMBLY { log.info "No subtyping of assemblies performed" } - 
ANNOTATE_GENOMES(ch_filtered_contigs, ch_speciation.top_hit) - ch_reports = ch_reports.mix(ANNOTATE_GENOMES.out.reports) - ch_versions = ch_versions.mix(ANNOTATE_GENOMES.out.versions) - + if(!params.skip_allele_calling){ + if (!params.skip_species_classification || params.allele_scheme){ + LOCIDEX(ch_filtered_contigs, top_hit) + ch_versions = LOCIDEX.out.versions + }else{ + log.info "Skipping locidex since there is no '--allele_scheme' set and '--skip_species_classification' is enabled" + } + } + ANNOTATE_GENOMES(ch_filtered_contigs, top_hit) + ch_reports = ch_reports.mix(ANNOTATE_GENOMES.out.reports) + ch_versions = ch_versions.mix(ANNOTATE_GENOMES.out.versions) emit: