diff --git a/CHANGELOG.md b/CHANGELOG.md index e6b8ca8..c332bd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,4 +38,15 @@ specified and MASH distance to RefSeq genomes fails * Now species verification via MASH distance to RefSeq genomes and E.coli specific alleles is done only if `--verify` parameter is specified. * If `--verify` is not specified, all input genomes are treated as E.coli without doing any species verification + +**v2.0.0** +* Updated species identification module now based on GTDB + custom Escherichia and Shigella sketch covering all known bacterial species +* Implemented pathotyping covering 7 DEC *Escherichia coli* pathotypes (`DAEC`, `EAEC`, `EHEC`, `EIEC`, `EPEC`, `ETEC` and `STEC`) supporting simultaneous presence of multiple signatures (e.g. `ETEC/STEC`). Note that `EHEC` is reported as `EHEC-STEC` as this is a more severe subtype of `STEC`. +* Implemented Shiga 1 and 2 toxin typing supporting multiple toxin signatures present in a single sample. + * A total of 4 *stx1* subtypes are supported: `stx1a`, `stx1c`, `stx1d` and `stx1e`. + * A total of 15 *stx2* subtypes are supported: `stx2a`, `stx2b`, `stx2c`, `stx2d`, `stx2e`, `stx2f`, `stx2g`, `stx2h`, `stx2i`, `stx2j`, `stx2k`, `stx2l`, `stx2m`, `stx2n`, `stx2o`. 
+* New database of pathotypes and toxins in a clear, transparent JSON format, composed of the key virulence factors based on both BioNumerics and literature sources +* Support for gzip-compressed inputs (`fastq.gz` and `fasta.gz`), saving storage and increasing versatility +* Additional toxin typing covering enterohemolysin A (`ehxA`), hemolysin E (`hlyE`) and hemolysin A (`hlyA`) + \ No newline at end of file diff --git a/README.md b/README.md index c07f903..fd5ea29 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ The tool provides convenient species identification coupled to quality control m As WGS becomes standard within public health and research laboratories, it is important to harness the high throughput and resolution potential of this technology providing accurate and rapid at scale typing of E.coli both in public health, clinical and research contexts. ## Citation -Bessonov, Kyrylo, Chad Laing, James Robertson, Irene Yong, Kim Ziebell, Victor PJ Gannon, Anil Nichani, Gitanjali Arya, John HE Nash, and Sara Christianson. "ECTyper: in silico Escherichia coli serotype and species prediction from raw and assembled whole-genome sequence data." Microbial genomics 7, no. 12 (2021): 000728. [https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000728](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000728) +If you find `ectyper` useful, please cite the following paper: + +> Bessonov, Kyrylo, Chad Laing, James Robertson, Irene Yong, Kim Ziebell, Victor PJ Gannon, Anil Nichani, Gitanjali Arya, John HE Nash, and Sara Christianson. **"ECTyper: in silico Escherichia coli serotype and species prediction from raw and assembled whole-genome sequence data."** Microbial genomics 7, no. 12 (2021): 000728. 
[https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000728](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000728) ## Contact For any questions, issues or comments please make a Github issue or reach out to [Kyrylo Bessonov](kyrylo.bessonov@phac-aspc.gc.ca). @@ -328,6 +330,7 @@ Some O-antigens display very high degree of homology and are very hard to discer |[Galaxy Europe](https://usegalaxy.eu/root?tool_id=ectyper)| Galaxy public server to execute your analysis from anywhere|Web-based| |[IRIDA plugin](https://github.com/phac-nml/irida-plugin-ectyper)| IRIDA instances could easily install additional pipeline|Web-based| + # Legal and Compliance Information Copyright Government of Canada 2024 diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py index 9dc3ab5..a531a29 100644 --- a/ectyper/commandLineOptions.py +++ b/ectyper/commandLineOptions.py @@ -89,7 +89,7 @@ def checkdbversion(): "--percentIdentityOtype", type=check_percentage, help="Percent identity required for an O antigen allele match [default %(default)s]", - default=90 + default=95 ) parser.add_argument( diff --git a/test/Data/CP041431_STEC316.fasta.gz b/test/Data/CP041431_STEC316.fasta.gz new file mode 100644 index 0000000..63bca7f Binary files /dev/null and b/test/Data/CP041431_STEC316.fasta.gz differ diff --git a/test/Data/SRR7612273.fasta.gz b/test/Data/SRR7612273.fasta.gz new file mode 100644 index 0000000..ee02c26 Binary files /dev/null and b/test/Data/SRR7612273.fasta.gz differ diff --git a/test/Data/SRR7947260.fasta.gz b/test/Data/SRR7947260.fasta.gz new file mode 100644 index 0000000..d9b2f3d Binary files /dev/null and b/test/Data/SRR7947260.fasta.gz differ diff --git a/test/test_complex_inputs.py b/test/test_complex_inputs.py index 6fc4c9b..ca12919 100644 --- a/test/test_complex_inputs.py +++ b/test/test_complex_inputs.py @@ -24,7 +24,7 @@ def set_input(input, :param output: Location of output :return: None """ - print(input) + 
args = ['-i', input, '-c', str(cores), ] diff --git a/test/test_shiga_and_pathotyping.py b/test/test_shiga_and_pathotyping.py new file mode 100644 index 0000000..14ca5b7 --- /dev/null +++ b/test/test_shiga_and_pathotyping.py @@ -0,0 +1,109 @@ +import sys +import pytest +import tempfile +import os +from ectyper import ectyper, definitions +import subprocess +import pandas as pd +import logging +import re + +TEST_ROOT = os.path.dirname(__file__) +LOG=logging.getLogger("TEST") +LOG.setLevel(logging.INFO) + +def set_input(input, + percent_iden=None, + verify=True, + output=tempfile.mkdtemp(), + cores=1, + debug=False, + pathotype = False): + """ + Create the sys.argv[] without need for commandline input. + :param input: Input file given by testing function + :param percent_iden: Percent identity for comparison + :param output: Location of output + :return: None + """ + args = ['-i', input, + '-c', str(cores), + ] + + if percent_iden: + args += ['-d', str(percent_iden)] + if verify: + args += ['--verify'] + if output: + args += ['-o', output] + if debug: + args+=['--debug'] + if pathotype: + args+=['--pathotype'] + + sys.argv[1:] = args + + +def test_single_stx2_subtyping(caplog): + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT,'Data/EscherichiaO28H5.fasta') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, verify=True, debug=False, output=tmpdir, pathotype=True) + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + secondrow = outfp.readlines()[1] + assert "STEC" in secondrow + assert "stx2a" in secondrow + +def test_stx1_stx2_subtyping_pathotyping(caplog): + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT,'Data/Escherichia.fna') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, verify=True, debug=True, output=tmpdir, pathotype=True) + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + secondrow = outfp.readlines()[1] + assert "EHEC" in secondrow + 
assert "stx1a;stx2a" in secondrow + assert "AP010958.1;AP010958.1" in secondrow + + +def test_multi_stx_non_overlap_ranges(caplog): + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT,'Data/CP041431_STEC316.fasta.gz') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, verify=True, debug=False, output=tmpdir, pathotype=True) + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + secondrow = outfp.readlines()[1] + assert "STEC" in secondrow + assert "stx2e" in secondrow + assert "stx2k" in secondrow + +def test_multi_stx_non_overlap_different_contigs(caplog): + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT,'Data/SRR7947260.fasta.gz') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, verify=True, debug=False, output=tmpdir, pathotype=True) + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + secondrow = outfp.readlines()[1] + assert "ETEC/STEC" in secondrow + assert "stx2a" in secondrow + assert "stx2g" in secondrow + assert "contig00064;contig00074" in secondrow + + +def test_multi_stx_overlap_same_contig(caplog): + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT,'Data/SRR7612273.fasta.gz') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, verify=True, debug=True, output=tmpdir, pathotype=True) + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + secondrow = outfp.readlines()[1] + assert "STEC" in secondrow + assert "stx2a" in secondrow + assert "stx2d" in secondrow + assert "contig00078;contig00078" in secondrow +