Merge pull request #94 from phac-nml/v2.0.0

Release V2.0.0
phac-nml · Dec 12, 2024 · 21f2cbd · 21f2cbd
2 parents ed4996e + 33dd165
commit 21f2cbd
Show file tree

Hide file tree

Showing 29 changed files with 8,452 additions and 538 deletions.
diff --git a/.github/workflows/github-actions.yaml b/.github/workflows/github-actions.yaml
@@ -0,0 +1,41 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python application
+
+on:
+  push:
+    branches: [ "master", "v2.0.0" ]
+  pull_request:
+    branches: [ "master", "v2.0.0" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.12"
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install samtools bowtie2 mash bcftools ncbi-blast+ seqtk libcurl4-openssl-dev libssl-dev ca-certificates -y
+        sudo apt-get install python3-pip python3-dev python3-pandas python3-requests  python3-biopython -y
+        python3 -m pip install --upgrade pip setuptools
+        pip3 install pytest
+        if [ -f requirements.txt ]; then 
+          pip3 install -r requirements.txt;
+        else
+          pip3 install -e .
+        fi
+        ectyper_init
+    - name: Test  with pytest
+      run: |
+        pytest -o log_cli=true --basetemp=tmp-pytest
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,4 +38,15 @@ specified and MASH distance to RefSeq genomes fails
   * Now species verification via MASH distance to RefSeq genomes and E.coli specific alleles is done only if `--verify`
 parameter is specified. 
   * If `--verify` is not specified, all input genomes are treated as E.coli without doing any species verification
+
+**v2.0.0**
+* Updated species identification module now based on GTDB + custom Escherichia and Shigella sketch covering all known bacterial species
+* Implemented pathotyping covering 7 DEC *Escherichia coli* pathotypes (`DAEC`, `EAEC`, `EHEC`, `EIEC`, `EPEC`, `ETEC` and `STEC`) supporting simultaneous presence of multiple signatures (e.g. `ETEC/STEC`). Note that `EHEC` is reported as `EHEC-STEC` as this is a more severe subtype of `STEC`. 
+* Implemented Shiga 1 and 2 toxin typing supporting multiple toxin signatures present in a single sample.
+  * A total of 4 *stx1* subtypes are supported: `stx1a`, `stx1c`, `stx1d` and `stx1e`.
+  * A total of 15 *stx2* subtypes are supported: `stx2a`, `stx2b`, `stx2c`, `stx2d`, `stx2e`, `stx2f`, `stx2g`, `stx2h`, `stx2i`, `stx2j`, `stx2k`, `stx2l`, `stx2m`, `stx2n`, `stx2o`.
+* New database of pathotypes and toxins in a clear, transparent JSON format, composed of the key virulence factors based on both BioNumerics and literature sources
+* Support for gzip-compressed inputs (`fastq.gz` and `fasta.gz`), saving storage and increasing versatility
+* Other toxin typing covering enterohemolysin A (`ehxA`), hemolysin E (`hlyE`) and hemolysin A (`hlyA`)
+
 
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,13 @@
+FROM ubuntu:22.04
+ENV DEBIAN_FRONTEND="noninteractive" TZ="America/New_York"
+RUN apt update && apt install git python3-pip -y
+RUN apt install libcurl4-openssl-dev libssl-dev -y
+RUN pip3 install Cython numpy
+RUN apt install mash ncbi-blast+  bowtie2 seqtk samtools bcftools -y
+RUN git clone https://github.com/phac-nml/ecoli_serotyping.git
+# install the tool and initialize its species ID MASH database
+RUN cd ecoli_serotyping && git checkout v2.0.0 && pip3 install .
+RUN ectyper_init
+
+#build image:  docker build --tag  ectyper:2.0.0 .
+#type a sample: docker run -it --rm -v $PWD:/mnt ectyper:2.0.0 ectyper -i /mnt/assembly.fasta -o /mnt/temp/ --pathotype
diff --git a/README.md b/README.md
diff --git a/Singularity.def b/Singularity.def
@@ -0,0 +1,16 @@
+Bootstrap: docker
+From: ubuntu:22.04
+
+%environment
+    DEBIAN_FRONTEND="noninteractive" TZ="America/New_York"
+
+%post
+    apt update && apt install git python3-pip -y
+    apt install libcurl4-openssl-dev libssl-dev -y
+    pip3 install Cython numpy
+    apt install mash ncbi-blast+  bowtie2 seqtk samtools bcftools -y
+    git clone https://github.com/phac-nml/ecoli_serotyping.git
+    cd ecoli_serotyping && git checkout v2.0.0 && pip3 install .
+    ectyper_init
+# To build an image run the following. Might use --remote flag if no sudo/admin priv.
+# singularity build ectyper_v2.0.0_22032024.sif Singularity.def
diff --git a/ectyper/Data/ectyper_patho_stx_toxin_typing_database.json b/ectyper/Data/ectyper_patho_stx_toxin_typing_database.json
diff --git a/ectyper/__init__.py b/ectyper/__init__.py
@@ -1 +1 @@
-__version__ = "1.0.0"
+__version__ = "2.0.0"
diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py
@@ -40,7 +40,7 @@ def checkdbversion():
     dbversion = checkdbversion()
 
     parser = argparse.ArgumentParser(
-        description='ectyper v{} database v{} Prediction of Escherichia coli serotype from '
+        description='ectyper v{} antigen database v{}. Prediction of Escherichia coli serotype, pathotype and shiga toxin tying from '
                     'raw reads'
                     ' or assembled genome sequences. The default settings are recommended.'.format(__version__, dbversion)
     )
@@ -57,9 +57,25 @@ def checkdbversion():
         "--input",
         help="Location of E. coli genome file(s). Can be a single file, a \
             comma-separated list of files, or a directory",
-        required=True
+        required=True,
+        nargs="+"
     )
 
+    parser.add_argument(
+        "--longreads",
+        action="store_true",
+        default=False,
+        help="Enable for raw long reads FASTQ inputs (ONT, PacBio, other sequencing platforms). [default %(default)s]"
+    )
+
+    parser.add_argument(
+        "--maxdirdepth",
+        help="Maximum number of directories to descend when searching an input directory of files [default %(default)s levels]. Only works on path inputs not containing '*' wildcard",
+        default=0, 
+        type=int,   
+        required=False
+    )
+
     parser.add_argument(
         "-c",
         "--cores",
@@ -73,7 +89,7 @@ def checkdbversion():
         "--percentIdentityOtype",
         type=check_percentage,
         help="Percent identity required for an O antigen allele match [default %(default)s]",
-        default=90
+        default=95
     )
 
     parser.add_argument(
@@ -88,15 +104,15 @@ def checkdbversion():
         "-opcov",
         "--percentCoverageOtype",
         type=check_percentage,
-        help="Minumum percent coverage required for an O antigen allele match [default %(default)s]",
+        help="Minimum percent coverage required for an O antigen allele match [default %(default)s]",
         default=90
     )
 
     parser.add_argument(
         "-hpcov",
         "--percentCoverageHtype",
         type=check_percentage,
-        help="Minumum percent coverage required for an H antigen allele match [default %(default)s]",
+        help="Minimum percent coverage required for an H antigen allele match [default %(default)s]",
         default=50
     )
 
@@ -114,14 +130,13 @@ def checkdbversion():
 
     parser.add_argument(
         "-r",
-        "--refseq",
-        help="Location of pre-computed MASH RefSeq sketch. If provided, "
+        "--reference",
+        default=definitions.SPECIES_ID_SKETCH,
+        help="Location of pre-computed MASH sketch for species identification. If provided, "
              "genomes "
              "identified as non-E. coli will have their species identified "
              "using "
-             "MASH. For best results the pre-sketched RefSeq archive "
-             "https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh "
-             "is recommended"
+             "MASH dist"
     )
 
     parser.add_argument(
@@ -140,7 +155,29 @@ def checkdbversion():
 
     parser.add_argument(
         "--dbpath",
-        help="Path to a custom database of O and H antigen alleles in JSON format.\nCheck Data/ectyper_database.json for more information"
+        help="Path to a custom database of O and H antigen alleles in JSON format.\n"
+    )
+
+    parser.add_argument(
+        "--pathotype",
+        action="store_true",
+        help="Predict E.coli pathotype and Shiga toxin subtype(s) if present\n"
+    )
+
+    parser.add_argument(
+        "-pathpid",
+        "--percentIdentityPathotype",
+        type=check_percentage,
+        help="Minimum percent identity required for a pathotype reference allele match [default: %(default)s]",
+        default=90
+    )
+
+    parser.add_argument(
+        "-pathpcov",
+        "--percentCoveragePathotype",
+        type=check_percentage,
+        help="Minimum percent coverage required for a pathotype reference allele match [default: %(default)s]",
+        default=50
     )
 
     if args is None:

diff --git a/ectyper/definitions.py b/ectyper/definitions.py
@@ -11,8 +11,10 @@
 WORKPLACE_DIR = os.getcwd()
 
 SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_alleles_db.json')
+PATHOTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_patho_stx_toxin_typing_database.json')
+SPECIES_ID_SKETCH = os.path.join(DATA_DIR, 'EnteroRef_GTDBSketch_20231003_V2.msh')
 #ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta')
-REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')
+#REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')
 OSEROTYPE_GROUPS_DICT = {'1': ['O20','O137'],
                          '2': ['O28','O42'],
                          '3': ['O118','O151'],
@@ -30,12 +32,20 @@
                          '15':['O89','O101','O162'],
                          '16':['O169','O183']
                          }
-MASH_URLS = ["https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh",
-             "https://share.corefacility.ca/index.php/s/KDhSNQfhE6npIyo/download",
-             "https://gitlab.com/kbessonov/ectyper/raw/master/ectyper/Data/refseq.genomes.k21s1000.msh"]
-assembly_summary_refseq_url_dict = {"assembly_summary_refseq.txt":
-                                    "http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt"
-                                    }
+MASH_URLS = ["https://zenodo.org/records/13969103/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"]
+
 HIGH_SIMILARITY_THRESHOLD_O = 0.00771 # alleles that are 99.23% apart will be reported as mixed call ~ 8 nt difference on average
 MIN_O_IDENTITY_LS = 95 #low similarity group O antigen min identity threshold to pre-filter BLAST output  (identical to global threshold)
-MIN_O_COVERAGE_LS = 48 #low similarity group O antigen min coverage threshold to pre-filter BLAST output (based on cross-talk study results)
+MIN_O_COVERAGE_LS = 48 #low similarity group O antigen min coverage threshold to pre-filter BLAST output (based on cross-talk study results)
+PATHOTYPE_TOXIN_FIELDS = ['pathotype', 'pathotype_count', 'pathotype_genes', 'pathotype_gene_names',  'pathotype_accessions', 'pathotype_allele_id', 
+                   'pathotype_pident', 'pathotype_pcov','pathotype_length_ratio', 'pathotype_rule_ids', 'pathotype_gene_counts', 'pathotype_database',
+                   'stx_genes', 'stx_accessions', 'stx_allele_ids', 'stx_genes_full_name', 'stx_pidents', 'stx_pcovs', 'stx_gene_lengths', 'stx_contigs', 'stx_gene_ranges']
+OUTPUT_TSV_HEADER = ['Name','Species', 'SpeciesMashRatio', 'SpeciesMashDistance','SpeciesMashTopID','O-type','H-type','Serotype','QC',
+             'Evidence','GeneScores','AlleleKeys','GeneIdentities(%)',
+             'GeneCoverages(%)','GeneContigNames','GeneRanges',
+             'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', 
+             'PathotypeIdentities(%)','PathotypeCoverages(%)','PathotypeGeneLengthRatios','PathotypeRuleIDs', 'PathotypeGeneCounts', 'PathoDBVer',
+             'StxSubtypes','StxAccessions','StxAlleleIDs','StxAlleleNames', 'StxIdentities(%)','StxCoverages(%)','StxLengths',
+             'StxContigNames','StxCoordinates']
+OUTPUT_FILES_LIST = ['blastn_output_alleles.txt', 'blastn_pathotype_alleles_overall.txt', 'mash_output.txt', 
+                     'stx1_allhits_annotated_df.txt', 'stx2_allhits_annotated_df.txt']