added new species ID fields that provide info on species hashes to re…

…f and top reference accession id
phac-nml · Oct 28, 2024 · b2c7c5c · b2c7c5c
1 parent 7481376
commit b2c7c5c
Show file tree

Hide file tree

Showing 10 changed files with 158 additions and 93 deletions.
diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ Docker and Singularity images are also available from [https://biocontainers.pro
 
 ### Databases
 ECTyper uses multiple databases 
-  - the species identification database is available from [https://zenodo.org/records/10211569](https://zenodo.org/records/10211569)
+  - the species identification database is available from [Zenodo](https://doi.org/10.5281/zenodo.10211568) repository
   - the O and H antigen allele sequences are stored in [ectyper_alleles_db.json](ectyper/Data/ectyper_alleles_db.json)
   - the toxin and pathotype signature marker sequences are stored in [ectyper_patho_stx_toxin_typing_database.json](ectyper/Data/ectyper_patho_stx_toxin_typing_database.json)
 

diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py
@@ -2,7 +2,7 @@
 
 import argparse
 from ectyper import __version__
-import json, os
+import json
 import ectyper.definitions as definitions
 
 def parse_command_line(args=None):

diff --git a/ectyper/definitions.py b/ectyper/definitions.py
@@ -32,15 +32,15 @@
                          '15':['O89','O101','O162'],
                          '16':['O169','O183']
                          }
-MASH_URLS = ["https://zenodo.org/records/10211569/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"]
+MASH_URLS = ["https://zenodo.org/records/13969103/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"]
 
 HIGH_SIMILARITY_THRESHOLD_O = 0.00771 # alleles that are 99.23% apart will be reported as mixed call ~ 8 nt difference on average
 MIN_O_IDENTITY_LS = 95 #low similarity group O antigen min identity threshold to pre-filter BLAST output  (identical to global threshold)
 MIN_O_COVERAGE_LS = 48 #low similarity group O antigen min coverage threshold to pre-filter BLAST output (based on cross-talk study results)
 PATHOTYPE_TOXIN_FIELDS = ['pathotype', 'pathotype_count', 'pathotype_genes', 'pathotype_gene_names',  'pathotype_accessions', 'pathotype_allele_id', 
                    'pathotype_pident', 'pathotype_pcov','pathotype_length_ratio', 'pathotype_rule_ids', 'pathotype_gene_counts', 'pathotype_database',
                    'stx_genes', 'stx_accessions', 'stx_allele_ids', 'stx_genes_full_name', 'stx_pidents', 'stx_pcovs', 'stx_gene_lengths', 'stx_contigs', 'stx_gene_ranges']
-OUTPUT_TSV_HEADER = ['Name','Species','O-type','H-type','Serotype','QC',
+OUTPUT_TSV_HEADER = ['Name','Species', 'SpeciesMashRatio', 'SpeciesMashDistance','SpeciesMashTopID','O-type','H-type','Serotype','QC',
              'Evidence','GeneScores','AlleleKeys','GeneIdentities(%)',
              'GeneCoverages(%)','GeneContigNames','GeneRanges',
              'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', 

diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
@@ -348,15 +348,24 @@ def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir, ectyperdb_d
     with Pool(processes=args.cores) as pool:
         results = pool.map(gp, genome_groups)
 
-        # merge the per-database predictions with the final predictions dict
+        # merge the database predictions with the final predictions dict
         for r in results:
             predictions_dict = {**r, **predictions_dict}
+
     for genome_name in predictions_dict.keys():
+        predictions_dict[genome_name]["species"] = "-"
+        predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = "-"
+        predictions_dict[genome_name]["species_mash_dist2ref"] = "-"
+        predictions_dict[genome_name]["species_mash_top_reference"] = "-"
         try:
             predictions_dict[genome_name]["species"] = genome_files_dict[genome_name]["species"]
+            predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = genome_files_dict[genome_name]["species_mash_hash_ratio2ref"]
+            predictions_dict[genome_name]["species_mash_dist2ref"] = genome_files_dict[genome_name]["species_mash_dist2ref"]
+            predictions_dict[genome_name]["species_mash_top_reference"] = genome_files_dict[genome_name]["species_mash_top_reference"]
             predictions_dict[genome_name]["error"] = genome_files_dict[genome_name]["error"]
         except KeyError as e:
             predictions_dict[genome_name]["error"] = "Error: "+str(e)+" in "+genome_name
+            LOG.error(f"Failed on {genome_name} sample that does not exist in the 'genome_files_dict' dictionary with the {e} error")
 
     return predictions_dict
 

diff --git a/ectyper/init.py b/ectyper/init.py
@@ -1,11 +1,23 @@
+import argparse
 from ectyper import (definitions, speciesIdentification)
 import logging
 logging.basicConfig(level=logging.DEBUG)
 
 
 
 
+
 def main():
     logging.info("Initializing Species ID database ...")
-    speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH)
+    parser = argparse.ArgumentParser(
+        description='Species ID database ectyper initializer'
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Force species ID database initialization"
+    )
+    args = parser.parse_args()
+
+    speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH, args.force)
     logging.info("Done")
diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py
@@ -757,6 +757,9 @@ def report_result(final_dict, output_dir, output_file, args):
     for sample in final_dict.keys():
         output_line = [sample] #name of a query sample/genome
         output_line.append(final_dict[sample]["species"]) #add species info
+        output_line.append(final_dict[sample]["species_mash_hash_ratio2ref"]) #add species top hit mash hash ratios
+        output_line.append(final_dict[sample]["species_mash_dist2ref"]) #add species mash distance
+        output_line.append(final_dict[sample]["species_mash_top_reference"]) #add species mash top hit id
 
         if "O" in final_dict[sample].keys():
             Otype=final_dict[sample]["O"]["serogroup"]
@@ -876,6 +879,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
     :param predictions_data_frame: the Dict containing the ectyper predictions
     :return: modified prediction file
     """
+    print(other_dict)
 
     # genome names are given without the filename extension
     for g in all_genomes_list:
@@ -886,17 +890,26 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
             if gname in other_dict:
                 predictions_dict[gname] = {
                     'error': other_dict[gname]["error"],
-                    'species': other_dict[gname]["species"]
+                    'species': other_dict[gname]["species"],
+                    'species_mash_hash_ratio2ref': other_dict[gname]["species_mash_hash_ratio2ref"],
+                    'species_mash_dist2ref': other_dict[gname]["species_mash_dist2ref"],
+                    'species_mash_top_reference' : other_dict[gname]["species_mash_top_reference"]
                 }
             elif gname in filesnotfound_dict:
                 predictions_dict[gname] = {
                 'error': filesnotfound_dict[gname]["error"],
-                'species': '-'
+                'species': '-',
+                'species_mash_hash_ratio2ref':'-',
+                'species_mash_dist2ref': '-',
+                'species_mash_top_reference' : '-'
             }
             else:
                 predictions_dict[gname] = {
                     'error':  f"No O and H antigen determinant E.coli genes were found in {gname}",
-                    'species': ecoli_dict[gname]["species"]
+                    'species': ecoli_dict[gname]["species"],
+                    'species_mash_hash_ratio2ref': ecoli_dict[gname]["species_mash_hash_ratio2ref"],
+                    'species_mash_dist2ref': ecoli_dict[gname]["species_mash_dist2ref"],
+                    'species_mash_top_reference' : ecoli_dict[gname]["species_mash_top_reference"]
                 }
 
     return predictions_dict
diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py
@@ -42,7 +42,7 @@ def setLockFile(lockfilepath):
             time.sleep(60)  # recheck every 1 min if lock file was removed
         LOG.info("Lock file doest not exist or is removed by other process. Continue with databases download ...")
 
-def get_species_mash(targetpath):
+def get_species_mash(targetpath, force_download=False):
     """
     Get MASH sketch of genomes for species identification and check that the most recent version is installed
     :return returns boolean value depending on success of failure to download the MASH sketch
@@ -54,12 +54,12 @@ def get_species_mash(targetpath):
         os.remove(lockfilepath)
 
 
-    if bool_downloadMashSketch(targetpath):
+    if bool_downloadMashSketch(targetpath) or force_download == True:
         LOG.info("MASH species id sketch is missing and needs to be downloaded ...")
         setLockFile(lockfilepath)
         for url in definitions.MASH_URLS:
             try:
-                if os.path.exists(targetpath) == False:
+                if os.path.exists(targetpath) == False or force_download == True:
                     LOG.info("Downloading ~900MB from {}.".format(url))
                     response = requests.get(url,timeout=10, verify=False)
                     response.raise_for_status()
@@ -84,7 +84,7 @@ def get_species_mash(targetpath):
         return False #if all mirrors failed
 
     else:
-        LOG.info("MASH species id sketch is in good health and does not need to be downloaded".format(
+        LOG.info("MASH species id sketch at {} exists and is in good health and does not need to be downloaded".format(
             targetpath
         ))
         return True
@@ -209,7 +209,6 @@ def get_species(file, args, cores=1):
         LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file,
                                                                         [(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0]))
 
-
     top_hit_line_elements = top_hit_line.split()
 
     if len(top_hit_line_elements) < 5:
@@ -231,7 +230,7 @@ def get_species(file, args, cores=1):
         else:
             LOG.warning(f"Could not determine species based on MASH distance for {file}")
             species = "-" 
-    return species    
+    return species, top_match_hashratio, top_match_dist, top_match   
 
 
 def getSampleName(file):
@@ -286,31 +285,40 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args)
         #do species always regardless of --verify param. Do prediction on fastq files if available for better accuracy
         if fasta_fastq_files_dict[fasta]:
             fastq_file = fasta_fastq_files_dict[fasta]
-            speciesname = get_species(fastq_file, args, args.cores)
+            speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession   = get_species(fastq_file, args, args.cores)
         else:
-            speciesname = get_species(fasta, args, args.cores)
+            speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession  = get_species(fasta, args, args.cores)
 
         if args.verify:
-            failverifyerrormessage += "Sample identified as " + speciesname + ": serotyping results are only valid for E.coli samples." \
-                                                                             "If sure that sample is E.coli run without --verify parameter."
+            failverifyerrormessage += "Sample identified as " + speciesname + ": typing results are only valid for E.coli samples." \
+                                                                             "If sure that sample is E.coli or want results regardless try running without the --verify parameter."
             if re.match("Escherichia coli", speciesname):
-                ecoli_files_dict[sampleName] = {"species":speciesname,
+                ecoli_files_dict[sampleName] = {"species":speciesname, "species_mash_hash_ratio2ref":species_mash_hash_ratio, 
+                                                "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, 
                                                 "filepath":fasta, "error": ""}
             elif is_escherichia_genus(speciesname):
-                other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":failverifyerrormessage}
+                other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta, 
+                                                "species_mash_hash_ratio2ref":species_mash_hash_ratio, 
+                                                "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, 
+                                                "error":failverifyerrormessage}
             else:
-                other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta, "error":failverifyerrormessage}
+                other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta, 
+                                                "species_mash_hash_ratio2ref":species_mash_hash_ratio, 
+                                                "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, 
+                                                "error":failverifyerrormessage}
         else:
-            ecoli_files_dict[sampleName] = {"species": speciesname,
-                                            "filepath": fasta, "error": ""}
-
+            ecoli_files_dict[sampleName] = {"species": speciesname,"filepath": fasta, 
+                                            "species_mash_hash_ratio2ref":species_mash_hash_ratio, "species_mash_dist2ref": species_mash_dist, 
+                                            "species_mash_top_reference":species_top_hit_accession, "error": ""}
+
     for bf in ofiles:
         sampleName = getSampleName(bf)
         LOG.warning(f"{sampleName} is non fasta / fastq file. Species identification aborted")
-        other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-"}
+        other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-", 
+                                        "species_mash_hash_ratio2ref":"-", "species_mash_dist2ref":"-", "species_mash_top_reference":"-"}
 
     for file in filesnotfound:
         sampleName = getSampleName(file)
-        filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file)}
+        filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file), }
 
     return ecoli_files_dict, other_files_dict,filesnotfound_dict
diff --git a/test/test_O_serotyping.py b/test/test_O_serotyping.py
@@ -55,9 +55,15 @@ def test_Otyping(caplog):
 
 
     with open(os.path.join(tmpdir,"output.tsv")) as outfp:
-         secondrow = outfp.readlines()[1].split("\t")
-         Otype = secondrow[2]
-         Htype = secondrow[3]
+         lines = outfp.readlines()
+         header = lines[0].split("\t")
+         secondrow = lines[1].split("\t")
+         assert 'O-type' in header, f'O-type column not found in the output.tsv ({header})'
+         assert 'H-type' in header, f'H-type column not found in the output.tsv ({header})'
+         col_number = [idx for idx, i in enumerate(header) if i == "O-type"][0]
+         Otype = secondrow[col_number]
+         col_number = [idx for idx, i in enumerate(header) if i == "H-type"][0]
+         Htype = secondrow[col_number]
 
     assert Otype == "-", "Expected no call but reported O-type:" + Otype
     assert Htype == "H11", "Expected H11 but reported H-type:" + Htype
@@ -102,71 +108,23 @@ def test_mixofspecies(caplog):
 
     with open(os.path.join(tmpdir,"output.tsv")) as outfp:
          rows = outfp.readlines()
-    rows=rows[1:] #remove header line
+    header = rows[0]
+    resultsrows = rows[1:] #remove header line
 
-    serovars=[]; genomenames=[]; QCflag=[]; confidence=[]
-    for row in rows:
+    serotypes=[]; species=[]; QCflag=[]
+
+    for row in resultsrows:
         rowlist = row.split("\t")
-        serovars.append(rowlist[4])
-        genomenames.append(rowlist[1])
-        QCflag.append(rowlist[5])
-        confidence.append(rowlist[6])
+        serotypes.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "Serotype"][0] ])
+        species.append(rowlist[  [idx for idx, i in enumerate(header.split('\t')) if i == "Species"][0]   ])
+        QCflag.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "QC"][0]  ])
 
-    assert serovars == ['-:-', 'O22:H8', '-:-']
+    assert serotypes == ['-:-', 'O22:H8', '-:-']
     expectedspecies_list = ["Campylobacter_D jejuni","Escherichia coli","Salmonella enterica"]
     for i in range(0,3):
-        assert bool(re.match(expectedspecies_list[i], genomenames[i])) == True
+        assert bool(re.match(expectedspecies_list[i], species[i])) == True
     assert QCflag == ["WARNING (WRONG SPECIES)","PASS (REPORTABLE)","WARNING (WRONG SPECIES)"]
 
-
-def test_Ealbertii_1(caplog): #error
-    LOG.info("Starting 1 of 3 test on EnteroBase on sample ESC_HA8355AA_AS: Escherichia albertii O65:H5")
-    caplog.set_level(logging.DEBUG)
-    file = os.path.join(TEST_ROOT,
-                        'Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta')
-    tmpdir = tempfile.mkdtemp()
-    set_input(input=file, cores=4, print_sequence=True,  verify=True, output=tmpdir)
-
-    ectyper.run_program()
-    with open(os.path.join(tmpdir,"output.tsv")) as outfp:
-         rows = outfp.readlines()
-    secondrow=rows[1:][0] #remove header line
-    assert "Escherichia albertii" in secondrow
-    assert "WARNING (WRONG SPECIES)" in secondrow
-
-def test_Ealbertii_2(): #error
-    LOG.info("Starting 2 of 3 test on EnteroBase on sample on ESC_HA8509AA_AS: Escherichia albertii O5:H5")
-
-    file = os.path.join(TEST_ROOT,
-                        'Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta')
-    tmpdir = tempfile.mkdtemp()
-    set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir)
-    ectyper.run_program()
-
-    with open(os.path.join(tmpdir,"output.tsv")) as outfp:
-         rows = outfp.readlines()
-    secondrow=rows[1:][0] #check only second row
-
-    assert "Escherichia albertii" in secondrow
-    assert "WARNING (WRONG SPECIES)" in secondrow
-
-def test_Ealbertii_3(caplog):
-    LOG.info("Starting 3 of 3 test Escherichia albertii O49:NM") #can not type O49 due to poor sequence quality of uncertainty of wet-lab O49 typing
-    caplog.set_level(logging.DEBUG)
-    file = os.path.join(TEST_ROOT,
-                        'Data/Ealbertii_O49NM.fasta')
-
-    tmpdir = tempfile.mkdtemp()
-    set_input(input=file, cores=4, print_sequence=True,  verify=True, output=tmpdir)
-    ectyper.run_program()
-
-    with open(os.path.join(tmpdir ,"output.tsv")) as outfp:
-         rows = outfp.readlines()
-    secondrow=rows[1:][0] #check only second row
-    assert "Escherichia albertii" in secondrow
-    assert "WARNING (WRONG SPECIES)" in secondrow
-
-
 def test_Ecoli_O17H18(caplog):
     caplog.set_level(logging.DEBUG)
     file = os.path.join(TEST_ROOT,
@@ -179,7 +137,8 @@ def test_Ecoli_O17H18(caplog):
     with open(os.path.join(tmpdir,"output.tsv")) as outfp:
          rows = outfp.readlines()
     secondrow=rows[1:][0] #check only second row
-    assert "Escherichia coli\tO17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow
+    assert "Escherichia coli" in secondrow.split('\t')
+    assert "O17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow
 
 def test_download_refseq_mash(caplog, tmpdir):
     caplog.set_level(logging.DEBUG)