Skip to content

Commit

Permalink
added new species ID fields that provide info on species hashes to re…
Browse files Browse the repository at this point in the history
…f and top reference accession id
  • Loading branch information
kbessonov1984 committed Oct 28, 2024
1 parent 7481376 commit b2c7c5c
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 93 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Docker and Singularity images are also available from [https://biocontainers.pro

### Databases
ECTyper uses multiple databases
- the species identification database is available from [https://zenodo.org/records/10211569](https://zenodo.org/records/10211569)
- the species identification database is available from [Zenodo](https://doi.org/10.5281/zenodo.10211568) repository
- the O and H antigen allele sequences are stored in [ectyper_alleles_db.json](ectyper/Data/ectyper_alleles_db.json)
- the toxin and pathotype signature marker sequences are stored in [ectyper_patho_stx_toxin_typing_database.json](ectyper/Data/ectyper_patho_stx_toxin_typing_database.json)

Expand Down
2 changes: 1 addition & 1 deletion ectyper/commandLineOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import argparse
from ectyper import __version__
import json, os
import json
import ectyper.definitions as definitions

def parse_command_line(args=None):
Expand Down
4 changes: 2 additions & 2 deletions ectyper/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@
'15':['O89','O101','O162'],
'16':['O169','O183']
}
MASH_URLS = ["https://zenodo.org/records/10211569/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"]
MASH_URLS = ["https://zenodo.org/records/13969103/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"]

HIGH_SIMILARITY_THRESHOLD_O = 0.00771 # alleles that are 99.23% apart will be reported as mixed call ~ 8 nt difference on average
MIN_O_IDENTITY_LS = 95 #low similarity group O antigen min identity threshold to pre-filter BLAST output (identical to global threshold)
MIN_O_COVERAGE_LS = 48 #low similarity group O antigen min coverage threshold to pre-filter BLAST output (based on cross-talk study results)
PATHOTYPE_TOXIN_FIELDS = ['pathotype', 'pathotype_count', 'pathotype_genes', 'pathotype_gene_names', 'pathotype_accessions', 'pathotype_allele_id',
'pathotype_pident', 'pathotype_pcov','pathotype_length_ratio', 'pathotype_rule_ids', 'pathotype_gene_counts', 'pathotype_database',
'stx_genes', 'stx_accessions', 'stx_allele_ids', 'stx_genes_full_name', 'stx_pidents', 'stx_pcovs', 'stx_gene_lengths', 'stx_contigs', 'stx_gene_ranges']
OUTPUT_TSV_HEADER = ['Name','Species','O-type','H-type','Serotype','QC',
OUTPUT_TSV_HEADER = ['Name','Species', 'SpeciesMashRatio', 'SpeciesMashDistance','SpeciesMashTopID','O-type','H-type','Serotype','QC',
'Evidence','GeneScores','AlleleKeys','GeneIdentities(%)',
'GeneCoverages(%)','GeneContigNames','GeneRanges',
'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs',
Expand Down
11 changes: 10 additions & 1 deletion ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,15 +348,24 @@ def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir, ectyperdb_d
with Pool(processes=args.cores) as pool:
results = pool.map(gp, genome_groups)

# merge the per-database predictions with the final predictions dict
# merge the database predictions with the final predictions dict
for r in results:
predictions_dict = {**r, **predictions_dict}

for genome_name in predictions_dict.keys():
predictions_dict[genome_name]["species"] = "-"
predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = "-"
predictions_dict[genome_name]["species_mash_dist2ref"] = "-"
predictions_dict[genome_name]["species_mash_top_reference"] = "-"
try:
predictions_dict[genome_name]["species"] = genome_files_dict[genome_name]["species"]
predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = genome_files_dict[genome_name]["species_mash_hash_ratio2ref"]
predictions_dict[genome_name]["species_mash_dist2ref"] = genome_files_dict[genome_name]["species_mash_dist2ref"]
predictions_dict[genome_name]["species_mash_top_reference"] = genome_files_dict[genome_name]["species_mash_top_reference"]
predictions_dict[genome_name]["error"] = genome_files_dict[genome_name]["error"]
except KeyError as e:
predictions_dict[genome_name]["error"] = "Error: "+str(e)+" in "+genome_name
LOG.error(f"Failed on {genome_name} sample that does not exist in the 'genome_files_dict' dictionary with the {e} error")

return predictions_dict

Expand Down
14 changes: 13 additions & 1 deletion ectyper/init.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
import argparse
from ectyper import (definitions, speciesIdentification)
import logging
logging.basicConfig(level=logging.DEBUG)





def main():
logging.info("Initializing Species ID database ...")
speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH)
parser = argparse.ArgumentParser(
description='Species ID database ectyper initializer'
)
parser.add_argument(
"--force",
action="store_true",
help="Force species ID database initialization"
)
args = parser.parse_args()

speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH, args.force)
logging.info("Done")
19 changes: 16 additions & 3 deletions ectyper/predictionFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,9 @@ def report_result(final_dict, output_dir, output_file, args):
for sample in final_dict.keys():
output_line = [sample] #name of a query sample/genome
output_line.append(final_dict[sample]["species"]) #add species info
output_line.append(final_dict[sample]["species_mash_hash_ratio2ref"]) #add species top hit mash hash ratios
output_line.append(final_dict[sample]["species_mash_dist2ref"]) #add species mash distance
output_line.append(final_dict[sample]["species_mash_top_reference"]) #add species mash top hit id

if "O" in final_dict[sample].keys():
Otype=final_dict[sample]["O"]["serogroup"]
Expand Down Expand Up @@ -876,6 +879,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
:param predictions_data_frame: the Dict containing the ectyper predictions
:return: modified prediction file
"""
print(other_dict)

# genome names are given without the filename extension
for g in all_genomes_list:
Expand All @@ -886,17 +890,26 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo
if gname in other_dict:
predictions_dict[gname] = {
'error': other_dict[gname]["error"],
'species': other_dict[gname]["species"]
'species': other_dict[gname]["species"],
'species_mash_hash_ratio2ref': other_dict[gname]["species_mash_hash_ratio2ref"],
'species_mash_dist2ref': other_dict[gname]["species_mash_dist2ref"],
'species_mash_top_reference' : other_dict[gname]["species_mash_top_reference"]
}
elif gname in filesnotfound_dict:
predictions_dict[gname] = {
'error': filesnotfound_dict[gname]["error"],
'species': '-'
'species': '-',
'species_mash_hash_ratio2ref':'-',
'species_mash_dist2ref': '-',
'species_mash_top_reference' : '-'
}
else:
predictions_dict[gname] = {
'error': f"No O and H antigen determinant E.coli genes were found in {gname}",
'species': ecoli_dict[gname]["species"]
'species': ecoli_dict[gname]["species"],
'species_mash_hash_ratio2ref': ecoli_dict[gname]["species_mash_hash_ratio2ref"],
'species_mash_dist2ref': ecoli_dict[gname]["species_mash_dist2ref"],
'species_mash_top_reference' : ecoli_dict[gname]["species_mash_top_reference"]
}

return predictions_dict
44 changes: 26 additions & 18 deletions ectyper/speciesIdentification.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def setLockFile(lockfilepath):
time.sleep(60) # recheck every 1 min if lock file was removed
LOG.info("Lock file doest not exist or is removed by other process. Continue with databases download ...")

def get_species_mash(targetpath):
def get_species_mash(targetpath, force_download=False):
"""
Get MASH sketch of genomes for species identification and check that the most recent version is installed
:return returns boolean value depending on success of failure to download the MASH sketch
Expand All @@ -54,12 +54,12 @@ def get_species_mash(targetpath):
os.remove(lockfilepath)


if bool_downloadMashSketch(targetpath):
if bool_downloadMashSketch(targetpath) or force_download == True:
LOG.info("MASH species id sketch is missing and needs to be downloaded ...")
setLockFile(lockfilepath)
for url in definitions.MASH_URLS:
try:
if os.path.exists(targetpath) == False:
if os.path.exists(targetpath) == False or force_download == True:
LOG.info("Downloading ~900MB from {}.".format(url))
response = requests.get(url,timeout=10, verify=False)
response.raise_for_status()
Expand All @@ -84,7 +84,7 @@ def get_species_mash(targetpath):
return False #if all mirrors failed

else:
LOG.info("MASH species id sketch is in good health and does not need to be downloaded".format(
LOG.info("MASH species id sketch at {} exists and is in good health and does not need to be downloaded".format(
targetpath
))
return True
Expand Down Expand Up @@ -209,7 +209,6 @@ def get_species(file, args, cores=1):
LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file,
[(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0]))


top_hit_line_elements = top_hit_line.split()

if len(top_hit_line_elements) < 5:
Expand All @@ -231,7 +230,7 @@ def get_species(file, args, cores=1):
else:
LOG.warning(f"Could not determine species based on MASH distance for {file}")
species = "-"
return species
return species, top_match_hashratio, top_match_dist, top_match


def getSampleName(file):
Expand Down Expand Up @@ -286,31 +285,40 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args)
#do species always regardless of --verify param. Do prediction on fastq files if available for better accuracy
if fasta_fastq_files_dict[fasta]:
fastq_file = fasta_fastq_files_dict[fasta]
speciesname = get_species(fastq_file, args, args.cores)
speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession = get_species(fastq_file, args, args.cores)
else:
speciesname = get_species(fasta, args, args.cores)
speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession = get_species(fasta, args, args.cores)

if args.verify:
failverifyerrormessage += "Sample identified as " + speciesname + ": serotyping results are only valid for E.coli samples." \
"If sure that sample is E.coli run without --verify parameter."
failverifyerrormessage += "Sample identified as " + speciesname + ": typing results are only valid for E.coli samples." \
"If sure that sample is E.coli or want results regardless try running without the --verify parameter."
if re.match("Escherichia coli", speciesname):
ecoli_files_dict[sampleName] = {"species":speciesname,
ecoli_files_dict[sampleName] = {"species":speciesname, "species_mash_hash_ratio2ref":species_mash_hash_ratio,
"species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession,
"filepath":fasta, "error": ""}
elif is_escherichia_genus(speciesname):
other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":failverifyerrormessage}
other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,
"species_mash_hash_ratio2ref":species_mash_hash_ratio,
"species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession,
"error":failverifyerrormessage}
else:
other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta, "error":failverifyerrormessage}
other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta,
"species_mash_hash_ratio2ref":species_mash_hash_ratio,
"species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession,
"error":failverifyerrormessage}
else:
ecoli_files_dict[sampleName] = {"species": speciesname,
"filepath": fasta, "error": ""}

ecoli_files_dict[sampleName] = {"species": speciesname,"filepath": fasta,
"species_mash_hash_ratio2ref":species_mash_hash_ratio, "species_mash_dist2ref": species_mash_dist,
"species_mash_top_reference":species_top_hit_accession, "error": ""}

for bf in ofiles:
sampleName = getSampleName(bf)
LOG.warning(f"{sampleName} is non fasta / fastq file. Species identification aborted")
other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-"}
other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-",
"species_mash_hash_ratio2ref":"-", "species_mash_dist2ref":"-", "species_mash_top_reference":"-"}

for file in filesnotfound:
sampleName = getSampleName(file)
filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file)}
filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file), }

return ecoli_files_dict, other_files_dict,filesnotfound_dict
83 changes: 21 additions & 62 deletions test/test_O_serotyping.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,15 @@ def test_Otyping(caplog):


with open(os.path.join(tmpdir,"output.tsv")) as outfp:
secondrow = outfp.readlines()[1].split("\t")
Otype = secondrow[2]
Htype = secondrow[3]
lines = outfp.readlines()
header = lines[0].split("\t")
secondrow = lines[1].split("\t")
assert 'O-type' in header, f'O-type column not found in the output.tsv ({header})'
assert 'H-type' in header, f'H-type column not found in the output.tsv ({header})'
col_number = [idx for idx, i in enumerate(header) if i == "O-type"][0]
Otype = secondrow[col_number]
col_number = [idx for idx, i in enumerate(header) if i == "H-type"][0]
Htype = secondrow[col_number]

assert Otype == "-", "Expected no call but reported O-type:" + Otype
assert Htype == "H11", "Expected H11 but reported H-type:" + Htype
Expand Down Expand Up @@ -102,71 +108,23 @@ def test_mixofspecies(caplog):

with open(os.path.join(tmpdir,"output.tsv")) as outfp:
rows = outfp.readlines()
rows=rows[1:] #remove header line
header = rows[0]
resultsrows = rows[1:] #remove header line

serovars=[]; genomenames=[]; QCflag=[]; confidence=[]
for row in rows:
serotypes=[]; species=[]; QCflag=[]

for row in resultsrows:
rowlist = row.split("\t")
serovars.append(rowlist[4])
genomenames.append(rowlist[1])
QCflag.append(rowlist[5])
confidence.append(rowlist[6])
serotypes.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "Serotype"][0] ])
species.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "Species"][0] ])
QCflag.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "QC"][0] ])

assert serovars == ['-:-', 'O22:H8', '-:-']
assert serotypes == ['-:-', 'O22:H8', '-:-']
expectedspecies_list = ["Campylobacter_D jejuni","Escherichia coli","Salmonella enterica"]
for i in range(0,3):
assert bool(re.match(expectedspecies_list[i], genomenames[i])) == True
assert bool(re.match(expectedspecies_list[i], species[i])) == True
assert QCflag == ["WARNING (WRONG SPECIES)","PASS (REPORTABLE)","WARNING (WRONG SPECIES)"]


def test_Ealbertii_1(caplog): #error
LOG.info("Starting 1 of 3 test on EnteroBase on sample ESC_HA8355AA_AS: Escherichia albertii O65:H5")
caplog.set_level(logging.DEBUG)
file = os.path.join(TEST_ROOT,
'Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta')
tmpdir = tempfile.mkdtemp()
set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir)

ectyper.run_program()
with open(os.path.join(tmpdir,"output.tsv")) as outfp:
rows = outfp.readlines()
secondrow=rows[1:][0] #remove header line
assert "Escherichia albertii" in secondrow
assert "WARNING (WRONG SPECIES)" in secondrow

def test_Ealbertii_2(): #error
LOG.info("Starting 2 of 3 test on EnteroBase on sample on ESC_HA8509AA_AS: Escherichia albertii O5:H5")

file = os.path.join(TEST_ROOT,
'Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta')
tmpdir = tempfile.mkdtemp()
set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir)
ectyper.run_program()

with open(os.path.join(tmpdir,"output.tsv")) as outfp:
rows = outfp.readlines()
secondrow=rows[1:][0] #check only second row

assert "Escherichia albertii" in secondrow
assert "WARNING (WRONG SPECIES)" in secondrow

def test_Ealbertii_3(caplog):
LOG.info("Starting 3 of 3 test Escherichia albertii O49:NM") #can not type O49 due to poor sequence quality of uncertainty of wet-lab O49 typing
caplog.set_level(logging.DEBUG)
file = os.path.join(TEST_ROOT,
'Data/Ealbertii_O49NM.fasta')

tmpdir = tempfile.mkdtemp()
set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir)
ectyper.run_program()

with open(os.path.join(tmpdir ,"output.tsv")) as outfp:
rows = outfp.readlines()
secondrow=rows[1:][0] #check only second row
assert "Escherichia albertii" in secondrow
assert "WARNING (WRONG SPECIES)" in secondrow


def test_Ecoli_O17H18(caplog):
caplog.set_level(logging.DEBUG)
file = os.path.join(TEST_ROOT,
Expand All @@ -179,7 +137,8 @@ def test_Ecoli_O17H18(caplog):
with open(os.path.join(tmpdir,"output.tsv")) as outfp:
rows = outfp.readlines()
secondrow=rows[1:][0] #check only second row
assert "Escherichia coli\tO17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow
assert "Escherichia coli" in secondrow.split('\t')
assert "O17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow

def test_download_refseq_mash(caplog, tmpdir):
caplog.set_level(logging.DEBUG)
Expand Down
Loading

0 comments on commit b2c7c5c

Please sign in to comment.