From b2c7c5c13bda39922770fd06c22a7d267eeaa0ab Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Mon, 28 Oct 2024 15:26:13 -0400 Subject: [PATCH] added new species ID fields that provide info on species hashes to ref and top reference accession id --- README.md | 2 +- ectyper/commandLineOptions.py | 2 +- ectyper/definitions.py | 4 +- ectyper/ectyper.py | 11 ++++- ectyper/init.py | 14 +++++- ectyper/predictionFunctions.py | 19 ++++++-- ectyper/speciesIdentification.py | 44 ++++++++++------- test/test_O_serotyping.py | 83 ++++++++------------------------ test/test_ectyper_integration.py | 2 +- test/test_ectyper_speciesID.py | 70 +++++++++++++++++++++++++-- 10 files changed, 158 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index d6fe5d5..1cfa20c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Docker and Singularity images are also available from [https://biocontainers.pro ### Databases ECTyper uses multiple databases - - the species identification database is available from [https://zenodo.org/records/10211569](https://zenodo.org/records/10211569) + - the species identification database is available from [Zenodo](https://doi.org/10.5281/zenodo.10211568) repository - the O and H antigen allele sequences are stored in [ectyper_alleles_db.json](ectyper/Data/ectyper_alleles_db.json) - the toxin and pathotype signature marker sequences are stored in [ectyper_patho_stx_toxin_typing_database.json](ectyper/Data/ectyper_patho_stx_toxin_typing_database.json) diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py index 9961105..3ff2308 100644 --- a/ectyper/commandLineOptions.py +++ b/ectyper/commandLineOptions.py @@ -2,7 +2,7 @@ import argparse from ectyper import __version__ -import json, os +import json import ectyper.definitions as definitions def parse_command_line(args=None): diff --git a/ectyper/definitions.py b/ectyper/definitions.py index 0d55670..fa03db7 100644 --- a/ectyper/definitions.py +++ b/ectyper/definitions.py @@ -32,7 +32,7 @@ '15':['O89','O101','O162'], '16':['O169','O183'] } -MASH_URLS = ["https://zenodo.org/records/10211569/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"] +MASH_URLS = ["https://zenodo.org/records/13969103/files/EnteroRef_GTDBSketch_20231003_V2.msh?download=1"] HIGH_SIMILARITY_THRESHOLD_O = 0.00771 # alleles that are 99.23% apart will be reported as mixed call ~ 8 nt difference on average MIN_O_IDENTITY_LS = 95 #low similarity group O antigen min identity threshold to pre-filter BLAST output (identical to global threshold) @@ -40,7 +40,7 @@ PATHOTYPE_TOXIN_FIELDS = ['pathotype', 'pathotype_count', 'pathotype_genes', 'pathotype_gene_names', 'pathotype_accessions', 'pathotype_allele_id', 'pathotype_pident', 'pathotype_pcov','pathotype_length_ratio', 'pathotype_rule_ids', 'pathotype_gene_counts', 'pathotype_database', 'stx_genes', 'stx_accessions', 'stx_allele_ids', 'stx_genes_full_name', 'stx_pidents', 'stx_pcovs', 'stx_gene_lengths', 'stx_contigs', 'stx_gene_ranges'] -OUTPUT_TSV_HEADER = ['Name','Species','O-type','H-type','Serotype','QC', +OUTPUT_TSV_HEADER = ['Name','Species', 'SpeciesMashRatio', 'SpeciesMashDistance','SpeciesMashTopID','O-type','H-type','Serotype','QC', 'Evidence','GeneScores','AlleleKeys','GeneIdentities(%)', 'GeneCoverages(%)','GeneContigNames','GeneRanges', 'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py index 252200b..da58316 100644 --- a/ectyper/ectyper.py +++ b/ectyper/ectyper.py @@ -348,15 +348,24 @@ def run_prediction(genome_files_dict, args, alleles_fasta, temp_dir, ectyperdb_d with Pool(processes=args.cores) as pool: results = pool.map(gp, genome_groups) - # merge the per-database predictions with the final predictions dict + # merge the database predictions with the final predictions dict for r in results: predictions_dict = {**r, **predictions_dict} + for genome_name in predictions_dict.keys(): + predictions_dict[genome_name]["species"] = "-" + predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = "-" + predictions_dict[genome_name]["species_mash_dist2ref"] = "-" + predictions_dict[genome_name]["species_mash_top_reference"] = "-" try: predictions_dict[genome_name]["species"] = genome_files_dict[genome_name]["species"] + predictions_dict[genome_name]["species_mash_hash_ratio2ref"] = genome_files_dict[genome_name]["species_mash_hash_ratio2ref"] + predictions_dict[genome_name]["species_mash_dist2ref"] = genome_files_dict[genome_name]["species_mash_dist2ref"] + predictions_dict[genome_name]["species_mash_top_reference"] = genome_files_dict[genome_name]["species_mash_top_reference"] predictions_dict[genome_name]["error"] = genome_files_dict[genome_name]["error"] except KeyError as e: predictions_dict[genome_name]["error"] = "Error: "+str(e)+" in "+genome_name + LOG.error(f"Failed on {genome_name} sample that does not exist in the 'genome_files_dict' dictionary with the {e} error") return predictions_dict diff --git a/ectyper/init.py b/ectyper/init.py index d660de3..12dbcaf 100644 --- a/ectyper/init.py +++ b/ectyper/init.py @@ -1,3 +1,4 @@ +import argparse from ectyper import (definitions, speciesIdentification) import logging logging.basicConfig(level=logging.DEBUG) @@ -5,7 +6,18 @@ + def main(): logging.info("Initializing Species ID database ...") - speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH) + parser = argparse.ArgumentParser( + description='Species ID database ectyper initializer' + ) + parser.add_argument( + "--force", + action="store_true", + help="Force species ID database initialization" + ) + args = parser.parse_args() + + speciesIdentification.get_species_mash(definitions.SPECIES_ID_SKETCH, args.force) logging.info("Done") \ No newline at end of file diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py index 2c45865..b01fc83 100644 --- a/ectyper/predictionFunctions.py +++ b/ectyper/predictionFunctions.py @@ -757,6 +757,9 @@ def report_result(final_dict, output_dir, output_file, args): for sample in final_dict.keys(): output_line = [sample] #name of a query sample/genome output_line.append(final_dict[sample]["species"]) #add species info + output_line.append(final_dict[sample]["species_mash_hash_ratio2ref"]) #add species top hit mash hash ratios + output_line.append(final_dict[sample]["species_mash_dist2ref"]) #add species mash distance + output_line.append(final_dict[sample]["species_mash_top_reference"]) #add species mash top hit id if "O" in final_dict[sample].keys(): Otype=final_dict[sample]["O"]["serogroup"] @@ -876,6 +879,7 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo :param predictions_data_frame: the Dict containing the ectyper predictions :return: modified prediction file """ + print(other_dict) # genome names are given without the filename extension for g in all_genomes_list: @@ -886,17 +890,26 @@ def add_non_predicted(all_genomes_list, predictions_dict, other_dict, filesnotfo if gname in other_dict: predictions_dict[gname] = { 'error': other_dict[gname]["error"], - 'species': other_dict[gname]["species"] + 'species': other_dict[gname]["species"], + 'species_mash_hash_ratio2ref': other_dict[gname]["species_mash_hash_ratio2ref"], + 'species_mash_dist2ref': other_dict[gname]["species_mash_dist2ref"], + 'species_mash_top_reference' : other_dict[gname]["species_mash_top_reference"] } elif gname in filesnotfound_dict: predictions_dict[gname] = { 'error': filesnotfound_dict[gname]["error"], - 'species': '-' + 'species': '-', + 'species_mash_hash_ratio2ref':'-', + 'species_mash_dist2ref': '-', + 'species_mash_top_reference' : '-' } else: predictions_dict[gname] = { 'error': f"No O and H antigen determinant E.coli genes were found in {gname}", - 'species': ecoli_dict[gname]["species"] + 'species': ecoli_dict[gname]["species"], + 'species_mash_hash_ratio2ref': ecoli_dict[gname]["species_mash_hash_ratio2ref"], + 'species_mash_dist2ref': ecoli_dict[gname]["species_mash_dist2ref"], + 'species_mash_top_reference' : ecoli_dict[gname]["species_mash_top_reference"] } return predictions_dict diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py index 4b528e2..d5a14e0 100644 --- a/ectyper/speciesIdentification.py +++ b/ectyper/speciesIdentification.py @@ -42,7 +42,7 @@ def setLockFile(lockfilepath): time.sleep(60) # recheck every 1 min if lock file was removed LOG.info("Lock file doest not exist or is removed by other process. Continue with databases download ...") -def get_species_mash(targetpath): +def get_species_mash(targetpath, force_download=False): """ Get MASH sketch of genomes for species identification and check that the most recent version is installed :return returns boolean value depending on success of failure to download the MASH sketch @@ -54,12 +54,12 @@ def get_species_mash(targetpath): os.remove(lockfilepath) - if bool_downloadMashSketch(targetpath): + if bool_downloadMashSketch(targetpath) or force_download == True: LOG.info("MASH species id sketch is missing and needs to be downloaded ...") setLockFile(lockfilepath) for url in definitions.MASH_URLS: try: - if os.path.exists(targetpath) == False: + if os.path.exists(targetpath) == False or force_download == True: LOG.info("Downloading ~900MB from {}.".format(url)) response = requests.get(url,timeout=10, verify=False) response.raise_for_status() @@ -84,7 +84,7 @@ def get_species_mash(targetpath): return False #if all mirrors failed else: - LOG.info("MASH species id sketch is in good health and does not need to be downloaded".format( + LOG.info("MASH species id sketch at {} exists and is in good health and does not need to be downloaded".format( targetpath )) return True @@ -209,7 +209,6 @@ def get_species(file, args, cores=1): LOG.info('For {} following top hits and hash ratios returned by MASH {}'.format(file, [(top_hit_line.split("\t")[0],top_hit_line.split("\t")[4]) for top_hit_line in top_hit_lines if len(top_hit_line.split("\t")[0])>0])) - top_hit_line_elements = top_hit_line.split() if len(top_hit_line_elements) < 5: @@ -231,7 +230,7 @@ def get_species(file, args, cores=1): else: LOG.warning(f"Could not determine species based on MASH distance for {file}") species = "-" - return species + return species, top_match_hashratio, top_match_dist, top_match def getSampleName(file): @@ -286,31 +285,40 @@ def verify_ecoli_and_inputs(fasta_fastq_files_dict, ofiles, filesnotfound, args) #do species always regardless of --verify param. Do prediction on fastq files if available for better accuracy if fasta_fastq_files_dict[fasta]: fastq_file = fasta_fastq_files_dict[fasta] - speciesname = get_species(fastq_file, args, args.cores) + speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession = get_species(fastq_file, args, args.cores) else: - speciesname = get_species(fasta, args, args.cores) + speciesname, species_mash_hash_ratio, species_mash_dist, species_top_hit_accession = get_species(fasta, args, args.cores) if args.verify: - failverifyerrormessage += "Sample identified as " + speciesname + ": serotyping results are only valid for E.coli samples." \ - "If sure that sample is E.coli run without --verify parameter." + failverifyerrormessage += "Sample identified as " + speciesname + ": typing results are only valid for E.coli samples." \ + "If sure that sample is E.coli or want results regardless try running without the --verify parameter." if re.match("Escherichia coli", speciesname): - ecoli_files_dict[sampleName] = {"species":speciesname, + ecoli_files_dict[sampleName] = {"species":speciesname, "species_mash_hash_ratio2ref":species_mash_hash_ratio, + "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, "filepath":fasta, "error": ""} elif is_escherichia_genus(speciesname): - other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta,"error":failverifyerrormessage} + other_files_dict[sampleName] = {"species":speciesname,"filepath":fasta, + "species_mash_hash_ratio2ref":species_mash_hash_ratio, + "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, + "error":failverifyerrormessage} else: - other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta, "error":failverifyerrormessage} + other_files_dict[sampleName] = {"species":speciesname, "filepath":fasta, + "species_mash_hash_ratio2ref":species_mash_hash_ratio, + "species_mash_dist2ref": species_mash_dist, "species_mash_top_reference":species_top_hit_accession, + "error":failverifyerrormessage} else: - ecoli_files_dict[sampleName] = {"species": speciesname, - "filepath": fasta, "error": ""} - + ecoli_files_dict[sampleName] = {"species": speciesname,"filepath": fasta, + "species_mash_hash_ratio2ref":species_mash_hash_ratio, "species_mash_dist2ref": species_mash_dist, + "species_mash_top_reference":species_top_hit_accession, "error": ""} + for bf in ofiles: sampleName = getSampleName(bf) LOG.warning(f"{sampleName} is non fasta / fastq file. Species identification aborted") - other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-"} + other_files_dict[sampleName] = {"error":"Non fasta / fastq file. ","filepath":bf,"species":"-", + "species_mash_hash_ratio2ref":"-", "species_mash_dist2ref":"-", "species_mash_top_reference":"-"} for file in filesnotfound: sampleName = getSampleName(file) - filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file)} + filesnotfound_dict[sampleName]={"error":"File {} not found!".format(file), } return ecoli_files_dict, other_files_dict,filesnotfound_dict \ No newline at end of file diff --git a/test/test_O_serotyping.py b/test/test_O_serotyping.py index 7af5f9e..e5e2915 100644 --- a/test/test_O_serotyping.py +++ b/test/test_O_serotyping.py @@ -55,9 +55,15 @@ def test_Otyping(caplog): with open(os.path.join(tmpdir,"output.tsv")) as outfp: - secondrow = outfp.readlines()[1].split("\t") - Otype = secondrow[2] - Htype = secondrow[3] + lines = outfp.readlines() + header = lines[0].split("\t") + secondrow = lines[1].split("\t") + assert 'O-type' in header, f'O-type column not found in the output.tsv ({header})' + assert 'H-type' in header, f'H-type column not found in the output.tsv ({header})' + col_number = [idx for idx, i in enumerate(header) if i == "O-type"][0] + Otype = secondrow[col_number] + col_number = [idx for idx, i in enumerate(header) if i == "H-type"][0] + Htype = secondrow[col_number] assert Otype == "-", "Expected no call but reported O-type:" + Otype assert Htype == "H11", "Expected H11 but reported H-type:" + Htype @@ -102,71 +108,23 @@ def test_mixofspecies(caplog): with open(os.path.join(tmpdir,"output.tsv")) as outfp: rows = outfp.readlines() - rows=rows[1:] #remove header line + header = rows[0] + resultsrows = rows[1:] #remove header line - serovars=[]; genomenames=[]; QCflag=[]; confidence=[] - for row in rows: + serotypes=[]; species=[]; QCflag=[] + + for row in resultsrows: rowlist = row.split("\t") - serovars.append(rowlist[4]) - genomenames.append(rowlist[1]) - QCflag.append(rowlist[5]) - confidence.append(rowlist[6]) + serotypes.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "Serotype"][0] ]) + species.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "Species"][0] ]) + QCflag.append(rowlist[ [idx for idx, i in enumerate(header.split('\t')) if i == "QC"][0] ]) - assert serovars == ['-:-', 'O22:H8', '-:-'] + assert serotypes == ['-:-', 'O22:H8', '-:-'] expectedspecies_list = ["Campylobacter_D jejuni","Escherichia coli","Salmonella enterica"] for i in range(0,3): - assert bool(re.match(expectedspecies_list[i], genomenames[i])) == True + assert bool(re.match(expectedspecies_list[i], species[i])) == True assert QCflag == ["WARNING (WRONG SPECIES)","PASS (REPORTABLE)","WARNING (WRONG SPECIES)"] - -def test_Ealbertii_1(caplog): #error - LOG.info("Starting 1 of 3 test on EnteroBase on sample ESC_HA8355AA_AS: Escherichia albertii O65:H5") - caplog.set_level(logging.DEBUG) - file = os.path.join(TEST_ROOT, - 'Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta') - tmpdir = tempfile.mkdtemp() - set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) - - ectyper.run_program() - with open(os.path.join(tmpdir,"output.tsv")) as outfp: - rows = outfp.readlines() - secondrow=rows[1:][0] #remove header line - assert "Escherichia albertii" in secondrow - assert "WARNING (WRONG SPECIES)" in secondrow - -def test_Ealbertii_2(): #error - LOG.info("Starting 2 of 3 test on EnteroBase on sample on ESC_HA8509AA_AS: Escherichia albertii O5:H5") - - file = os.path.join(TEST_ROOT, - 'Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta') - tmpdir = tempfile.mkdtemp() - set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) - ectyper.run_program() - - with open(os.path.join(tmpdir,"output.tsv")) as outfp: - rows = outfp.readlines() - secondrow=rows[1:][0] #check only second row - - assert "Escherichia albertii" in secondrow - assert "WARNING (WRONG SPECIES)" in secondrow - -def test_Ealbertii_3(caplog): - LOG.info("Starting 3 of 3 test Escherichia albertii O49:NM") #can not type O49 due to poor sequence quality of uncertainty of wet-lab O49 typing - caplog.set_level(logging.DEBUG) - file = os.path.join(TEST_ROOT, - 'Data/Ealbertii_O49NM.fasta') - - tmpdir = tempfile.mkdtemp() - set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) - ectyper.run_program() - - with open(os.path.join(tmpdir ,"output.tsv")) as outfp: - rows = outfp.readlines() - secondrow=rows[1:][0] #check only second row - assert "Escherichia albertii" in secondrow - assert "WARNING (WRONG SPECIES)" in secondrow - - def test_Ecoli_O17H18(caplog): caplog.set_level(logging.DEBUG) file = os.path.join(TEST_ROOT, @@ -179,7 +137,8 @@ def test_Ecoli_O17H18(caplog): with open(os.path.join(tmpdir,"output.tsv")) as outfp: rows = outfp.readlines() secondrow=rows[1:][0] #check only second row - assert "Escherichia coli\tO17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow + assert "Escherichia coli" in secondrow.split('\t') + assert "O17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow def test_download_refseq_mash(caplog, tmpdir): caplog.set_level(logging.DEBUG) diff --git a/test/test_ectyper_integration.py b/test/test_ectyper_integration.py index efe7a80..58f17cc 100644 --- a/test/test_ectyper_integration.py +++ b/test/test_ectyper_integration.py @@ -144,7 +144,7 @@ def test_multiple_directories(caplog): assert any([True if re.match(r".+sample.fasta.+WARNING\s+\(WRONG\s+SPECIES\).+Non fasta / fastq file", line) else False for line in caplog.text.splitlines()]), "Issue with sample.fasta" assert any([True if re.match(r".+sampletar.+WARNING\s+\(WRONG\s+SPECIES\).+Non fasta / fastq file", line) else False for line in caplog.text.splitlines()]), "Issue with sampletar" assert any([True if re.match(r".+test_junk.+WARNING\s+\(WRONG\s+SPECIES\).+Non fasta / fastq file", line) else False for line in caplog.text.splitlines()]), "Issue with test_junk" - assert any([True if re.match(r".+GCA_000181775\.1_ASM18177v1_genomic\s+Escherichia\s+coli\s+O157\s+H7.+REPORTABLE", line) else False for line in caplog.text.splitlines()]), "Issue with GCF_000181775.1_ASM18177v1" + assert any([True if re.match(r".+GCA_000181775\.1_ASM18177v1_genomic\s+Escherichia\s+coli.+O157\s+H7.+REPORTABLE", line) else False for line in caplog.text.splitlines()]), "Issue with GCF_000181775.1_ASM18177v1" def test_mash_sketch_and_assembly_metadata(tmpdir): diff --git a/test/test_ectyper_speciesID.py b/test/test_ectyper_speciesID.py index 8d3ac24..7ec1d6c 100644 --- a/test/test_ectyper_speciesID.py +++ b/test/test_ectyper_speciesID.py @@ -48,14 +48,16 @@ def test_failed_species_identification(caplog): file = os.path.join(TEST_ROOT, 'Data/GCF_001672015.1.fna') set_input(input=file, verify=True) ectyper.run_program() - assert "GCF_001672015.1\tEscherichia coli\t-\tH8\t-:H8\tWARNING (-:H TYPING)" in caplog.text + assert "GCF_001672015.1\tEscherichia coli" in caplog.text + assert "-\tH8\t-:H8\tWARNING (-:H TYPING)" in caplog.text def test_failed_species_identification_nospeciesverify(caplog): caplog.set_level(logging.DEBUG) file = os.path.join(TEST_ROOT, 'Data/GCF_001672015.1.fna') set_input(input=file, verify=False) ectyper.run_program() - assert "GCF_001672015.1\tEscherichia coli\t-\tH8\t-:H8\t-" in caplog.text + assert "GCF_001672015.1\tEscherichia coli" in caplog.text + assert "-\tH8\t-:H8\t-" in caplog.text def test_non_existing_accession_in_meta(caplog): @@ -82,4 +84,66 @@ def test_speciesID_non_existing(): args = ectyper.commandLineOptions.parse_command_line() result = ectyper.speciesIdentification.verify_ecoli_and_inputs(fasta_fastq_files_dict={fastafile:None}, ofiles={}, filesnotfound={}, args=args) - assert result[0]['GCF_001672015.1']['species'] == 'Escherichia coli' \ No newline at end of file + assert result[0]['GCF_001672015.1']['species'] == 'Escherichia coli' + +def test_speciesID_Shigella(caplog): + fastafile=os.path.join(TEST_ROOT, 'Data/ESC_BA0255AA_AS_Shigella_sonnei.fasta') + caplog.set_level(logging.DEBUG) + tmpdir = tempfile.mkdtemp() + set_input(input=fastafile, verify=False, output=tmpdir) + expected_species = "Shigella sonnei" + + ectyper.run_program() + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + rows = outfp.readlines() + species_col_number = [idx for idx, i in enumerate(rows[0].split('\t')) if i == "Species"] + assert len(species_col_number) == 1, "Could not find the species column" + secondrow=rows[1:][0].split('\t') #check only second row + assert secondrow[species_col_number[0]] == expected_species, f"Could not find species {expected_species}" + +def test_Ealbertii_1(caplog): #error + logging.info("Starting 1 of 3 test on EnteroBase on sample ESC_HA8355AA_AS: Escherichia albertii O65:H5") + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT, + 'Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) + ectyper.run_program() + + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + rows = outfp.readlines() + secondrow=rows[1:][0] #remove header line + assert "Escherichia albertii" in secondrow + assert "WARNING (WRONG SPECIES)" in secondrow + +def test_Ealbertii_2(caplog): #error + logging.info("Starting 2 of 3 test on EnteroBase on sample on ESC_HA8509AA_AS: Escherichia albertii O5:H5") + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT, + 'Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta') + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) + ectyper.run_program() + + with open(os.path.join(tmpdir,"output.tsv")) as outfp: + rows = outfp.readlines() + secondrow=rows[1:][0] #check only second row + + assert "Escherichia albertii" in secondrow + assert "WARNING (WRONG SPECIES)" in secondrow + +def test_Ealbertii_3(caplog): + logging.info("Starting 3 of 3 test Escherichia albertii O49:NM") #can not type O49 due to poor sequence quality of uncertainty of wet-lab O49 typing + caplog.set_level(logging.DEBUG) + file = os.path.join(TEST_ROOT, + 'Data/Ealbertii_O49NM.fasta') + + tmpdir = tempfile.mkdtemp() + set_input(input=file, cores=4, print_sequence=True, verify=True, output=tmpdir) + ectyper.run_program() + + with open(os.path.join(tmpdir ,"output.tsv")) as outfp: + rows = outfp.readlines() + secondrow=rows[1:][0] #check only second row + assert "Escherichia albertii" in secondrow + assert "WARNING (WRONG SPECIES)" in secondrow \ No newline at end of file