From cef58c75ad573fe8fac1c128997cd7e2740151f5 Mon Sep 17 00:00:00 2001
From: Inga Ulusoy
Date: Tue, 3 Sep 2024 09:21:57 +0200
Subject: [PATCH] add test files to package, use parse to check accuracy

---
 mailcom/inout.py | 53 -----------------------------------------------------
 mailcom/parse.py | 42 +++++++++++++++++++++---------------------
 2 files changed, 21 insertions(+), 74 deletions(-)

diff --git a/mailcom/inout.py b/mailcom/inout.py
index 063e053..fef7a01 100644
--- a/mailcom/inout.py
+++ b/mailcom/inout.py
@@ -1,5 +1,3 @@
-from email import policy
-from email.parser import BytesParser
 from pathlib import Path
 import os
 import eml_parser
@@ -37,57 +35,6 @@ def get_text(file):
     return(email_content["content"])
 
 
-def delete_header(text):
-    items_to_delete = [
-        "Von:",
-        "Gesendet:",
-        "Betreff:",
-        "An:",
-        "Cc:",
-        "Sujet :",
-        "Date :",
-        "De :",
-        "Pour :",
-        "Copie :",
-        "Mailbeispiel",
-        "Mailbeispil",
-        "transféré",
-        "Sent:",
-        "https:",
-        "Von meinem iPhone gesendet",
-        "Anfang der weitergeleiteten",
-    ]
-    lines_to_delete = []
-    text_out_list = text.splitlines()
-    for index, line in enumerate(text_out_list):
-        if any(i == "@" for i in line):
-            # print("Deleting: found @: {}".format(line))
-            lines_to_delete.append(index)
-        # elif any(i == ">" for i in line):
-        #     lines_to_delete.append(index)
-        #     print("Deleting: found >: {}".format(line))
-        elif any(i in line for i in items_to_delete):
-            lines_to_delete.append(index)
-            # print("Deleting {}".format(line))
-    # check if any lines have been found
-    if lines_to_delete:
-        # delete lines
-        for i in reversed(lines_to_delete):
-            # print("xxxxx {}".format(text_out_list[i]))
-            del text_out_list[i]
-    # reduce whitespace not to confuse spacy
-    # remove tabs and outer whitespace
-    text_out_list = [line.replace("\t", " ").strip() for line in text_out_list]
-    # remove hyphens - this is risky though -
-    text_out_list = [line.replace("-", " ").strip() for line in text_out_list]
-    text_out_list = [line.replace("_", " ").strip() for line in text_out_list]
-    # reduce whitespace to one
-    text_out_list = [" ".join(line.split()) for line in text_out_list]
-    # delete empty lines
-    text_out_list = [line for line in text_out_list if line]
-    return " ".join(text_out_list)
-
-
 def write_file(text, name):
     with open("{}.out".format(name), "w") as file:
         file.write(text)
diff --git a/mailcom/parse.py b/mailcom/parse.py
index b6e2cd6..ab046a9 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -2,7 +2,7 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from .inout import get_text, delete_header, list_of_files, write_file
+from mailcom.inout import get_text, list_of_files, write_file
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -10,10 +10,10 @@
 lang = "es"
 # lang = "fr"
 # path where the input files can be found
-path_input = Path("./data/in/")
+path_input = Path("./test/data/")
 # path where the output files should be written to
 # this is generated if not present yet
-path_output = Path("./data/out/")
+path_output = Path("../data/out/")
 # the ner tool - currently only "transformers"
 tool = "transformers"
 # please do not modify below this section unless you know what you are doing
@@ -104,8 +104,8 @@ def make_dir(path: str):
 
 
 if __name__ == "__main__":
-    nlp_spacy = init_spacy(lang)
-    nlp_transformers = init_transformers()
+    # nlp_spacy = init_spacy(lang)
+    # nlp_transformers = init_transformers()
 
     # check that input dir is there
     if not check_dir(path_input):
@@ -116,27 +116,27 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    eml_files = list_of_files(path_input, "eml")
-    html_files = list_of_files(path_input, "html")
+    eml_files = list_of_files(path_input)
+    # html_files = list_of_files(path_input, "html")
     for file in eml_files:
         text = get_text(file)
+        print(text)
         # skip this text if email could not be parsed
         if not text:
             continue
-        text = delete_header(text)
-        doc_spacy = nlp_spacy(text)
-        text = get_sentences(doc_spacy)
+        # doc_spacy = nlp_spacy(text)
+        # text = get_sentences(doc_spacy)
         # start with first line
         # here you can limit the number of sentences to parse
-        newlist = []
-        max_i = len(text)
-        for i in range(0, max_i):
-            if tool == "transformers":
-                nlps = nlp_transformers(text[i])
-                doc = nlps
-            newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
-            newlist[i] = " ".join(newlist[i])
+        # newlist = []
+        # max_i = len(text)
+        # for i in range(0, max_i):
+        #     if tool == "transformers":
+        #         nlps = nlp_transformers(text[i])
+        #         doc = nlps
+        #     newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
+        #     newlist[i] = " ".join(newlist[i])
         # join the new and old lines for comparison
-        printout = "New: " + " ".join(newlist) + "\n"
-        printout = printout + "Old: " + " ".join(text[0:max_i])
-        write_file(printout, path_output + "/" + file)
+        # printout = "New: " + " ".join(newlist) + "\n"
+        # printout = printout + "Old: " + " ".join(text[0:max_i])
+        # write_file(printout, path_output + "/" + file)
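
A quick way to sanity-check the patched flow, as a minimal sketch rather than part of the patch itself: it assumes list_of_files() now takes only the input path (matching the updated call site in parse.py) and that the .eml fixtures referenced by the new path_input live under test/data/.

    from pathlib import Path
    from mailcom.inout import get_text, list_of_files

    # mirror the patched __main__ loop: collect the bundled test emails
    # and print each parsed body, as the added print(text) call does
    for eml in list_of_files(Path("./test/data/")):
        print(get_text(eml))  # returns email_content["content"], or nothing if parsing failed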