add test files to package, use parse to check accuracy

ssciwr · Sep 3, 2024 · cef58c7 · cef58c7
1 parent 44c2ebe
commit cef58c7
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 74 deletions.
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -1,5 +1,3 @@
-from email import policy
-from email.parser import BytesParser
 from pathlib import Path
 import os
 import eml_parser
@@ -37,57 +35,6 @@ def get_text(file):
     return(email_content["content"])
 
 
-def delete_header(text):
-    items_to_delete = [
-        "Von:",
-        "Gesendet:",
-        "Betreff:",
-        "An:",
-        "Cc:",
-        "Sujet :",
-        "Date :",
-        "De :",
-        "Pour :",
-        "Copie :",
-        "Mailbeispiel",
-        "Mailbeispil",
-        "transféré",
-        "Sent:",
-        "https:",
-        "Von meinem iPhone gesendet",
-        "Anfang der weitergeleiteten",
-    ]
-    lines_to_delete = []
-    text_out_list = text.splitlines()
-    for index, line in enumerate(text_out_list):
-        if any(i == "@" for i in line):
-            # print("Deleting: found @: {}".format(line))
-            lines_to_delete.append(index)
-        # elif any(i == ">" for i in line):
-        # lines_to_delete.append(index)
-        # print("Deleting: found >: {}".format(line))
-        elif any(i in line for i in items_to_delete):
-            lines_to_delete.append(index)
-            # print("Deleting {}".format(line))
-    # check if any lines have been found
-    if lines_to_delete:
-        # delete lines
-        for i in reversed(lines_to_delete):
-            # print("xxxxx {}".format(text_out_list[i]))
-            del text_out_list[i]
-    # reduce whitespace not to confuse spacy
-    # remove tabs and outer whitespace
-    text_out_list = [line.replace("\t", " ").strip() for line in text_out_list]
-    # remove hyphens - this is risky though -
-    text_out_list = [line.replace("-", " ").strip() for line in text_out_list]
-    text_out_list = [line.replace("_", " ").strip() for line in text_out_list]
-    # reduce whitespace to one
-    text_out_list = [" ".join(line.split()) for line in text_out_list]
-    # delete empty lines
-    text_out_list = [line for line in text_out_list if line]
-    return " ".join(text_out_list)
-
-
 def write_file(text, name):
     with open("{}.out".format(name), "w") as file:
         file.write(text)
diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -2,18 +2,18 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from .inout import get_text, delete_header, list_of_files, write_file
+from mailcom.inout import get_text, list_of_files, write_file
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
 # will also need pt
 lang = "es"
 # lang = "fr"
 # path where the input files can be found
-path_input = Path("./data/in/")
+path_input = Path("./test/data/")
 # path where the output files should be written to
 # this is generated if not present yet
-path_output = Path("./data/out/")
+path_output = Path("../data/out/")
 # the ner tool - currently only "transformers"
 tool = "transformers"
 # please do not modify below this section unless you know what you are doing
@@ -104,8 +104,8 @@ def make_dir(path: str):
 
 
 if __name__ == "__main__":
-    nlp_spacy = init_spacy(lang)
-    nlp_transformers = init_transformers()
+    # nlp_spacy = init_spacy(lang)
+    # nlp_transformers = init_transformers()
 
     # check that input dir is there
     if not check_dir(path_input):
@@ -116,27 +116,27 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    eml_files = list_of_files(path_input, "eml")
-    html_files = list_of_files(path_input, "html")
+    eml_files = list_of_files(path_input)
+    # html_files = list_of_files(path_input, "html")
     for file in eml_files:
         text = get_text(file)
+        print(text)
         # skip this text if email could not be parsed
         if not text:
             continue
-        text = delete_header(text)
-        doc_spacy = nlp_spacy(text)
-        text = get_sentences(doc_spacy)
+        # doc_spacy = nlp_spacy(text)
+        # text = get_sentences(doc_spacy)
         # start with first line
         # here you can limit the number of sentences to parse
-        newlist = []
-        max_i = len(text)
-        for i in range(0, max_i):
-            if tool == "transformers":
-                nlps = nlp_transformers(text[i])
-                doc = nlps
-            newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
-            newlist[i] = " ".join(newlist[i])
+        # newlist = []
+        # max_i = len(text)
+        # for i in range(0, max_i):
+        #     if tool == "transformers":
+        #         nlps = nlp_transformers(text[i])
+        #         doc = nlps
+        #     newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
+        #     newlist[i] = " ".join(newlist[i])
         # join the new and old lines for comparison
-        printout = "New: " + " ".join(newlist) + "\n"
-        printout = printout + "Old: " + " ".join(text[0:max_i])
-        write_file(printout, path_output + "/" + file)
+        # printout = "New: " + " ".join(newlist) + "\n"
+        # printout = printout + "Old: " + " ".join(text[0:max_i])
+        # write_file(printout, path_output + "/" + file)