Skip to content

Commit

Permalink
add test files to package, use parse to check accuracy
Browse files (browse the repository at this point in the history)
  • Loading branch information
iulusoy committed Sep 3, 2024
1 parent 44c2ebe commit cef58c7
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 74 deletions.
53 changes: 0 additions & 53 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from email import policy
from email.parser import BytesParser
from pathlib import Path
import os
import eml_parser
Expand Down Expand Up @@ -37,57 +35,6 @@ def get_text(file):
return(email_content["content"])

Check warning on line 35 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L35

Added line #L35 was not covered by tests


def delete_header(text):
    """Strip header-like lines from an email body and flatten it to one line.

    A line is dropped when it contains an ``@`` character or any of the
    marker substrings in ``items_to_delete`` (mail header labels, forwarding
    notices, links, ...). In the remaining lines tabs, hyphens and
    underscores are replaced by spaces, runs of whitespace are collapsed to
    a single space, and empty lines are discarded.

    Args:
        text: The raw email text, possibly spanning several lines.

    Returns:
        The surviving lines joined by single spaces; an empty string when
        nothing is left.
    """
    items_to_delete = [
        "Von:",
        "Gesendet:",
        "Betreff:",
        "An:",
        "Cc:",
        "Sujet :",
        "Date :",
        "De :",
        "Pour :",
        "Copie :",
        "Mailbeispiel",
        "Mailbeispil",
        "transféré",
        "Sent:",
        "https:",
        "Von meinem iPhone gesendet",
        "Anfang der weitergeleiteten",
    ]
    # Keep only lines without an "@" and without any of the marker substrings.
    kept_lines = [
        line
        for line in text.splitlines()
        if "@" not in line
        and not any(item in line for item in items_to_delete)
    ]
    # Reduce whitespace so that spaCy is not confused. Replacing hyphens and
    # underscores is risky, because it also splits hyphenated words.
    separators = str.maketrans({"\t": " ", "-": " ", "_": " "})
    # split()/join trims the outer whitespace and collapses inner runs.
    cleaned = (
        " ".join(line.translate(separators).split()) for line in kept_lines
    )
    # Drop lines that ended up empty and return everything as one line.
    return " ".join(line for line in cleaned if line)


def write_file(text, name):
    """Write *text* to the file ``<name>.out``, replacing any existing file.

    Args:
        text: The string to write.
        name: Output path without extension; ``.out`` is appended.
    """
    # The processed mails contain non-ASCII characters (German/French text),
    # so fix the encoding instead of relying on the platform default.
    with open("{}.out".format(name), "w", encoding="utf-8") as file:
        file.write(text)
42 changes: 21 additions & 21 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from .inout import get_text, delete_header, list_of_files, write_file
from mailcom.inout import get_text, list_of_files, write_file

# please modify this section depending on your setup
# input language - either "es" or "fr"
# will also need pt
lang = "es"
# lang = "fr"
# path where the input files can be found
path_input = Path("./data/in/")
path_input = Path("./test/data/")
# path where the output files should be written to
# this is generated if not present yet
path_output = Path("./data/out/")
path_output = Path("../data/out/")
# the ner tool - currently only "transformers"
tool = "transformers"
# please do not modify below this section unless you know what you are doing
Expand Down Expand Up @@ -104,8 +104,8 @@ def make_dir(path: str):


if __name__ == "__main__":
nlp_spacy = init_spacy(lang)
nlp_transformers = init_transformers()
# nlp_spacy = init_spacy(lang)
# nlp_transformers = init_transformers()

# check that input dir is there
if not check_dir(path_input):
Expand All @@ -116,27 +116,27 @@ def make_dir(path: str):
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input, "eml")
html_files = list_of_files(path_input, "html")
eml_files = list_of_files(path_input)

Check warning on line 119 in mailcom/parse.py

View check run for this annotation

Codecov / codecov/patch

mailcom/parse.py#L119

Added line #L119 was not covered by tests
# html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
print(text)

Check warning on line 123 in mailcom/parse.py

View check run for this annotation

Codecov / codecov/patch

mailcom/parse.py#L123

Added line #L123 was not covered by tests
# skip this text if email could not be parsed
if not text:
continue
text = delete_header(text)
doc_spacy = nlp_spacy(text)
text = get_sentences(doc_spacy)
# doc_spacy = nlp_spacy(text)
# text = get_sentences(doc_spacy)
# start with first line
# here you can limit the number of sentences to parse
newlist = []
max_i = len(text)
for i in range(0, max_i):
if tool == "transformers":
nlps = nlp_transformers(text[i])
doc = nlps
newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
newlist[i] = " ".join(newlist[i])
# newlist = []
# max_i = len(text)
# for i in range(0, max_i):
# if tool == "transformers":
# nlps = nlp_transformers(text[i])
# doc = nlps
# newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
# newlist[i] = " ".join(newlist[i])
# join the new and old lines for comparison
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
write_file(printout, path_output + "/" + file)
# printout = "New: " + " ".join(newlist) + "\n"
# printout = printout + "Old: " + " ".join(text[0:max_i])
# write_file(printout, path_output + "/" + file)

0 comments on commit cef58c7

Please sign in to comment.