Skip to content

Commit

Permalink
changed inout to a class structure (#33)
Browse files Browse the repository at this point in the history
* changed inout to a class structure

* edited tests

* use an instance of the class for a better interface

* update parse to run with the InoutHandler class

---------

Co-authored-by: Inga Ulusoy <[email protected]>
  • Loading branch information
Olthoff231381 and iulusoy authored Sep 23, 2024
1 parent b67c25f commit 95c4add
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 100 deletions.
129 changes: 69 additions & 60 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,78 @@
import eml_parser
from bs4 import BeautifulSoup

def list_of_files(directory_name: str) -> list[Path]:
    """Create a list of the email files that are present in a directory.

    The directory is searched recursively. Only regular files whose suffix is
    ``.eml`` or ``.html`` are returned; a directory that merely carries one of
    these suffixes is skipped because it cannot be read as an email.

    Args:
        directory_name (str): The directory where the files are located.

    Returns:
        list[Path]: Resolved Path objects that represent the files in the
            directory.

    Raises:
        OSError: If ``directory_name`` does not exist.
        ValueError: If no ``.eml`` or ``.html`` file is found.
    """
    if not os.path.exists(directory_name):
        raise OSError("Path {} does not exist".format(directory_name))
    mypath = Path(directory_name)
    # the file types are fixed on purpose and not changed through user input
    pattern = (".eml", ".html")
    email_list = [
        mp.resolve()
        for mp in mypath.glob("**/*")
        if mp.suffix in pattern and mp.is_file()
    ]
    if not email_list:
        raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
    return email_list
class InoutHandler:
    """Collect the email files (``.eml`` / ``.html``) found in a directory."""

    def __init__(self, directory_name: str):
        """Constructor for the InoutHandler class.

        Args:
            directory_name (str): The directory where the files are located.
                The value is stored unchanged and only handed to
                ``os.path.exists`` and ``Path``, so path-like objects work
                as well.
        """
        self.directory_name = directory_name
        # presets
        self.pattern = [".eml", ".html"]
        # filled by list_of_files(); defined here so that the attribute
        # exists even before the directory has been scanned
        self.email_list = []

    def list_of_files(self):
        """Method to create a list of Path objects (files) that are present
        in a directory.

        The directory is searched recursively and the result is stored in
        ``self.email_list``. Only regular files with a suffix listed in
        ``self.pattern`` are kept; directories that merely carry such a
        suffix are skipped.

        Raises:
            OSError: If ``self.directory_name`` does not exist.
            ValueError: If no matching file is found.
        """
        if not os.path.exists(self.directory_name):
            raise OSError("Path {} does not exist".format(self.directory_name))
        mypath = Path(self.directory_name)
        self.email_list = [
            mp.resolve()
            for mp in mypath.glob("**/*")
            if mp.suffix in self.pattern and mp.is_file()
        ]
        if not self.email_list:
            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))

def get_html_text(text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.
def get_html_text(self, text_check: str) -> str:
    """Strip html markup from a string, if there is any.

    Args:
        text_check (str): The string that may contain html content.

    Returns:
        str: The text content without markup if at least one html tag was
            found, otherwise the input string unchanged.
    """
    parsed = BeautifulSoup(text_check , 'html.parser')
    # find() without arguments yields the first tag, or None for plain text
    return parsed.get_text() if parsed.find() else text_check

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Returns:
str: The (potentially) cleaned up string."""
soup = BeautifulSoup(text_check , 'html.parser')
if soup.find():
text_check = soup.get_text()
return text_check
Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(self.email_content["content"])

def get_text(file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
# Placeholder: no validation is implemented yet, the method does nothing.
def validate_data(self):
pass

Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(email_content["content"])
# Placeholder: no xml export is implemented yet, the method does nothing.
def data_to_xml(self):
pass

def write_file(self, text: str, name: str)-> None:
"""Write the extracted string to a text file.
def write_file(text: str, name: str)-> None:
    """Write the extracted string to a text file.

    The file is always written as UTF-8, so that accented characters in the
    email text do not depend on the default encoding of the platform.

    Args:
        text (str): The string to be written to the file.
        name (str): The name of the file to be written; the suffix ``.out``
            is appended to it.
    """
    with open("{}.out".format(name), "w", encoding="utf-8") as file:
        file.write(text)
Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)
14 changes: 9 additions & 5 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from mailcom.inout import get_text, list_of_files, get_html_text
from mailcom.inout import InoutHandler

# please modify this section depending on your setup
# input language - either "es" or "fr"
Expand Down Expand Up @@ -116,12 +116,16 @@ def make_dir(path: str):
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input)
io = InoutHandler(path_input)
io.list_of_files()
# html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
text = get_html_text(text)
for file in io.email_list:
text = io.get_text(file)
text = io.get_html_text(text)
print(text)
print(io.email_content["date"])
print(io.email_content["attachment"])
print(io.email_content["attachement type"])
# skip this text if email could not be parsed
if not text:
continue
Expand Down
65 changes: 30 additions & 35 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,51 @@
from mailcom.inout import list_of_files, get_text, get_html_text
from mailcom import inout
import pytest
from pathlib import Path
from importlib import resources
import datetime

pkg = resources.files("mailcom")

FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")

TEXT_REF = "J'espère que tu vas bien!"

def test_list_of_files_found(tmp_path):
p = tmp_path / "test.eml"
p.write_text("test")
assert len(list_of_files(tmp_path)) != 0
@pytest.fixture()
def get_instant(tmp_path):
# Provide an InoutHandler that points at pytest's tmp_path directory.
return inout.InoutHandler(tmp_path)

def test_list_of_files_empty(tmp_path):
def test_list_of_files(get_instant):
with pytest.raises(ValueError):
list_of_files(tmp_path)

def test_list_of_files_dir_not_existing():
with pytest.raises(OSError):
list_of_files("nonexistingDir")

def test_list_of_files_correct_format(tmp_path):
p = tmp_path / "test.eml"
get_instant.list_of_files()
p = get_instant.directory_name / "test.eml"
p.write_text("test")
p = tmp_path / "test2.html"
get_instant.list_of_files()
assert len(get_instant.email_list) != 0
get_instant2 = inout.InoutHandler("nonexistingDir")
with pytest.raises(OSError):
get_instant2.list_of_files()
p = get_instant.directory_name / "test2.html"
p.write_text("test2")
p = tmp_path / "test3.xml"
p = get_instant.directory_name / "test3.xml"
p.write_text("test3")
assert tmp_path / "test3.xml" not in list_of_files(tmp_path)
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list

def test_get_text(tmp_path):
p = tmp_path / "test.eml"
def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
assert get_text(p) == 'test'
text = get_text(FILE_PATH)
print(text[0:25])
extracted_text = get_instant.get_text(p)
assert extracted_text == 'test'
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF

def test_get_text_err():
assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
with pytest.raises(OSError):
list_of_files("nonexistingDir")
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")

def test_get_html_text():
def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_html_text(html) == 'Test'

def test_get_html_text_noHtml():
assert get_instant.get_html_text(html) == 'Test'
noHtml = """Test"""
assert get_html_text(noHtml) == 'Test'

def test_get_text_no_file(tmp_path):
p = tmp_path / "test.eml"
with pytest.raises(OSError):
get_text(p)
assert get_instant.get_html_text(noHtml) == 'Test'

0 comments on commit 95c4add

Please sign in to comment.