changed inout to a class structure (#33)

* changed inout to a class structure * edited tests * use an instance of the class for a better interface * update parse to run with the InoutHandler class --------- Co-authored-by: Inga Ulusoy <[email protected]>
ssciwr · Sep 23, 2024 · 95c4add · 95c4add
1 parent b67c25f
commit 95c4add
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 100 deletions.
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -3,69 +3,78 @@
 import eml_parser
 from bs4 import BeautifulSoup
 
-def list_of_files(directory_name: str) -> list[Path]:
-    """Function to create a list of files that are present in a directory as path objects.
-    
-    Args: 
-        directory_name (str): The directory where the files are located.
-    
-    Returns:
-        list[Path]: A list of Path objects that represent the files in the directory."""
-    if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
-        raise OSError("Path {} does not exist".format(directory_name))
-    mypath = Path(directory_name)
-    pattern = [".eml", ".html"]  # we would not change the file type through user input
-    email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
-    if len(email_list) == 0:
-        raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
-    return email_list
+class InoutHandler:
+    def __init__(self, directory_name: str):
+        """Constructor for the InoutHandler class.
+        
+        Args: 
+            directory_name (str): The directory where the files are located.
+        """        
+        self.directory_name = directory_name
+        # presets
+        self.pattern = [".eml", ".html"]
+
+    def list_of_files(self):
+        """Method to create a list of Path objects (files) that are present 
+        in a directory."""
+        if not os.path.exists(self.directory_name):  # check if given dir exists raises error otherwise
+            raise OSError("Path {} does not exist".format(self.directory_name))
+        mypath = Path(self.directory_name)
+        self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
+        if len(self.email_list) == 0:
+            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
 
-def get_html_text(text_check: str) -> str:
-    """Clean up a string if it contains html content.
-    Args:
-        text_check (str): The string that may contain html content.
+    def get_html_text(self, text_check: str) -> str:
+        """Clean up a string if it contains html content.
+        Args:
+            text_check (str): The string that may contain html content.
+            
+        Returns:
+            str: The (potentially) cleaned up string."""
+        soup = BeautifulSoup(text_check , 'html.parser')
+        if soup.find():
+            text_check = soup.get_text()
+        return text_check
+
+    def get_text(self, file: Path) -> str:
+        """Function to extract the textual content and other metadata from an email file.
         
-    Returns:
-        str: The (potentially) cleaned up string."""
-    soup = BeautifulSoup(text_check , 'html.parser')
-    if soup.find():
-        text_check = soup.get_text()
-    return text_check
+        Args:
+            file (Path): The path to the email file.
+            
+        Returns:
+            str: The textual content of the email. In the future, this will return the 
+            complete dictionary with the metadata."""
+        if not file.is_file(): # check if given file exists raises error otherwise
+            raise OSError("File {} does not exist".format(file))
+        with open(file, 'rb') as fhdl:
+            raw_email = fhdl.read()
+        ep = eml_parser.EmlParser(include_raw_body=True)
+        parsed_eml = ep.decode_email_bytes(raw_email)
+        attachmenttypes = []
+        # find if there are any attachements, and if yes, how many
+        attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
+        # find the types of attachements
+        if attachments > 0:
+            attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
+        self.email_content = {"content": parsed_eml["body"][0]["content"], 
+                    "date": parsed_eml["header"]["date"], 
+                    "attachment": attachments, 
+                    "attachement type": attachmenttypes
+                    }
+        return(self.email_content["content"])
 
-def get_text(file: Path) -> str:
-    """Function to extract the textual content and other metadata from an email file.
+    def validate_data(self):
+        pass
 
-    Args:
-        file (Path): The path to the email file.
-        
-    Returns:
-        str: The textual content of the email. In the future, this will return the 
-        complete dictionary with the metadata."""
-    if not file.is_file(): # check if given file exists raises error otherwise
-        raise OSError("File {} does not exist".format(file))
-    with open(file, 'rb') as fhdl:
-        raw_email = fhdl.read()
-    ep = eml_parser.EmlParser(include_raw_body=True)
-    parsed_eml = ep.decode_email_bytes(raw_email)
-    attachmenttypes = []
-    # find if there are any attachements, and if yes, how many
-    attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
-    # find the types of attachements
-    if attachments > 0:
-        attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-    email_content = {"content": parsed_eml["body"][0]["content"], 
-                 "date": parsed_eml["header"]["date"], 
-                 "attachment": attachments, 
-                 "attachement type": attachmenttypes
-                 }
-    return(email_content["content"])
+    def data_to_xml(self):
+        pass
 
+    def write_file(self, text: str, name: str)-> None:
+        """Write the extracted string to a text file.
 
-def write_file(text: str, name: str)-> None:
-    """Write the extracted string to a text file.
-    
-    Args:
-        text (str): The string to be written to the file.
-        name (str): The name of the file to be written."""
-    with open("{}.out".format(name), "w") as file:
-        file.write(text)
+        Args:
+            text (str): The string to be written to the file.
+            name (str): The name of the file to be written."""
+        with open("{}.out".format(name), "w") as file:
+            file.write(text)
diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -2,7 +2,7 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from mailcom.inout import get_text, list_of_files, get_html_text
+from mailcom.inout import InoutHandler
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -116,12 +116,16 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    eml_files = list_of_files(path_input)
+    io = InoutHandler(path_input)
+    io.list_of_files()
     # html_files = list_of_files(path_input, "html")
-    for file in eml_files:
-        text = get_text(file)
-        text = get_html_text(text)
+    for file in io.email_list:
+        text = io.get_text(file)
+        text = io.get_html_text(text)
         print(text)
+        print(io.email_content["date"])
+        print(io.email_content["attachment"])
+        print(io.email_content["attachement type"])
         # skip this text if email could not be parsed
         if not text:
             continue

diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py
@@ -1,56 +1,51 @@
-from mailcom.inout import list_of_files, get_text, get_html_text
+from mailcom import inout
 import pytest
 from pathlib import Path
 from importlib import resources
+import datetime
 
 pkg = resources.files("mailcom")
 
 FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")
+
 TEXT_REF = "J'espère que tu vas bien!"
 
-def test_list_of_files_found(tmp_path):
-    p = tmp_path / "test.eml"
-    p.write_text("test")
-    assert len(list_of_files(tmp_path)) != 0
+@pytest.fixture()
+def get_instant(tmp_path):
+    return inout.InoutHandler(tmp_path)
 
-def test_list_of_files_empty(tmp_path):
+def test_list_of_files(get_instant):
     with pytest.raises(ValueError):
-        list_of_files(tmp_path)
-
-def test_list_of_files_dir_not_existing():
-    with pytest.raises(OSError):
-        list_of_files("nonexistingDir")
-
-def test_list_of_files_correct_format(tmp_path):
-    p = tmp_path / "test.eml"
+        get_instant.list_of_files()
+    p = get_instant.directory_name / "test.eml"
     p.write_text("test")
-    p = tmp_path / "test2.html"
+    get_instant.list_of_files()
+    assert len(get_instant.email_list) != 0
+    get_instant2 = inout.InoutHandler("nonexistingDir")
+    with pytest.raises(OSError):
+        get_instant2.list_of_files()
+    p = get_instant.directory_name / "test2.html"
     p.write_text("test2")
-    p = tmp_path / "test3.xml"
+    p = get_instant.directory_name / "test3.xml"
     p.write_text("test3")
-    assert tmp_path / "test3.xml" not in list_of_files(tmp_path)
+    get_instant.list_of_files()
+    assert get_instant.directory_name / "test3.xml" not in get_instant.email_list
 
-def test_get_text(tmp_path):
-    p = tmp_path / "test.eml"
+def test_get_text(get_instant):
+    p = get_instant.directory_name / "test.eml"
     p.write_text("test")
-    assert get_text(p) == 'test'
-    text = get_text(FILE_PATH)
-    print(text[0:25])
+    extracted_text = get_instant.get_text(p)
+    assert extracted_text == 'test'
+    text = get_instant.get_text(FILE_PATH)
     assert text[0:25] == TEXT_REF
-
-def test_get_text_err():
+    assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
+    assert get_instant.email_content["attachment"] == 2
+    assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
     with pytest.raises(OSError):
-        list_of_files("nonexistingDir")
+        get_instant.get_text(get_instant.directory_name / "nonexisting.eml")
 
-def test_get_html_text():
+def test_get_html_text(get_instant):
     html = """<html><head><title>Test</title></head></html>"""
-    assert get_html_text(html) == 'Test'
-
-def test_get_html_text_noHtml():
+    assert get_instant.get_html_text(html) == 'Test'
     noHtml = """Test"""
-    assert get_html_text(noHtml) == 'Test'
-
-def test_get_text_no_file(tmp_path):
-    p = tmp_path / "test.eml"
-    with pytest.raises(OSError):
-        get_text(p)
+    assert get_instant.get_html_text(noHtml) == 'Test'