Merge pull request #29 from yobix-ai/3-return-metadata-with-extraction-result

Change extractor API to return tuple of result and metadata
yobix-ai · Nov 17, 2024 · 6ee0cd6 · 6ee0cd6
2 parents ef326fa + 5e32273
commit 6ee0cd6
Show file tree

Hide file tree

Showing 19 changed files with 426 additions and 350 deletions.
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
@@ -136,32 +136,16 @@ impl Extractor {
         Ok(Self(inner))
     }
 
-    /// Extracts text from a file path. Returns a stream of the extracted text
-    /// the stream is decoded using the extractor's `encoding`
-    pub fn extract_file(&self, filename: &str) -> PyResult<StreamReader> {
-        let reader = self
-            .0
-            .extract_file(filename)
-            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
-
-        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
-        Ok(StreamReader {
-            reader,
-            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
-            py_bytes: None,
-        })
-    }
-
     /// Extracts text from a file path. Returns a tuple with stream of the extracted text
     /// the stream is decoded using the extractor's `encoding` and tika metadata.
-    pub fn extract_file_with_metadata<'py>(
+    pub fn extract_file<'py>(
         &self,
         filename: &str,
         py: Python<'py>,
     ) -> PyResult<(StreamReader, PyObject)> {
         let (reader, metadata) = self
             .0
-            .extract_file_with_metadata(filename)
+            .extract_file(filename)
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
 
         // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
@@ -176,58 +160,17 @@ impl Extractor {
         ))
     }
 
-    /// Extracts text from a file path. Returns a string that is of maximum length
-    /// of the extractor's `extract_string_max_length`
-    pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
-        self.0
-            .extract_file_to_string(filename)
-            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
-    }
-
-    /// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length
-    /// of the extractor's `extract_string_max_length` and the metadata.
-    pub fn extract_file_to_string_with_metadata<'py>(
-        &self,
-        filename: &str,
-        py: Python<'py>,
-    ) -> PyResult<(String, PyObject)> {
-        let (content, metadata) = self
-            .0
-            .extract_file_to_string_with_metadata(filename)
-            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
-
-        let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
-        Ok((content, py_metadata.into()))
-    }
-
-    /// Extracts text from a bytearray. Returns a stream of the extracted text
-    /// the stream is decoded using the extractor's `encoding`
-    pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
-        let slice = buffer.to_vec();
-        let reader = self
-            .0
-            .extract_bytes(&slice)
-            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
-
-        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
-        Ok(StreamReader {
-            reader,
-            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
-            py_bytes: None,
-        })
-    }
-
     /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text
     /// the stream is decoded using the extractor's `encoding` and tika metadata.
-    pub fn extract_bytes_with_metadata<'py>(
+    pub fn extract_bytes<'py>(
         &self,
         buffer: &Bound<'_, PyByteArray>,
         py: Python<'py>,
     ) -> PyResult<(StreamReader, PyObject)> {
         let slice = buffer.to_vec();
         let (reader, metadata) = self
             .0
-            .extract_bytes_with_metadata(&slice)
+            .extract_bytes(&slice)
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
 
         // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
@@ -242,32 +185,16 @@ impl Extractor {
         ))
     }
 
-    /// Extracts text from a url. Returns a string that is of maximum length
-    /// of the extractor's `extract_string_max_length`
-    pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
-        let reader = self
-            .0
-            .extract_url(&url)
-            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
-
-        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
-        Ok(StreamReader {
-            reader,
-            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
-            py_bytes: None,
-        })
-    }
-
     /// Extracts text from a url. Returns a tuple with stream of the extracted text
     /// the stream is decoded using the extractor's `encoding` and tika metadata.
-    pub fn extract_url_with_metadata<'py>(
+    pub fn extract_url<'py>(
         &self,
         url: &str,
         py: Python<'py>,
     ) -> PyResult<(StreamReader, PyObject)> {
         let (reader, metadata) = self
             .0
-            .extract_url_with_metadata(&url)
+            .extract_url(&url)
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
 
         // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
@@ -282,6 +209,56 @@ impl Extractor {
         ))
     }
 
+
+    /// Extracts text from a file path. Returns a tuple with string that is of maximum length
+    /// of the extractor's `extract_string_max_length` and the metadata as dict.
+    pub fn extract_file_to_string<'py>(
+        &self,
+        filename: &str,
+        py: Python<'py>,
+    ) -> PyResult<(String, PyObject)> {
+        let (content, metadata) = self
+            .0
+            .extract_file_to_string(filename)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+        Ok((content, py_metadata.into()))
+    }
+
+    /// Extracts text from a bytearray. Returns a tuple with string that is of maximum length
+    /// of the extractor's `extract_string_max_length` and the metadata as dict.
+    pub fn extract_bytes_to_string<'py>(
+        &self,
+        buffer: &Bound<'_, PyByteArray>,
+        py: Python<'py>,
+    ) -> PyResult<(String, PyObject)> {
+        let (content, metadata) = self
+            .0
+            .extract_bytes_to_string(&buffer.to_vec())
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Convert the extracted metadata to a Python dict via `metadata_hashmap_to_pydict`
+        let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+        Ok((content, py_metadata.into()))
+    }
+
+    /// Extracts text from a URL. Returns a tuple with string that is of maximum length
+    /// of the extractor's `extract_string_max_length` and the metadata as dict.
+    pub fn extract_url_to_string<'py>(
+        &self,
+        url: &str,
+        py: Python<'py>,
+    ) -> PyResult<(String, PyObject)> {
+        let (content, metadata) = self
+            .0
+            .extract_url_to_string(url)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
+        Ok((content, py_metadata.into()))
+    }
+
     fn __repr__(&self) -> String {
         format!("{:?}", self.0)
     }

diff --git a/...hon/tests/test_extract_bytes_to_stream.py → ...actous-python/tests/test_extract_bytes.py b/...hon/tests/test_extract_bytes_to_stream.py → ...actous-python/tests/test_extract_bytes.py
@@ -2,7 +2,8 @@
 import pytest
 
 from extractous import Extractor
-from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
+from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
+    is_expected_metadata_contained
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -21,50 +22,58 @@
 
 
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
-def test_extract_bytes_to_stream(file_name, target_dist):
+def test_extract_bytes_to_string(file_name, target_dist):
     """Test the extraction from bytes of various file types."""
     original_filepath = f"../../test_files/documents/{file_name}"
     expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+
+    # Read expected
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
 
+    # Extract
     file_bytes = read_file_to_bytearray(original_filepath)
 
     extractor = Extractor()
-    reader = extractor.extract_bytes(file_bytes)
-    result = read_to_string(reader)
+    result, metadata = extractor.extract_bytes_to_string(file_bytes)
 
-    # Expected
-    with open(expected_result_filepath, "r",  encoding="utf8") as file:
-        expected = file.read()
-
+    # Check Expected
     assert cosine_similarity(result, expected) > target_dist, \
         f"Cosine similarity is less than {target_dist} for file: {file_name}"
 
+    # Check metadata
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > target_dist, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
 
-TEST_CASES_METADATA = [
-    ("2022_Q3_AAPL.pdf", 0.9),
-    ("science-exploration-1p.pptx", 0.9),
-    ("simple.odt", 0.9),
-    ("table-multi-row-column-cells-actual.csv", 0.6),
-    ("vodafone.xlsx", 0.8),
-    ("category-level.docx", 0.9),
-    ("simple.doc", 0.9),
-    ("simple.pptx", 0.9),
-    ("table-multi-row-column-cells.png", 0.9),
-    ("winter-sports.epub", 0.8),
-    ("bug_16.docx", 0.9),
-]
-
-
-@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
-def test_extract_bytes_to_stream(file_name, similarity_percent):
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_bytes_to_stream(file_name, target_dist):
     """Test the extraction from bytes of various file types."""
     original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
     expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
-    file_bytes = read_file_to_bytearray(original_filepath)
-    extractor = Extractor()
-    _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
+
+    # Read expected
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
     with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
         expected_metadata = json.load(file)
+
+    # Extract
+    file_bytes = read_file_to_bytearray(original_filepath)
+
+    extractor = Extractor()
+    reader, metadata = extractor.extract_bytes(file_bytes)
+    result = read_to_string(reader)
+
+    # Check Expected
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
+    # Check metadata
     percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
-    assert percent_similarity > similarity_percent, \
+    assert percent_similarity > target_dist, \
         f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
diff --git a/...thon/tests/test_extract_file_to_string.py → ...ractous-python/tests/test_extract_file.py b/...thon/tests/test_extract_file_to_string.py → ...ractous-python/tests/test_extract_file.py
@@ -2,7 +2,7 @@
 import pytest
 
 from extractous import Extractor
-from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained
+from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -19,44 +19,55 @@
     #("eng-ocr.pdf", 0.9),
 ]
 
-
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
 def test_extract_file_to_string(file_name, target_dist):
     """Test the extraction and comparison of various file types."""
     original_filepath = f"../../test_files/documents/{file_name}"
     expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
-    extractor = Extractor()
-    result = extractor.extract_file_to_string(original_filepath)
+    expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
+
+    # Read expected
     with open(expected_result_filepath, "r",  encoding="utf8") as file:
         expected = file.read()
-
+    with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
+        expected_metadata = json.load(file)
+
+    # Extract
+    extractor = Extractor()
+    result, metadata = extractor.extract_file_to_string(original_filepath)
+
+    # Check extracted
     assert cosine_similarity(result, expected) > target_dist, \
         f"Cosine similarity is less than {target_dist} for file: {file_name}"
 
+    # Check metadata
+    #metadata.pop("dc:format")
+    assert is_expected_metadata_contained(expected_metadata, metadata)
 
-TEST_CASES_METADATA = [
-    "2022_Q3_AAPL.pdf",
-    "science-exploration-1p.pptx",
-    "simple.odt",
-    "table-multi-row-column-cells-actual.csv",
-    "vodafone.xlsx",
-    "category-level.docx",
-    "simple.doc",
-    "simple.pptx",
-    "table-multi-row-column-cells.png",
-    "winter-sports.epub",
-    "bug_16.docx",
-]
 
-@pytest.mark.parametrize("file_name", TEST_CASES_METADATA)
-def test_extract_file_to_string_with_metadata(file_name):
-    """Test the extraction and comparison of various file types."""
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_file_to_stream(file_name, target_dist):
+    """Test the stream extraction from a file path of various file types."""
     original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
     expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
-    extractor = Extractor()
-    _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
+
+    # Read expected
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
     with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
         expected_metadata = json.load(file)
 
-    #metadata.pop("dc:format")
-    assert is_expected_metadata_contained(expected_metadata, metadata)
+    # Extract
+    extractor = Extractor()
+    reader, metadata = extractor.extract_file(original_filepath)
+    result = read_to_string(reader)
+
+    # Check Expected
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
+    # Check metadata
+    percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
+    assert percent_similarity > target_dist, \
+        f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"