From 69c4317b478e39010b2368431b6950a5bb01c89f Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:09:59 +0100 Subject: [PATCH 1/4] feat!: change extractor api to return tuple of result and metadata --- bindings/extractous-python/src/extractor.rs | 73 +-------- .../tests/test_extract_bytes_to_stream.py | 44 ++---- .../tests/test_extract_file_to_string.py | 43 ++---- .../tests/test_extract_url.py | 6 +- bindings/extractous-python/tests/test_ocr.py | 6 +- bindings/extractous-python/tests/test_pdf.py | 6 +- extractous-core/Cargo.toml | 3 +- extractous-core/examples/extract_to_stream.rs | 2 +- extractous-core/examples/extract_to_string.rs | 2 +- extractous-core/src/errors.rs | 2 +- extractous-core/src/extractor.rs | 122 +++------------ extractous-core/src/lib.rs | 5 +- extractous-core/src/test_utils.rs | 8 + extractous-core/src/tika/jni_utils.rs | 2 +- extractous-core/src/tika/parse.rs | 140 ++++++++++++++---- extractous-core/src/tika/wrappers.rs | 6 +- .../tests/extract_to_stream_tests.rs | 39 +---- .../tests/extract_to_string_tests.rs | 38 +---- .../main/java/ai/yobix/TikaNativeMain.java | 74 ++++++++- .../META-INF/native-image/jni-config.json | 22 ++- 20 files changed, 293 insertions(+), 350 deletions(-) diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index f1bf357..965510e 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -136,32 +136,16 @@ impl Extractor { Ok(Self(inner)) } - /// Extracts text from a file path. 
Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_file(&self, filename: &str) -> PyResult { - let reader = self - .0 - .extract_file(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a file path. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. - pub fn extract_file_with_metadata<'py>( + pub fn extract_file<'py>( &self, filename: &str, py: Python<'py>, ) -> PyResult<(StreamReader, PyObject)> { let (reader, metadata) = self .0 - .extract_file_with_metadata(filename) + .extract_file(filename) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes @@ -176,50 +160,25 @@ impl Extractor { )) } - /// Extracts text from a file path. Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_file_to_string(&self, filename: &str) -> PyResult { - self.0 - .extract_file_to_string(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e))) - } - /// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length /// of the extractor's `extract_string_max_length` and the metadata. 
- pub fn extract_file_to_string_with_metadata<'py>( + pub fn extract_file_to_string<'py>( &self, filename: &str, py: Python<'py>, ) -> PyResult<(String, PyObject)> { let (content, metadata) = self .0 - .extract_file_to_string_with_metadata(filename) + .extract_file_to_string(filename) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; Ok((content, py_metadata.into())) } - /// Extracts text from a bytearray. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult { - let slice = buffer.to_vec(); - let reader = self - .0 - .extract_bytes(&slice) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. - pub fn extract_bytes_with_metadata<'py>( + pub fn extract_bytes<'py>( &self, buffer: &Bound<'_, PyByteArray>, py: Python<'py>, @@ -227,7 +186,7 @@ impl Extractor { let slice = buffer.to_vec(); let (reader, metadata) = self .0 - .extract_bytes_with_metadata(&slice) + .extract_bytes(&slice) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes @@ -242,32 +201,16 @@ impl Extractor { )) } - /// Extracts text from a url. 
Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_url(&self, url: &str) -> PyResult { - let reader = self - .0 - .extract_url(&url) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a url. Returns a tuple with string that is of maximum length /// of the extractor's `extract_string_max_length` and tika metdata. - pub fn extract_url_with_metadata<'py>( + pub fn extract_url<'py>( &self, url: &str, py: Python<'py>, ) -> PyResult<(StreamReader, PyObject)> { let (reader, metadata) = self .0 - .extract_url_with_metadata(&url) + .extract_url(&url) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py index 67033ca..87f0872 100644 --- a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py +++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py @@ -25,46 +25,26 @@ def test_extract_bytes_to_stream(file_name, target_dist): """Test the extraction from bytes of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract file_bytes = read_file_to_bytearray(original_filepath) extractor = 
Extractor() - reader = extractor.extract_bytes(file_bytes) + reader, metadata = extractor.extract_bytes(file_bytes) result = read_to_string(reader) - # Expected - with open(expected_result_filepath, "r", encoding="utf8") as file: - expected = file.read() - + # Check Expected assert cosine_similarity(result, expected) > target_dist, \ f"Cosine similarity is less than {target_dist} for file: {file_name}" - -TEST_CASES_METADATA = [ - ("2022_Q3_AAPL.pdf", 0.9), - ("science-exploration-1p.pptx", 0.9), - ("simple.odt", 0.9), - ("table-multi-row-column-cells-actual.csv", 0.6), - ("vodafone.xlsx", 0.8), - ("category-level.docx", 0.9), - ("simple.doc", 0.9), - ("simple.pptx", 0.9), - ("table-multi-row-column-cells.png", 0.9), - ("winter-sports.epub", 0.8), - ("bug_16.docx", 0.9), -] - - -@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA) -def test_extract_bytes_to_stream(file_name, similarity_percent): - """Test the extraction from bytes of various file types.""" - original_filepath = f"../../test_files/documents/{file_name}" - expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" - file_bytes = read_file_to_bytearray(original_filepath) - extractor = Extractor() - _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes) - with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: - expected_metadata = json.load(file) + # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) - assert percent_similarity > similarity_percent, \ + assert percent_similarity > target_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py index d538133..4b1aa89 100644 --- a/bindings/extractous-python/tests/test_extract_file_to_string.py +++ b/bindings/extractous-python/tests/test_extract_file_to_string.py @@ -19,44 +19,27 @@ #("eng-ocr.pdf", 0.9), ] - @pytest.mark.parametrize("file_name, target_dist", TEST_CASES) def test_extract_file_to_string(file_name, target_dist): """Test the extraction and comparison of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" - extractor = Extractor() - result = extractor.extract_file_to_string(original_filepath) + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected with open(expected_result_filepath, "r", encoding="utf8") as file: expected = file.read() - - assert cosine_similarity(result, expected) > target_dist, \ - f"Cosine similarity is less than {target_dist} for file: {file_name}" - - -TEST_CASES_METADATA = [ - "2022_Q3_AAPL.pdf", - "science-exploration-1p.pptx", - "simple.odt", - "table-multi-row-column-cells-actual.csv", - "vodafone.xlsx", - "category-level.docx", - "simple.doc", - "simple.pptx", - "table-multi-row-column-cells.png", - "winter-sports.epub", - "bug_16.docx", -] - -@pytest.mark.parametrize("file_name", TEST_CASES_METADATA) -def test_extract_file_to_string_with_metadata(file_name): - """Test the extraction and comparison of various file types.""" - original_filepath = f"../../test_files/documents/{file_name}" - expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" - extractor = Extractor() - _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath) with 
open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: expected_metadata = json.load(file) + # Extract + extractor = Extractor() + result, metadata = extractor.extract_file_to_string(original_filepath) + + # Check extracted + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata #metadata.pop("dc:format") - assert is_expected_metadata_contained(expected_metadata, metadata) + assert is_expected_metadata_contained(expected_metadata, metadata) \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index f36431f..32bb5fe 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -4,12 +4,8 @@ def test_extract_url(): extractor = Extractor() - reader = extractor.extract_url("https://www.google.com") + reader, metadata = extractor.extract_url("https://www.google.com") result = read_to_string(reader) assert "Google" in result - -def test_extract_url_with_metadata(): - extractor = Extractor() - _reader, metadata = extractor.extract_url_with_metadata("https://www.google.com") assert len(metadata.keys()) > 0 diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py index 4baaf76..909365b 100644 --- a/bindings/extractous-python/tests/test_ocr.py +++ b/bindings/extractous-python/tests/test_ocr.py @@ -5,7 +5,7 @@ def test_ara_ocr_png(): ocr_config = TesseractOcrConfig().set_language("ara") extractor = Extractor().set_ocr_config(ocr_config) - result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png") + result, metadata = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png") with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file: expected = file.read() @@ -25,7 +25,7 @@ def 
test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf(): extractor = extractor.set_ocr_config(ocr_config) extractor = extractor.set_pdf_config(pdf_config) - result = extractor.extract_file_to_string(test_file) + result, metadata = extractor.extract_file_to_string(test_file) with open(expected_result_file, "r", encoding="utf8") as file: expected = file.read() @@ -43,6 +43,6 @@ def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf(): extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config) - result = extractor.extract_file_to_string(test_file) + result, metadata = extractor.extract_file_to_string(test_file) assert result.strip() == "" diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py index a14d9ed..320bf5f 100644 --- a/bindings/extractous-python/tests/test_pdf.py +++ b/bindings/extractous-python/tests/test_pdf.py @@ -8,14 +8,14 @@ def expected_result(): def test_extract_file_to_string(): extractor = Extractor() - result = extractor.extract_file_to_string("tests/quarkus.pdf") + result, metadata = extractor.extract_file_to_string("tests/quarkus.pdf") #print(result) assert result == expected_result() def test_extract_file(): extractor = Extractor() - reader = extractor.extract_file("tests/quarkus.pdf") + reader, metadata = extractor.extract_file("tests/quarkus.pdf") result = read_to_string(reader) @@ -27,7 +27,7 @@ def test_extract_bytes(): with open("tests/quarkus.pdf", "rb") as file: buffer = bytearray(file.read()) - reader = extractor.extract_bytes(buffer) + reader, metadata = extractor.extract_bytes(buffer) result = read_to_string(reader) diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 6aa8280..4bfc28d 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -30,12 +30,13 @@ bytemuck = { version = "1.17.1"} strum = { version = "0.26.2" } strum_macros = { version = "0.26.2" } +serde_json = "1.0" + [dev-dependencies] textdistance = 
"1.1.0" test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" [build-dependencies] fs_extra = { version = "1.3.0" } diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs index 9bbb142..08d4386 100644 --- a/extractous-core/examples/extract_to_stream.rs +++ b/extractous-core/examples/extract_to_stream.rs @@ -9,7 +9,7 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); - let stream = extractor.extract_file(file_path).unwrap(); + let (stream, _metadata) = extractor.extract_file(file_path).unwrap(); // Extract url // let stream = extractor.extract_url("https://www.google.com/").unwrap(); // Extract bytes diff --git a/extractous-core/examples/extract_to_string.rs b/extractous-core/examples/extract_to_string.rs index 1b9d444..36b2916 100644 --- a/extractous-core/examples/extract_to_string.rs +++ b/extractous-core/examples/extract_to_string.rs @@ -7,6 +7,6 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); - let content = extractor.extract_file_to_string(file_path).unwrap(); + let (content, _metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); } diff --git a/extractous-core/src/errors.rs b/extractous-core/src/errors.rs index 6b72777..8f4aa22 100644 --- a/extractous-core/src/errors.rs +++ b/extractous-core/src/errors.rs @@ -47,4 +47,4 @@ impl From for io::Error { } /// Result that is a wrapper of Result -pub type ExtractResult = Result; +pub type ExtractResult = Result; \ No newline at end of file diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index 26e26b8..82ae75b 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -1,10 +1,14 @@ +use std::collections::HashMap; use crate::errors::ExtractResult; use crate::tika; use crate::tika::JReaderInputStream; -use 
crate::tika::Metadata; use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig}; use strum_macros::{Display, EnumString}; + +/// Metadata type alias +pub type Metadata = HashMap>; + /// CharSet enum of all supported encodings #[derive(Debug, Clone, Default, Copy, PartialEq, Eq, Hash, Display, EnumString)] #[allow(non_camel_case_types)] @@ -24,7 +28,7 @@ pub enum CharSet { /// use std::io::prelude::*; /// /// let extractor = Extractor::new(); -/// let reader = extractor.extract_file("README.md").unwrap(); +/// let (reader, metadata) = extractor.extract_file("README.md").unwrap(); /// /// let mut buf_reader = BufReader::new(reader); /// let mut content = String::new(); @@ -48,10 +52,11 @@ impl std::io::Read for StreamReader { /// extracting text in one line. For example /// ```rust /// use extractous::{CharSet, Extractor}; -/// let text = Extractor::new() +/// let (text, metadata) = Extractor::new() /// .set_extract_string_max_length(1000) -/// .extract_file_to_string("README.md"); -/// println!("{}", text.unwrap()); +/// .extract_file_to_string("README.md") +/// .unwrap(); +/// println!("{}", text); /// ``` /// #[derive(Debug, Clone)] @@ -113,22 +118,9 @@ impl Extractor { self } - /// Extracts text from a file path. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_file(&self, file_path: &str) -> ExtractResult { - tika::parse_file( - file_path, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from a file path. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_file_with_metadata( + pub fn extract_file( &self, file_path: &str, ) -> ExtractResult<(StreamReader, Metadata)> { @@ -141,22 +133,9 @@ impl Extractor { ) } - /// Extracts text from a byte buffer. 
Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult { - tika::parse_bytes( - buffer, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from a byte buffer. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes_with_metadata( + pub fn extract_bytes( &self, buffer: &[u8], ) -> ExtractResult<(StreamReader, Metadata)> { @@ -169,22 +148,9 @@ impl Extractor { ) } - /// Extracts text from an url. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_url(&self, url: &str) -> ExtractResult { - tika::parse_url( - url, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from an url. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_url_with_metadata(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)> { + pub fn extract_url(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)> { tika::parse_url( url, &self.encoding, @@ -194,22 +160,9 @@ impl Extractor { ) } - /// Extracts text from a file path. Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult { - tika::parse_file_to_string( - file_path, - self.extract_string_max_length, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(content, _metadata)| content) - } - /// Extracts text from a file path. Returns a tuple with string that is of maximum length /// of the extractor's `extract_string_max_length` and metadata. 
- pub fn extract_file_to_string_with_metadata( + pub fn extract_file_to_string( &self, file_path: &str, ) -> ExtractResult<(String, Metadata)> { @@ -250,16 +203,8 @@ mod tests { // Parse the files using extractous let extractor = Extractor::new(); let result = extractor.extract_file_to_string(TEST_FILE); - let content = result.unwrap(); + let (content, metadata) = result.unwrap(); assert_eq!(content.trim(), expected_content.trim()); - } - - #[test] - fn extract_file_to_string_with_metadata_test() { - // Parse the files using extractous - let extractor = Extractor::new(); - let result = extractor.extract_file_to_string_with_metadata(TEST_FILE); - let (_content, metadata) = result.unwrap(); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -283,16 +228,10 @@ mod tests { // Parse the files using extractous let extractor = Extractor::new(); let result = extractor.extract_file(TEST_FILE); - let content = read_content_from_stream(result.unwrap()); - assert_eq!(content.trim(), expected_content.trim()); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_file_with_metadata_test() { - // Parse the files using extractous - let extractor = Extractor::new(); - let result = extractor.extract_file_with_metadata(TEST_FILE); - let (_content, metadata) = result.unwrap(); + assert_eq!(content.trim(), expected_content.trim()); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -315,17 +254,10 @@ mod tests { let file_bytes = read_file_as_bytes(TEST_FILE).unwrap(); let extractor = Extractor::new(); let result = extractor.extract_bytes(&file_bytes); - let content = read_content_from_stream(result.unwrap()); - assert_eq!(content.trim(), expected_content.trim()); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_bytes_with_metadata_test() { - // Parse the bytes using extractous - let file_bytes = 
read_file_as_bytes(TEST_FILE).unwrap(); - let extractor = Extractor::new(); - let result = extractor.extract_bytes_with_metadata(&file_bytes); - let (_content, metadata) = result.unwrap(); + assert_eq!(content.trim(), expected_content.trim()); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -337,16 +269,10 @@ mod tests { // Parse url by extractous let extractor = Extractor::new(); let result = extractor.extract_url(&TEST_URL); - let content = read_content_from_stream(result.unwrap()); - assert!(content.contains("Google")); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_url_with_metadata_test() { - // Parse url by extractous - let extractor = Extractor::new(); - let result = extractor.extract_url_with_metadata(&TEST_URL); - let (_content, metadata) = result.unwrap(); + assert!(content.contains("Google")); assert!( metadata.len() > 0, "Metadata should contain at least one entry" diff --git a/extractous-core/src/lib.rs b/extractous-core/src/lib.rs index 5a6b5b2..9c72cdb 100644 --- a/extractous-core/src/lib.rs +++ b/extractous-core/src/lib.rs @@ -41,7 +41,7 @@ //! let mut extractor = Extractor::new().set_extract_string_max_length(1000); //! //! // Extract text from a file -//! let text = extractor.extract_file_to_string("README.md").unwrap(); +//! let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap(); //! println!("{}", text); //! //! ``` @@ -61,7 +61,7 @@ //! .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)); //! //! // extract file with extractor -//! let content = extractor.extract_file_to_string(file_path).unwrap(); +//! let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap(); //! println!("{}", content); //! //! 
``` @@ -87,7 +87,6 @@ mod tika { mod wrappers; pub use parse::*; pub use wrappers::JReaderInputStream; - pub use wrappers::Metadata; } pub mod test_utils; diff --git a/extractous-core/src/test_utils.rs b/extractous-core/src/test_utils.rs index ce6815f..d2680ae 100644 --- a/extractous-core/src/test_utils.rs +++ b/extractous-core/src/test_utils.rs @@ -1,5 +1,13 @@ use std::collections::HashMap; +#[cfg(test)] +pub fn parse_metadata_file(file_path: &str) -> HashMap> { + let expected_metadata_string = std::fs::read_to_string(file_path) + .unwrap(); + + serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted") +} + pub fn calculate_similarity_percent( expected: &HashMap>, current: &HashMap>, diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs index 0d299ea..bbbf850 100644 --- a/extractous-core/src/tika/jni_utils.rs +++ b/extractous-core/src/tika/jni_utils.rs @@ -1,11 +1,11 @@ use std::os::raw::{c_char, c_void}; use crate::errors::{Error, ExtractResult}; -use crate::tika::Metadata; use jni::errors::jni_error_code_to_result; use jni::objects::{JByteBuffer, JObject, JObjectArray, JString, JValue, JValueOwned}; use jni::{sys, JNIEnv, JavaVM}; use std::collections::HashMap; +use crate::Metadata; /// Calls a static method and prints any thrown exceptions to stderr pub fn jni_new_direct_buffer<'local>( diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index 765d365..c029269 100644 --- a/extractous-core/src/tika/parse.rs +++ b/extractous-core/src/tika/parse.rs @@ -3,7 +3,7 @@ use std::sync::OnceLock; use crate::errors::ExtractResult; use crate::tika::jni_utils::*; use crate::tika::wrappers::*; -use crate::{CharSet, OfficeParserConfig, PdfParserConfig, StreamReader, TesseractOcrConfig}; +use crate::{CharSet, Metadata, OfficeParserConfig, PdfParserConfig, StreamReader, TesseractOcrConfig}; use jni::objects::JValue; use jni::{AttachGuard, JavaVM}; @@ -88,18 +88,78 @@ pub fn 
parse_file( ) } +pub fn parse_bytes( + buffer: &[u8], + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult<(StreamReader, Metadata)> { + let mut env = get_vm_attach_current_thread()?; + + // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the + // jni_new_direct_buffer call, which requires a mutable pointer + let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8; + + let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?; + + parse_to_stream( + env, + (&byte_buffer).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseBytes", + "(Ljava/nio/ByteBuffer;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + +pub fn parse_url( + url: &str, + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult<(StreamReader, Metadata)> { + let mut env = get_vm_attach_current_thread()?; + + let url_val = jni_new_string_as_jvalue(&mut env, url)?; + parse_to_stream( + env, + (&url_val).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseUrl", + "(Ljava/lang/String;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + + /// Parses a file to a JStringResult using the Apache Tika library. 
-pub fn parse_file_to_j_string_result( - file_path: &str, +pub fn parse_to_string( + mut env: AttachGuard, + data_source_val: JValue, max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult { - let mut env = get_vm_attach_current_thread()?; + method_name: &str, + signature: &str, +) -> ExtractResult<(String, Metadata)> { - // Create a new Java string from the Rust string - let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?; let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?; let j_ocr_conf = JTesseractOcrConfig::new(&mut env, ocr_conf)?; @@ -107,12 +167,10 @@ pub fn parse_file_to_j_string_result( let call_result = jni_call_static_method( &mut env, "ai/yobix/TikaNativeMain", - "parseToString", - "(Ljava/lang/String;ILorg/apache/tika/parser/pdf/PDFParserConfig;\ - Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ - Lorg/apache/tika/parser/ocr/TesseractOCRConfig;)Lai/yobix/StringResult;", + method_name, + signature, &[ - (&file_path_val).into(), + data_source_val, JValue::Int(max_length), (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), @@ -123,7 +181,7 @@ pub fn parse_file_to_j_string_result( // Create and process the JStringResult let result = JStringResult::new(&mut env, call_result_obj)?; - Ok(result) + Ok((result.content, result.metadata)) } /// Parses a file to a string using the Apache Tika library. 
@@ -134,66 +192,84 @@ pub fn parse_file_to_string( office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, ) -> ExtractResult<(String, Metadata)> { - let result = - parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?; - Ok((result.content, result.metadata)) + let mut env = get_vm_attach_current_thread()?; + + let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; + parse_to_string( + env, + (&file_path_val).into(), + max_length, + pdf_conf, + office_conf, + ocr_conf, + "parseFileToString", + "(Ljava/lang/String;\ + I\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/StringResult;", + ) } -pub fn parse_bytes( +/// Parses bytes to a string using the Apache Tika library. +pub fn parse_bytes_to_string( buffer: &[u8], - char_set: &CharSet, + max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult<(StreamReader, Metadata)> { +) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; + // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the // jni_new_direct_buffer call, which requires a mutable pointer let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8; let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?; - parse_to_stream( + parse_to_string( env, (&byte_buffer).into(), - char_set, + max_length, pdf_conf, office_conf, ocr_conf, - "parseBytes", + "parseBytesToString", "(Ljava/nio/ByteBuffer;\ - Ljava/lang/String;\ + I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ - )Lai/yobix/ReaderResult;", + )Lai/yobix/StringResult;", ) } -pub fn parse_url( +/// Parses a url to a string using the Apache Tika library. 
+pub fn parse_url_to_string( url: &str, - char_set: &CharSet, + max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult<(StreamReader, Metadata)> { +) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; let url_val = jni_new_string_as_jvalue(&mut env, url)?; - parse_to_stream( + parse_to_string( env, (&url_val).into(), - char_set, + max_length, pdf_conf, office_conf, ocr_conf, - "parseUrl", + "parseUrlToString", "(Ljava/lang/String;\ - Ljava/lang/String;\ + I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ - )Lai/yobix/ReaderResult;", + )Lai/yobix/StringResult;", ) } diff --git a/extractous-core/src/tika/wrappers.rs b/extractous-core/src/tika/wrappers.rs index aa7ad30..19272f5 100644 --- a/extractous-core/src/tika/wrappers.rs +++ b/extractous-core/src/tika/wrappers.rs @@ -4,15 +4,11 @@ use crate::tika::jni_utils::{ jni_tika_metadata_to_rust_metadata, }; use crate::tika::vm; -use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE}; +use crate::{Metadata, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE}; use bytemuck::cast_slice_mut; use jni::objects::{GlobalRef, JByteArray, JObject, JValue}; use jni::sys::jsize; use jni::JNIEnv; -use std::collections::HashMap; - -/// Alias Tika Metadata -pub type Metadata = HashMap>; /// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream` /// It saves a GlobalRef to the java object, which is cleared when the last GlobalRef is dropped diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs index 4bdddc4..fca6935 100644 --- a/extractous-core/tests/extract_to_stream_tests.rs +++ b/extractous-core/tests/extract_to_stream_tests.rs @@ -1,6 +1,5 @@ use extractous::test_utils; use 
extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; -use std::collections::HashMap; use std::fs; use std::io::Read; use test_case::test_case; @@ -22,7 +21,7 @@ fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) { let extractor = Extractor::new(); let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap(); - let mut stream = extractor.extract_bytes(&bytes).unwrap(); + let (mut stream, metadata) = extractor.extract_bytes(&bytes).unwrap(); let mut buffer = Vec::new(); stream.read_to_end(&mut buffer).unwrap(); @@ -41,40 +40,16 @@ fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) { dist ); println!("{}: {}", file_name, dist); -} - -#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] -#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] -#[test_case("simple.odt", 0.9; "Test ODT file")] -#[test_case("table-multi-row-column-cells-actual.csv", 0.6; "Test CSV file")] -#[test_case("vodafone.xlsx", 0.8; "Test XLSX file")] -#[test_case("category-level.docx", 0.9; "Test DOCX file")] -#[test_case("simple.doc", 0.9; "Test DOC file")] -#[test_case("simple.pptx", 0.9; "Test another PPTX file")] -#[test_case("table-multi-row-column-cells.png", 0.9; "Test PNG file")] -#[test_case("winter-sports.epub", 0.8; "Test EPUB file")] -#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] -//#[test_case("eng-ocr.pdf", 0.8; "Test eng-ocr PDF file")] -fn test_extract_bytes_to_stream_with_metadata(file_name: &str, expected_similarity: f64) { - /* - Note: Expected_similarity exists because the extracted metadata may vary across different platforms, but most of it should still match - */ - let extractor = Extractor::new(); - let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap(); - let (_expected_stream_readewr, extracted_metadata) = - extractor.extract_bytes_with_metadata(&bytes).unwrap(); - let expected_metadata_string = fs::read_to_string(format!( + // Metadata checking + 
let expected_metadata = test_utils::parse_metadata_file(&format!( "../test_files/expected_result/{}.metadata.json", file_name - )) - .unwrap(); - let expected_metadata: HashMap> = - serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted"); + )); let percent_similarity = - test_utils::calculate_similarity_percent(&expected_metadata, &extracted_metadata); + test_utils::calculate_similarity_percent(&expected_metadata, &metadata); assert!( - percent_similarity > expected_similarity, + percent_similarity > target_dist, "The metadata similarity is lower than expected. Current {}% | filename: {}", percent_similarity, file_name @@ -89,7 +64,7 @@ fn test_extract_bytes_to_stream_ara_ocr_png() { // extract file with extractor let bytes = fs::read(&"../test_files/documents/ara-ocr.png".to_string()).unwrap(); - let mut stream = extractor.extract_bytes(&bytes).unwrap(); + let (mut stream, _metadata) = extractor.extract_bytes(&bytes).unwrap(); let mut buffer = Vec::new(); stream.read_to_end(&mut buffer).unwrap(); diff --git a/extractous-core/tests/extract_to_string_tests.rs b/extractous-core/tests/extract_to_string_tests.rs index 27c5e38..3689a41 100644 --- a/extractous-core/tests/extract_to_string_tests.rs +++ b/extractous-core/tests/extract_to_string_tests.rs @@ -1,6 +1,5 @@ use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; -use std::collections::HashMap; use std::fs; use test_case::test_case; use textdistance::nstr::cosine; @@ -20,7 +19,7 @@ use textdistance::nstr::cosine; fn test_extract_file_to_string(file_name: &str, target_dist: f64) { let extractor = Extractor::new().set_extract_string_max_length(1000000); // extract file with extractor - let extracted = extractor + let (extracted, extracted_metadata) = extractor .extract_file_to_string(&format!("../test_files/documents/{}", file_name)) .unwrap(); // read expected string @@ -36,37 +35,12 @@ fn test_extract_file_to_string(file_name: 
&str, target_dist: f64) { dist ); println!("{}: {}", file_name, dist); -} -#[test_case("2022_Q3_AAPL.pdf"; "Test PDF file")] -#[test_case("science-exploration-1p.pptx"; "Test PPTX file")] -#[test_case("simple.odt"; "Test ODT file")] -#[test_case("table-multi-row-column-cells-actual.csv"; "Test CSV file")] -#[test_case("vodafone.xlsx"; "Test XLSX file")] -#[test_case("category-level.docx"; "Test DOCX file")] -#[test_case("simple.doc"; "Test DOC file")] -#[test_case("simple.pptx"; "Test another PPTX file")] -#[test_case("table-multi-row-column-cells.png"; "Test PNG file")] -#[test_case("winter-sports.epub"; "Test EPUB file")] -#[test_case("bug_16.docx"; "Test bug16 DOCX file")] -//#[test_case("eng-ocr.pdf"; "Test eng-ocr PDF file")] -fn test_extract_file_to_string_with_metadata(file_name: &str) { - /* - Note: Expected_similarity exists because the extracted metadata may vary across different platforms, but most of it should still match - */ - let extractor = Extractor::new().set_extract_string_max_length(1000000); - // extract file with extractor - let (_extracted_content, extracted_metadata) = extractor - .extract_file_to_string_with_metadata(&format!("../test_files/documents/{}", file_name)) - .unwrap(); // read expected metadata - let expected_metadata_string = fs::read_to_string(format!( + let expected_metadata = test_utils::parse_metadata_file(&format!( "../test_files/expected_result/{}.metadata.json", file_name - )) - .unwrap(); - let expected_metadata: HashMap> = - serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted"); + )); assert!(test_utils::is_expected_metadata_contained( &expected_metadata, @@ -80,7 +54,7 @@ fn test_extract_file_to_string_ara_ocr_png() { .set_ocr_config(TesseractOcrConfig::new().set_language("ara")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor 
.extract_file_to_string(&"../test_files/documents/ara-ocr.png".to_string()) .unwrap(); @@ -110,7 +84,7 @@ fn test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf() { .set_extract_unique_inline_images_only(false), ); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor .extract_file_to_string(&"../test_files/documents/deu-ocr.pdf".to_string()) .unwrap(); @@ -133,7 +107,7 @@ fn test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf() { .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor .extract_file_to_string(&"../test_files/documents/deu-ocr.pdf".to_string()) .unwrap(); diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index 87a9c37..f9f8e4a 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -52,7 +52,8 @@ public static StringResult detect(String filePath) { final Metadata metadata = new Metadata(); try (final InputStream stream = TikaInputStream.get(path, metadata)) { - return new StringResult(tika.detect(stream, metadata)); + final String result = tika.detect(stream, metadata); + return new StringResult(result, metadata); } catch (java.io.IOException e) { return new StringResult((byte) 1, e.getMessage()); @@ -68,7 +69,7 @@ public static StringResult detect(String filePath) { * @param maxLength: maximum length of the returned string * @return StringResult */ - public static StringResult parseToString( + public static StringResult parseFileToString( String filePath, int maxLength, PDFParserConfig pdfConfig, @@ -80,10 +81,11 @@ public static StringResult parseToString( final Metadata metadata = new 
Metadata(); final InputStream stream = TikaInputStream.get(path, metadata); - String parseToStringWithConfig = parseToStringWithConfig( + String result = parseToStringWithConfig( stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); // No need to close the stream because parseToString does so - return new StringResult(parseToStringWithConfig, metadata); + return new StringResult(result, metadata); + } catch (java.io.IOException e) { return new StringResult((byte) 1, "Could not open file: " + e.getMessage()); } catch (TikaException e) { @@ -91,6 +93,70 @@ public static StringResult parseToString( } } + /** + * Parses the given Url and returns its content as String + * + * @param urlString the url to be parsed + * @return StringResult + */ + public static StringResult parseUrlToString( + String urlString, + int maxLength, + PDFParserConfig pdfConfig, + OfficeParserConfig officeConfig, + TesseractOCRConfig tesseractConfig + ) { + try { + final URL url = new URI(urlString).toURL(); + final Metadata metadata = new Metadata(); + final TikaInputStream stream = TikaInputStream.get(url, metadata); + + String result = parseToStringWithConfig( + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + // No need to close the stream because parseToString does so + return new StringResult(result, metadata); + + } catch (MalformedURLException e) { + return new StringResult((byte) 2, "Malformed URL error occurred " + e.getMessage()); + } catch (URISyntaxException e) { + return new StringResult((byte) 2, "Malformed URI error occurred: " + e.getMessage()); + } catch (java.io.IOException e) { + return new StringResult((byte) 1, "IO error occurred: " + e.getMessage()); + } catch (TikaException e) { + return new StringResult((byte) 2, "Parse error occurred : " + e.getMessage()); + } + } + + /** + * Parses the given array of bytes and return its content as String. 
+ * + * @param data an array of bytes + * @return StringResult + */ + public static StringResult parseBytesToString( + ByteBuffer data, + int maxLength, + PDFParserConfig pdfConfig, + OfficeParserConfig officeConfig, + TesseractOCRConfig tesseractConfig + ) { + final Metadata metadata = new Metadata(); + final ByteBufferInputStream inStream = new ByteBufferInputStream(data); + final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata); + + try { + String result = parseToStringWithConfig( + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + // No need to close the stream because parseToString does so + return new StringResult(result, metadata); + } catch (java.io.IOException e) { + return new StringResult((byte) 1, "IO error occurred: " + e.getMessage()); + } catch (TikaException e) { + return new StringResult((byte) 2, "Parse error occurred : " + e.getMessage()); + } + } + + private static String parseToStringWithConfig( InputStream stream, Metadata metadata, diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json index 8037da5..a427673 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json @@ -153,6 +153,16 @@ "org.apache.tika.parser.ocr.TesseractOCRConfig" ] }, + { + "name": "parseBytesToString", + "parameterTypes": [ + "java.nio.ByteBuffer", + "int", + "org.apache.tika.parser.pdf.PDFParserConfig", + "org.apache.tika.parser.microsoft.OfficeParserConfig", + "org.apache.tika.parser.ocr.TesseractOCRConfig" + ] + }, { "name": "parseFile", "parameterTypes": [ @@ -164,7 +174,7 @@ ] }, { - "name": "parseToString", + "name": "parseFileToString", "parameterTypes": [ "java.lang.String", "int", @@ -182,6 +192,16 @@ 
"org.apache.tika.parser.microsoft.OfficeParserConfig", "org.apache.tika.parser.ocr.TesseractOCRConfig" ] + }, + { + "name": "parseUrlToString", + "parameterTypes": [ + "java.lang.String", + "int", + "org.apache.tika.parser.pdf.PDFParserConfig", + "org.apache.tika.parser.microsoft.OfficeParserConfig", + "org.apache.tika.parser.ocr.TesseractOCRConfig" + ] } ], "name": "ai.yobix.TikaNativeMain" From 377d4436e0e29bcb947908a53033e729ad00c443 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:34:29 +0100 Subject: [PATCH 2/4] feat: add extract_url_to_string and extract_bytes_to_string functions --- bindings/extractous-python/src/extractor.rs | 66 ++++++++++++++++----- extractous-core/src/extractor.rs | 30 ++++++++++ 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index 965510e..a63a0b1 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -160,22 +160,6 @@ impl Extractor { )) } - /// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length - /// of the extractor's `extract_string_max_length` and the metadata. - pub fn extract_file_to_string<'py>( - &self, - filename: &str, - py: Python<'py>, - ) -> PyResult<(String, PyObject)> { - let (content, metadata) = self - .0 - .extract_file_to_string(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; - Ok((content, py_metadata.into())) - } - /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. pub fn extract_bytes<'py>( @@ -225,6 +209,56 @@ impl Extractor { )) } + + /// Extracts text from a file path. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. 
+ pub fn extract_file_to_string<'py>( + &self, + filename: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_file_to_string(filename) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts text from a bytearray. string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. + pub fn extract_bytes_to_string<'py>( + &self, + buffer: &Bound<'_, PyByteArray>, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_bytes_to_string(&buffer.to_vec()) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts text from a URL. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. + pub fn extract_url_to_string<'py>( + &self, + url: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_url_to_string(url) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + fn __repr__(&self) -> String { format!("{:?}", self.0) } diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index 82ae75b..bad6eb4 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -174,6 +174,36 @@ impl Extractor { &self.ocr_config, ) } + + /// Extracts text from a byte buffer. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. 
+ pub fn extract_bytes_to_string( + &self, + buffer: &[u8], + ) -> ExtractResult<(String, Metadata)> { + tika::parse_bytes_to_string( + buffer, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } + + /// Extracts text from a URL. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. + pub fn extract_url_to_string( + &self, + url: &str, + ) -> ExtractResult<(String, Metadata)> { + tika::parse_url_to_string( + url, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } } #[cfg(test)] From 355bfbddb1505fc7b46d901cea64f5c121f6b740 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:41:54 +0100 Subject: [PATCH 3/4] tests: make test_utils as dev only code --- extractous-core/Cargo.toml | 3 +-- extractous-core/src/lib.rs | 1 - extractous-core/tests/extract_to_stream_tests.rs | 4 +++- extractous-core/tests/extract_to_string_tests.rs | 4 +++- extractous-core/{src => tests}/test_utils.rs | 4 +++- 5 files changed, 10 insertions(+), 6 deletions(-) rename extractous-core/{src => tests}/test_utils.rs (96%) diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 4bfc28d..6aa8280 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -30,13 +30,12 @@ bytemuck = { version = "1.17.1"} strum = { version = "0.26.2" } strum_macros = { version = "0.26.2" } -serde_json = "1.0" - [dev-dependencies] textdistance = "1.1.0" test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" [build-dependencies] fs_extra = { version = "1.3.0" } diff --git a/extractous-core/src/lib.rs b/extractous-core/src/lib.rs index 9c72cdb..26672c2 100644 --- a/extractous-core/src/lib.rs +++ b/extractous-core/src/lib.rs @@ -89,4 +89,3 @@ mod tika { pub use wrappers::JReaderInputStream; } -pub mod test_utils; diff --git 
a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs index fca6935..106ee6d 100644 --- a/extractous-core/tests/extract_to_stream_tests.rs +++ b/extractous-core/tests/extract_to_stream_tests.rs @@ -1,10 +1,12 @@ -use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; use std::fs; use std::io::Read; use test_case::test_case; use textdistance::nstr::cosine; +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + #[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] #[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] #[test_case("simple.odt", 0.8; "Test ODT file")] diff --git a/extractous-core/tests/extract_to_string_tests.rs b/extractous-core/tests/extract_to_string_tests.rs index 3689a41..c14c3f9 100644 --- a/extractous-core/tests/extract_to_string_tests.rs +++ b/extractous-core/tests/extract_to_string_tests.rs @@ -1,9 +1,11 @@ -use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; use std::fs; use test_case::test_case; use textdistance::nstr::cosine; +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + #[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] #[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] #[test_case("simple.odt", 0.8; "Test ODT file")] diff --git a/extractous-core/src/test_utils.rs b/extractous-core/tests/test_utils.rs similarity index 96% rename from extractous-core/src/test_utils.rs rename to extractous-core/tests/test_utils.rs index d2680ae..e2c7758 100644 --- a/extractous-core/src/test_utils.rs +++ b/extractous-core/tests/test_utils.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -#[cfg(test)] +#[allow(dead_code)] pub fn parse_metadata_file(file_path: &str) -> HashMap> { let expected_metadata_string = std::fs::read_to_string(file_path) .unwrap(); @@ -8,6 +8,7 @@ pub fn 
parse_metadata_file(file_path: &str) -> HashMap> { serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted") } +#[allow(dead_code)] pub fn calculate_similarity_percent( expected: &HashMap>, current: &HashMap>, @@ -31,6 +32,7 @@ pub fn calculate_similarity_percent( (matches as f64) / (total as f64) } +#[allow(dead_code)] pub fn is_expected_metadata_contained( expected: &HashMap>, current: &HashMap>, From 5e32273a188c4f9961d71e19c186fce91a3767e7 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:52:30 +0100 Subject: [PATCH 4/4] tests: add extract_url_to_string and extract_bytes_to_string tests --- ...tes_to_stream.py => test_extract_bytes.py} | 31 +++++++++++++++++- ...file_to_string.py => test_extract_file.py} | 32 +++++++++++++++++-- .../tests/test_extract_url.py | 10 +++++- 3 files changed, 69 insertions(+), 4 deletions(-) rename bindings/extractous-python/tests/{test_extract_bytes_to_stream.py => test_extract_bytes.py} (57%) rename bindings/extractous-python/tests/{test_extract_file_to_string.py => test_extract_file.py} (55%) diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes.py similarity index 57% rename from bindings/extractous-python/tests/test_extract_bytes_to_stream.py rename to bindings/extractous-python/tests/test_extract_bytes.py index 87f0872..55eee31 100644 --- a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py +++ b/bindings/extractous-python/tests/test_extract_bytes.py @@ -2,7 +2,8 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray +from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \ + is_expected_metadata_contained TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9), @@ -20,6 +21,34 @@ ] +@pytest.mark.parametrize("file_name, target_dist", TEST_CASES) +def 
test_extract_bytes_to_string(file_name, target_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + result, metadata = extractor.extract_bytes_to_string(file_bytes) + + # Check Expected + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity > target_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + @pytest.mark.parametrize("file_name, target_dist", TEST_CASES) def test_extract_bytes_to_stream(file_name, target_dist): """Test the extraction from bytes of various file types.""" diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file.py similarity index 55% rename from bindings/extractous-python/tests/test_extract_file_to_string.py rename to bindings/extractous-python/tests/test_extract_file.py index 4b1aa89..6105eab 100644 --- a/bindings/extractous-python/tests/test_extract_file_to_string.py +++ b/bindings/extractous-python/tests/test_extract_file.py @@ -2,7 +2,7 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained +from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9), @@ -42,4 +42,32 @@ def test_extract_file_to_string(file_name, target_dist): # Check metadata #metadata.pop("dc:format") - assert is_expected_metadata_contained(expected_metadata, metadata) \ No newline at end of file + assert is_expected_metadata_contained(expected_metadata, metadata) + + +@pytest.mark.parametrize("file_name, target_dist", TEST_CASES) +def test_extract_file_to_stream(file_name, target_dist): + """Test the extraction from a file of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() 
+ reader, metadata = extractor.extract_file(original_filepath) + result = read_to_string(reader) + + # Check Expected + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity > target_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index 32bb5fe..34e4acf 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -1,7 +1,7 @@ from extractous import Extractor from utils import read_to_string -def test_extract_url(): +def test_extract_url_to_stream(): extractor = Extractor() reader, metadata = extractor.extract_url("https://www.google.com") @@ -9,3 +9,11 @@ def test_extract_url(): assert "Google" in result assert len(metadata.keys()) > 0 + +def test_extract_url_to_string(): + extractor = Extractor() + + content, metadata = extractor.extract_url_to_string("https://www.google.com") + + assert "Google" in content + assert len(metadata.keys()) > 0 \ No newline at end of file