From 69c4317b478e39010b2368431b6950a5bb01c89f Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:09:59 +0100 Subject: [PATCH 1/4] feat!: change extractor api to return tuple of result and metadata --- bindings/extractous-python/src/extractor.rs | 73 +-------- .../tests/test_extract_bytes_to_stream.py | 44 ++---- .../tests/test_extract_file_to_string.py | 43 ++---- .../tests/test_extract_url.py | 6 +- bindings/extractous-python/tests/test_ocr.py | 6 +- bindings/extractous-python/tests/test_pdf.py | 6 +- extractous-core/Cargo.toml | 3 +- extractous-core/examples/extract_to_stream.rs | 2 +- extractous-core/examples/extract_to_string.rs | 2 +- extractous-core/src/errors.rs | 2 +- extractous-core/src/extractor.rs | 122 +++------------ extractous-core/src/lib.rs | 5 +- extractous-core/src/test_utils.rs | 8 + extractous-core/src/tika/jni_utils.rs | 2 +- extractous-core/src/tika/parse.rs | 140 ++++++++++++++---- extractous-core/src/tika/wrappers.rs | 6 +- .../tests/extract_to_stream_tests.rs | 39 +---- .../tests/extract_to_string_tests.rs | 38 +---- .../main/java/ai/yobix/TikaNativeMain.java | 74 ++++++++- .../META-INF/native-image/jni-config.json | 22 ++- 20 files changed, 293 insertions(+), 350 deletions(-) diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index f1bf357..965510e 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -136,32 +136,16 @@ impl Extractor { Ok(Self(inner)) } - /// Extracts text from a file path. 
Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_file(&self, filename: &str) -> PyResult { - let reader = self - .0 - .extract_file(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a file path. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. - pub fn extract_file_with_metadata<'py>( + pub fn extract_file<'py>( &self, filename: &str, py: Python<'py>, ) -> PyResult<(StreamReader, PyObject)> { let (reader, metadata) = self .0 - .extract_file_with_metadata(filename) + .extract_file(filename) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes @@ -176,50 +160,25 @@ impl Extractor { )) } - /// Extracts text from a file path. Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_file_to_string(&self, filename: &str) -> PyResult { - self.0 - .extract_file_to_string(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e))) - } - /// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length /// of the extractor's `extract_string_max_length` and the metadata. 
- pub fn extract_file_to_string_with_metadata<'py>( + pub fn extract_file_to_string<'py>( &self, filename: &str, py: Python<'py>, ) -> PyResult<(String, PyObject)> { let (content, metadata) = self .0 - .extract_file_to_string_with_metadata(filename) + .extract_file_to_string(filename) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; Ok((content, py_metadata.into())) } - /// Extracts text from a bytearray. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult { - let slice = buffer.to_vec(); - let reader = self - .0 - .extract_bytes(&slice) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. - pub fn extract_bytes_with_metadata<'py>( + pub fn extract_bytes<'py>( &self, buffer: &Bound<'_, PyByteArray>, py: Python<'py>, @@ -227,7 +186,7 @@ impl Extractor { let slice = buffer.to_vec(); let (reader, metadata) = self .0 - .extract_bytes_with_metadata(&slice) + .extract_bytes(&slice) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes @@ -242,32 +201,16 @@ impl Extractor { )) } - /// Extracts text from a url. 
Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_url(&self, url: &str) -> PyResult { - let reader = self - .0 - .extract_url(&url) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes - Ok(StreamReader { - reader, - buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), - py_bytes: None, - }) - } - /// Extracts text from a url. Returns a tuple with string that is of maximum length /// of the extractor's `extract_string_max_length` and tika metdata. - pub fn extract_url_with_metadata<'py>( + pub fn extract_url<'py>( &self, url: &str, py: Python<'py>, ) -> PyResult<(StreamReader, PyObject)> { let (reader, metadata) = self .0 - .extract_url_with_metadata(&url) + .extract_url(&url) .map_err(|e| PyErr::new::(format!("{:?}", e)))?; // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py index 67033ca..87f0872 100644 --- a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py +++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py @@ -25,46 +25,26 @@ def test_extract_bytes_to_stream(file_name, target_dist): """Test the extraction from bytes of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract file_bytes = read_file_to_bytearray(original_filepath) extractor = 
Extractor() - reader = extractor.extract_bytes(file_bytes) + reader, metadata = extractor.extract_bytes(file_bytes) result = read_to_string(reader) - # Expected - with open(expected_result_filepath, "r", encoding="utf8") as file: - expected = file.read() - + # Check Expected assert cosine_similarity(result, expected) > target_dist, \ f"Cosine similarity is less than {target_dist} for file: {file_name}" - -TEST_CASES_METADATA = [ - ("2022_Q3_AAPL.pdf", 0.9), - ("science-exploration-1p.pptx", 0.9), - ("simple.odt", 0.9), - ("table-multi-row-column-cells-actual.csv", 0.6), - ("vodafone.xlsx", 0.8), - ("category-level.docx", 0.9), - ("simple.doc", 0.9), - ("simple.pptx", 0.9), - ("table-multi-row-column-cells.png", 0.9), - ("winter-sports.epub", 0.8), - ("bug_16.docx", 0.9), -] - - -@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA) -def test_extract_bytes_to_stream(file_name, similarity_percent): - """Test the extraction from bytes of various file types.""" - original_filepath = f"../../test_files/documents/{file_name}" - expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" - file_bytes = read_file_to_bytearray(original_filepath) - extractor = Extractor() - _reader, metadata = extractor.extract_bytes_with_metadata(file_bytes) - with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: - expected_metadata = json.load(file) + # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) - assert percent_similarity > similarity_percent, \ + assert percent_similarity > target_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py index d538133..4b1aa89 100644 --- a/bindings/extractous-python/tests/test_extract_file_to_string.py +++ b/bindings/extractous-python/tests/test_extract_file_to_string.py @@ -19,44 +19,27 @@ #("eng-ocr.pdf", 0.9), ] - @pytest.mark.parametrize("file_name, target_dist", TEST_CASES) def test_extract_file_to_string(file_name, target_dist): """Test the extraction and comparison of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" - extractor = Extractor() - result = extractor.extract_file_to_string(original_filepath) + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected with open(expected_result_filepath, "r", encoding="utf8") as file: expected = file.read() - - assert cosine_similarity(result, expected) > target_dist, \ - f"Cosine similarity is less than {target_dist} for file: {file_name}" - - -TEST_CASES_METADATA = [ - "2022_Q3_AAPL.pdf", - "science-exploration-1p.pptx", - "simple.odt", - "table-multi-row-column-cells-actual.csv", - "vodafone.xlsx", - "category-level.docx", - "simple.doc", - "simple.pptx", - "table-multi-row-column-cells.png", - "winter-sports.epub", - "bug_16.docx", -] - -@pytest.mark.parametrize("file_name", TEST_CASES_METADATA) -def test_extract_file_to_string_with_metadata(file_name): - """Test the extraction and comparison of various file types.""" - original_filepath = f"../../test_files/documents/{file_name}" - expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" - extractor = Extractor() - _result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath) with 
open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: expected_metadata = json.load(file) + # Extract + extractor = Extractor() + result, metadata = extractor.extract_file_to_string(original_filepath) + + # Check extracted + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata #metadata.pop("dc:format") - assert is_expected_metadata_contained(expected_metadata, metadata) + assert is_expected_metadata_contained(expected_metadata, metadata) \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index f36431f..32bb5fe 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -4,12 +4,8 @@ def test_extract_url(): extractor = Extractor() - reader = extractor.extract_url("https://www.google.com") + reader, metadata = extractor.extract_url("https://www.google.com") result = read_to_string(reader) assert "Google" in result - -def test_extract_url_with_metadata(): - extractor = Extractor() - _reader, metadata = extractor.extract_url_with_metadata("https://www.google.com") assert len(metadata.keys()) > 0 diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py index 4baaf76..909365b 100644 --- a/bindings/extractous-python/tests/test_ocr.py +++ b/bindings/extractous-python/tests/test_ocr.py @@ -5,7 +5,7 @@ def test_ara_ocr_png(): ocr_config = TesseractOcrConfig().set_language("ara") extractor = Extractor().set_ocr_config(ocr_config) - result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png") + result, metadata = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png") with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file: expected = file.read() @@ -25,7 +25,7 @@ def 
test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf(): extractor = extractor.set_ocr_config(ocr_config) extractor = extractor.set_pdf_config(pdf_config) - result = extractor.extract_file_to_string(test_file) + result, metadata = extractor.extract_file_to_string(test_file) with open(expected_result_file, "r", encoding="utf8") as file: expected = file.read() @@ -43,6 +43,6 @@ def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf(): extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config) - result = extractor.extract_file_to_string(test_file) + result, metadata = extractor.extract_file_to_string(test_file) assert result.strip() == "" diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py index a14d9ed..320bf5f 100644 --- a/bindings/extractous-python/tests/test_pdf.py +++ b/bindings/extractous-python/tests/test_pdf.py @@ -8,14 +8,14 @@ def expected_result(): def test_extract_file_to_string(): extractor = Extractor() - result = extractor.extract_file_to_string("tests/quarkus.pdf") + result, metadata = extractor.extract_file_to_string("tests/quarkus.pdf") #print(result) assert result == expected_result() def test_extract_file(): extractor = Extractor() - reader = extractor.extract_file("tests/quarkus.pdf") + reader, metadata = extractor.extract_file("tests/quarkus.pdf") result = read_to_string(reader) @@ -27,7 +27,7 @@ def test_extract_bytes(): with open("tests/quarkus.pdf", "rb") as file: buffer = bytearray(file.read()) - reader = extractor.extract_bytes(buffer) + reader, metadata = extractor.extract_bytes(buffer) result = read_to_string(reader) diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 6aa8280..4bfc28d 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -30,12 +30,13 @@ bytemuck = { version = "1.17.1"} strum = { version = "0.26.2" } strum_macros = { version = "0.26.2" } +serde_json = "1.0" + [dev-dependencies] textdistance = 
"1.1.0" test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" [build-dependencies] fs_extra = { version = "1.3.0" } diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs index 9bbb142..08d4386 100644 --- a/extractous-core/examples/extract_to_stream.rs +++ b/extractous-core/examples/extract_to_stream.rs @@ -9,7 +9,7 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); - let stream = extractor.extract_file(file_path).unwrap(); + let (stream, _metadata) = extractor.extract_file(file_path).unwrap(); // Extract url // let stream = extractor.extract_url("https://www.google.com/").unwrap(); // Extract bytes diff --git a/extractous-core/examples/extract_to_string.rs b/extractous-core/examples/extract_to_string.rs index 1b9d444..36b2916 100644 --- a/extractous-core/examples/extract_to_string.rs +++ b/extractous-core/examples/extract_to_string.rs @@ -7,6 +7,6 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); - let content = extractor.extract_file_to_string(file_path).unwrap(); + let (content, _metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); } diff --git a/extractous-core/src/errors.rs b/extractous-core/src/errors.rs index 6b72777..8f4aa22 100644 --- a/extractous-core/src/errors.rs +++ b/extractous-core/src/errors.rs @@ -47,4 +47,4 @@ impl From for io::Error { } /// Result that is a wrapper of Result -pub type ExtractResult = Result; +pub type ExtractResult = Result; \ No newline at end of file diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index 26e26b8..82ae75b 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -1,10 +1,14 @@ +use std::collections::HashMap; use crate::errors::ExtractResult; use crate::tika; use crate::tika::JReaderInputStream; -use 
crate::tika::Metadata; use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig}; use strum_macros::{Display, EnumString}; + +/// Metadata type alias +pub type Metadata = HashMap>; + /// CharSet enum of all supported encodings #[derive(Debug, Clone, Default, Copy, PartialEq, Eq, Hash, Display, EnumString)] #[allow(non_camel_case_types)] @@ -24,7 +28,7 @@ pub enum CharSet { /// use std::io::prelude::*; /// /// let extractor = Extractor::new(); -/// let reader = extractor.extract_file("README.md").unwrap(); +/// let (reader, metadata) = extractor.extract_file("README.md").unwrap(); /// /// let mut buf_reader = BufReader::new(reader); /// let mut content = String::new(); @@ -48,10 +52,11 @@ impl std::io::Read for StreamReader { /// extracting text in one line. For example /// ```rust /// use extractous::{CharSet, Extractor}; -/// let text = Extractor::new() +/// let (text, metadata) = Extractor::new() /// .set_extract_string_max_length(1000) -/// .extract_file_to_string("README.md"); -/// println!("{}", text.unwrap()); +/// .extract_file_to_string("README.md") +/// .unwrap(); +/// println!("{}", text); /// ``` /// #[derive(Debug, Clone)] @@ -113,22 +118,9 @@ impl Extractor { self } - /// Extracts text from a file path. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_file(&self, file_path: &str) -> ExtractResult { - tika::parse_file( - file_path, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from a file path. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_file_with_metadata( + pub fn extract_file( &self, file_path: &str, ) -> ExtractResult<(StreamReader, Metadata)> { @@ -141,22 +133,9 @@ impl Extractor { ) } - /// Extracts text from a byte buffer. 
Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult { - tika::parse_bytes( - buffer, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from a byte buffer. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_bytes_with_metadata( + pub fn extract_bytes( &self, buffer: &[u8], ) -> ExtractResult<(StreamReader, Metadata)> { @@ -169,22 +148,9 @@ impl Extractor { ) } - /// Extracts text from an url. Returns a stream of the extracted text - /// the stream is decoded using the extractor's `encoding` - pub fn extract_url(&self, url: &str) -> ExtractResult { - tika::parse_url( - url, - &self.encoding, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(stream_reader, _metadata)| stream_reader) - } - /// Extracts text from an url. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` - pub fn extract_url_with_metadata(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)> { + pub fn extract_url(&self, url: &str) -> ExtractResult<(StreamReader, Metadata)> { tika::parse_url( url, &self.encoding, @@ -194,22 +160,9 @@ impl Extractor { ) } - /// Extracts text from a file path. Returns a string that is of maximum length - /// of the extractor's `extract_string_max_length` - pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult { - tika::parse_file_to_string( - file_path, - self.extract_string_max_length, - &self.pdf_config, - &self.office_config, - &self.ocr_config, - ) - .map(|(content, _metadata)| content) - } - /// Extracts text from a file path. Returns a tuple with string that is of maximum length /// of the extractor's `extract_string_max_length` and metadata. 
- pub fn extract_file_to_string_with_metadata( + pub fn extract_file_to_string( &self, file_path: &str, ) -> ExtractResult<(String, Metadata)> { @@ -250,16 +203,8 @@ mod tests { // Parse the files using extractous let extractor = Extractor::new(); let result = extractor.extract_file_to_string(TEST_FILE); - let content = result.unwrap(); + let (content, metadata) = result.unwrap(); assert_eq!(content.trim(), expected_content.trim()); - } - - #[test] - fn extract_file_to_string_with_metadata_test() { - // Parse the files using extractous - let extractor = Extractor::new(); - let result = extractor.extract_file_to_string_with_metadata(TEST_FILE); - let (_content, metadata) = result.unwrap(); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -283,16 +228,10 @@ mod tests { // Parse the files using extractous let extractor = Extractor::new(); let result = extractor.extract_file(TEST_FILE); - let content = read_content_from_stream(result.unwrap()); - assert_eq!(content.trim(), expected_content.trim()); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_file_with_metadata_test() { - // Parse the files using extractous - let extractor = Extractor::new(); - let result = extractor.extract_file_with_metadata(TEST_FILE); - let (_content, metadata) = result.unwrap(); + assert_eq!(content.trim(), expected_content.trim()); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -315,17 +254,10 @@ mod tests { let file_bytes = read_file_as_bytes(TEST_FILE).unwrap(); let extractor = Extractor::new(); let result = extractor.extract_bytes(&file_bytes); - let content = read_content_from_stream(result.unwrap()); - assert_eq!(content.trim(), expected_content.trim()); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_bytes_with_metadata_test() { - // Parse the bytes using extractous - let file_bytes = 
read_file_as_bytes(TEST_FILE).unwrap(); - let extractor = Extractor::new(); - let result = extractor.extract_bytes_with_metadata(&file_bytes); - let (_content, metadata) = result.unwrap(); + assert_eq!(content.trim(), expected_content.trim()); assert!( metadata.len() > 0, "Metadata should contain at least one entry" @@ -337,16 +269,10 @@ mod tests { // Parse url by extractous let extractor = Extractor::new(); let result = extractor.extract_url(&TEST_URL); - let content = read_content_from_stream(result.unwrap()); - assert!(content.contains("Google")); - } + let (reader, metadata) = result.unwrap(); + let content = read_content_from_stream(reader); - #[test] - fn extract_url_with_metadata_test() { - // Parse url by extractous - let extractor = Extractor::new(); - let result = extractor.extract_url_with_metadata(&TEST_URL); - let (_content, metadata) = result.unwrap(); + assert!(content.contains("Google")); assert!( metadata.len() > 0, "Metadata should contain at least one entry" diff --git a/extractous-core/src/lib.rs b/extractous-core/src/lib.rs index 5a6b5b2..9c72cdb 100644 --- a/extractous-core/src/lib.rs +++ b/extractous-core/src/lib.rs @@ -41,7 +41,7 @@ //! let mut extractor = Extractor::new().set_extract_string_max_length(1000); //! //! // Extract text from a file -//! let text = extractor.extract_file_to_string("README.md").unwrap(); +//! let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap(); //! println!("{}", text); //! //! ``` @@ -61,7 +61,7 @@ //! .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)); //! //! // extract file with extractor -//! let content = extractor.extract_file_to_string(file_path).unwrap(); +//! let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap(); //! println!("{}", content); //! //! 
``` @@ -87,7 +87,6 @@ mod tika { mod wrappers; pub use parse::*; pub use wrappers::JReaderInputStream; - pub use wrappers::Metadata; } pub mod test_utils; diff --git a/extractous-core/src/test_utils.rs b/extractous-core/src/test_utils.rs index ce6815f..d2680ae 100644 --- a/extractous-core/src/test_utils.rs +++ b/extractous-core/src/test_utils.rs @@ -1,5 +1,13 @@ use std::collections::HashMap; +#[cfg(test)] +pub fn parse_metadata_file(file_path: &str) -> HashMap> { + let expected_metadata_string = std::fs::read_to_string(file_path) + .unwrap(); + + serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted") +} + pub fn calculate_similarity_percent( expected: &HashMap>, current: &HashMap>, diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs index 0d299ea..bbbf850 100644 --- a/extractous-core/src/tika/jni_utils.rs +++ b/extractous-core/src/tika/jni_utils.rs @@ -1,11 +1,11 @@ use std::os::raw::{c_char, c_void}; use crate::errors::{Error, ExtractResult}; -use crate::tika::Metadata; use jni::errors::jni_error_code_to_result; use jni::objects::{JByteBuffer, JObject, JObjectArray, JString, JValue, JValueOwned}; use jni::{sys, JNIEnv, JavaVM}; use std::collections::HashMap; +use crate::Metadata; /// Calls a static method and prints any thrown exceptions to stderr pub fn jni_new_direct_buffer<'local>( diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index 765d365..c029269 100644 --- a/extractous-core/src/tika/parse.rs +++ b/extractous-core/src/tika/parse.rs @@ -3,7 +3,7 @@ use std::sync::OnceLock; use crate::errors::ExtractResult; use crate::tika::jni_utils::*; use crate::tika::wrappers::*; -use crate::{CharSet, OfficeParserConfig, PdfParserConfig, StreamReader, TesseractOcrConfig}; +use crate::{CharSet, Metadata, OfficeParserConfig, PdfParserConfig, StreamReader, TesseractOcrConfig}; use jni::objects::JValue; use jni::{AttachGuard, JavaVM}; @@ -88,18 +88,78 @@ pub fn 
parse_file( ) } +pub fn parse_bytes( + buffer: &[u8], + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult<(StreamReader, Metadata)> { + let mut env = get_vm_attach_current_thread()?; + + // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the + // jni_new_direct_buffer call, which requires a mutable pointer + let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8; + + let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?; + + parse_to_stream( + env, + (&byte_buffer).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseBytes", + "(Ljava/nio/ByteBuffer;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + +pub fn parse_url( + url: &str, + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult<(StreamReader, Metadata)> { + let mut env = get_vm_attach_current_thread()?; + + let url_val = jni_new_string_as_jvalue(&mut env, url)?; + parse_to_stream( + env, + (&url_val).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseUrl", + "(Ljava/lang/String;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + + /// Parses a file to a JStringResult using the Apache Tika library. 
-pub fn parse_file_to_j_string_result( - file_path: &str, +pub fn parse_to_string( + mut env: AttachGuard, + data_source_val: JValue, max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult { - let mut env = get_vm_attach_current_thread()?; + method_name: &str, + signature: &str, +) -> ExtractResult<(String, Metadata)> { - // Create a new Java string from the Rust string - let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?; let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?; let j_ocr_conf = JTesseractOcrConfig::new(&mut env, ocr_conf)?; @@ -107,12 +167,10 @@ pub fn parse_file_to_j_string_result( let call_result = jni_call_static_method( &mut env, "ai/yobix/TikaNativeMain", - "parseToString", - "(Ljava/lang/String;ILorg/apache/tika/parser/pdf/PDFParserConfig;\ - Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ - Lorg/apache/tika/parser/ocr/TesseractOCRConfig;)Lai/yobix/StringResult;", + method_name, + signature, &[ - (&file_path_val).into(), + data_source_val, JValue::Int(max_length), (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), @@ -123,7 +181,7 @@ pub fn parse_file_to_j_string_result( // Create and process the JStringResult let result = JStringResult::new(&mut env, call_result_obj)?; - Ok(result) + Ok((result.content, result.metadata)) } /// Parses a file to a string using the Apache Tika library. 
@@ -134,66 +192,84 @@ pub fn parse_file_to_string( office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, ) -> ExtractResult<(String, Metadata)> { - let result = - parse_file_to_j_string_result(file_path, max_length, pdf_conf, office_conf, ocr_conf)?; - Ok((result.content, result.metadata)) + let mut env = get_vm_attach_current_thread()?; + + let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; + parse_to_string( + env, + (&file_path_val).into(), + max_length, + pdf_conf, + office_conf, + ocr_conf, + "parseFileToString", + "(Ljava/lang/String;\ + I\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/StringResult;", + ) } -pub fn parse_bytes( +/// Parses bytes to a string using the Apache Tika library. +pub fn parse_bytes_to_string( buffer: &[u8], - char_set: &CharSet, + max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult<(StreamReader, Metadata)> { +) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; + // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the // jni_new_direct_buffer call, which requires a mutable pointer let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8; let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?; - parse_to_stream( + parse_to_string( env, (&byte_buffer).into(), - char_set, + max_length, pdf_conf, office_conf, ocr_conf, - "parseBytes", + "parseBytesToString", "(Ljava/nio/ByteBuffer;\ - Ljava/lang/String;\ + I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ - )Lai/yobix/ReaderResult;", + )Lai/yobix/StringResult;", ) } -pub fn parse_url( +/// Parses a url to a string using the Apache Tika library. 
+pub fn parse_url_to_string( url: &str, - char_set: &CharSet, + max_length: i32, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, -) -> ExtractResult<(StreamReader, Metadata)> { +) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; let url_val = jni_new_string_as_jvalue(&mut env, url)?; - parse_to_stream( + parse_to_string( env, (&url_val).into(), - char_set, + max_length, pdf_conf, office_conf, ocr_conf, - "parseUrl", + "parseUrlToString", "(Ljava/lang/String;\ - Ljava/lang/String;\ + I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ - )Lai/yobix/ReaderResult;", + )Lai/yobix/StringResult;", ) } diff --git a/extractous-core/src/tika/wrappers.rs b/extractous-core/src/tika/wrappers.rs index aa7ad30..19272f5 100644 --- a/extractous-core/src/tika/wrappers.rs +++ b/extractous-core/src/tika/wrappers.rs @@ -4,15 +4,11 @@ use crate::tika::jni_utils::{ jni_tika_metadata_to_rust_metadata, }; use crate::tika::vm; -use crate::{OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE}; +use crate::{Metadata, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, DEFAULT_BUF_SIZE}; use bytemuck::cast_slice_mut; use jni::objects::{GlobalRef, JByteArray, JObject, JValue}; use jni::sys::jsize; use jni::JNIEnv; -use std::collections::HashMap; - -/// Alias Tika Metadata -pub type Metadata = HashMap>; /// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream` /// It saves a GlobalRef to the java object, which is cleared when the last GlobalRef is dropped diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs index 4bdddc4..fca6935 100644 --- a/extractous-core/tests/extract_to_stream_tests.rs +++ b/extractous-core/tests/extract_to_stream_tests.rs @@ -1,6 +1,5 @@ use extractous::test_utils; use 
extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; -use std::collections::HashMap; use std::fs; use std::io::Read; use test_case::test_case; @@ -22,7 +21,7 @@ fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) { let extractor = Extractor::new(); let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap(); - let mut stream = extractor.extract_bytes(&bytes).unwrap(); + let (mut stream, metadata) = extractor.extract_bytes(&bytes).unwrap(); let mut buffer = Vec::new(); stream.read_to_end(&mut buffer).unwrap(); @@ -41,40 +40,16 @@ fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) { dist ); println!("{}: {}", file_name, dist); -} - -#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] -#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] -#[test_case("simple.odt", 0.9; "Test ODT file")] -#[test_case("table-multi-row-column-cells-actual.csv", 0.6; "Test CSV file")] -#[test_case("vodafone.xlsx", 0.8; "Test XLSX file")] -#[test_case("category-level.docx", 0.9; "Test DOCX file")] -#[test_case("simple.doc", 0.9; "Test DOC file")] -#[test_case("simple.pptx", 0.9; "Test another PPTX file")] -#[test_case("table-multi-row-column-cells.png", 0.9; "Test PNG file")] -#[test_case("winter-sports.epub", 0.8; "Test EPUB file")] -#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] -//#[test_case("eng-ocr.pdf", 0.8; "Test eng-ocr PDF file")] -fn test_extract_bytes_to_stream_with_metadata(file_name: &str, expected_similarity: f64) { - /* - Note: Expected_similarity exists because the extracted metadata may vary across different platforms, but most of it should still match - */ - let extractor = Extractor::new(); - let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap(); - let (_expected_stream_readewr, extracted_metadata) = - extractor.extract_bytes_with_metadata(&bytes).unwrap(); - let expected_metadata_string = fs::read_to_string(format!( + // Metadata checking + 
let expected_metadata = test_utils::parse_metadata_file(&format!( "../test_files/expected_result/{}.metadata.json", file_name - )) - .unwrap(); - let expected_metadata: HashMap> = - serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted"); + )); let percent_similarity = - test_utils::calculate_similarity_percent(&expected_metadata, &extracted_metadata); + test_utils::calculate_similarity_percent(&expected_metadata, &metadata); assert!( - percent_similarity > expected_similarity, + percent_similarity > target_dist, "The metadata similarity is lower than expected. Current {}% | filename: {}", percent_similarity, file_name @@ -89,7 +64,7 @@ fn test_extract_bytes_to_stream_ara_ocr_png() { // extract file with extractor let bytes = fs::read(&"../test_files/documents/ara-ocr.png".to_string()).unwrap(); - let mut stream = extractor.extract_bytes(&bytes).unwrap(); + let (mut stream, _metadata) = extractor.extract_bytes(&bytes).unwrap(); let mut buffer = Vec::new(); stream.read_to_end(&mut buffer).unwrap(); diff --git a/extractous-core/tests/extract_to_string_tests.rs b/extractous-core/tests/extract_to_string_tests.rs index 27c5e38..3689a41 100644 --- a/extractous-core/tests/extract_to_string_tests.rs +++ b/extractous-core/tests/extract_to_string_tests.rs @@ -1,6 +1,5 @@ use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; -use std::collections::HashMap; use std::fs; use test_case::test_case; use textdistance::nstr::cosine; @@ -20,7 +19,7 @@ use textdistance::nstr::cosine; fn test_extract_file_to_string(file_name: &str, target_dist: f64) { let extractor = Extractor::new().set_extract_string_max_length(1000000); // extract file with extractor - let extracted = extractor + let (extracted, extracted_metadata) = extractor .extract_file_to_string(&format!("../test_files/documents/{}", file_name)) .unwrap(); // read expected string @@ -36,37 +35,12 @@ fn test_extract_file_to_string(file_name: 
&str, target_dist: f64) { dist ); println!("{}: {}", file_name, dist); -} -#[test_case("2022_Q3_AAPL.pdf"; "Test PDF file")] -#[test_case("science-exploration-1p.pptx"; "Test PPTX file")] -#[test_case("simple.odt"; "Test ODT file")] -#[test_case("table-multi-row-column-cells-actual.csv"; "Test CSV file")] -#[test_case("vodafone.xlsx"; "Test XLSX file")] -#[test_case("category-level.docx"; "Test DOCX file")] -#[test_case("simple.doc"; "Test DOC file")] -#[test_case("simple.pptx"; "Test another PPTX file")] -#[test_case("table-multi-row-column-cells.png"; "Test PNG file")] -#[test_case("winter-sports.epub"; "Test EPUB file")] -#[test_case("bug_16.docx"; "Test bug16 DOCX file")] -//#[test_case("eng-ocr.pdf"; "Test eng-ocr PDF file")] -fn test_extract_file_to_string_with_metadata(file_name: &str) { - /* - Note: Expected_similarity exists because the extracted metadata may vary across different platforms, but most of it should still match - */ - let extractor = Extractor::new().set_extract_string_max_length(1000000); - // extract file with extractor - let (_extracted_content, extracted_metadata) = extractor - .extract_file_to_string_with_metadata(&format!("../test_files/documents/{}", file_name)) - .unwrap(); // read expected metadata - let expected_metadata_string = fs::read_to_string(format!( + let expected_metadata = test_utils::parse_metadata_file(&format!( "../test_files/expected_result/{}.metadata.json", file_name - )) - .unwrap(); - let expected_metadata: HashMap> = - serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted"); + )); assert!(test_utils::is_expected_metadata_contained( &expected_metadata, @@ -80,7 +54,7 @@ fn test_extract_file_to_string_ara_ocr_png() { .set_ocr_config(TesseractOcrConfig::new().set_language("ara")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor 
.extract_file_to_string(&"../test_files/documents/ara-ocr.png".to_string()) .unwrap(); @@ -110,7 +84,7 @@ fn test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf() { .set_extract_unique_inline_images_only(false), ); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor .extract_file_to_string(&"../test_files/documents/deu-ocr.pdf".to_string()) .unwrap(); @@ -133,7 +107,7 @@ fn test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf() { .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); // extract file with extractor - let extracted = extractor + let (extracted, _metadata) = extractor .extract_file_to_string(&"../test_files/documents/deu-ocr.pdf".to_string()) .unwrap(); diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index 87a9c37..f9f8e4a 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -52,7 +52,8 @@ public static StringResult detect(String filePath) { final Metadata metadata = new Metadata(); try (final InputStream stream = TikaInputStream.get(path, metadata)) { - return new StringResult(tika.detect(stream, metadata)); + final String result = tika.detect(stream, metadata); + return new StringResult(result, metadata); } catch (java.io.IOException e) { return new StringResult((byte) 1, e.getMessage()); @@ -68,7 +69,7 @@ public static StringResult detect(String filePath) { * @param maxLength: maximum length of the returned string * @return StringResult */ - public static StringResult parseToString( + public static StringResult parseFileToString( String filePath, int maxLength, PDFParserConfig pdfConfig, @@ -80,10 +81,11 @@ public static StringResult parseToString( final Metadata metadata = new 
Metadata(); final InputStream stream = TikaInputStream.get(path, metadata); - String parseToStringWithConfig = parseToStringWithConfig( + String result = parseToStringWithConfig( stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); // No need to close the stream because parseToString does so - return new StringResult(parseToStringWithConfig, metadata); + return new StringResult(result, metadata); + } catch (java.io.IOException e) { return new StringResult((byte) 1, "Could not open file: " + e.getMessage()); } catch (TikaException e) { @@ -91,6 +93,70 @@ public static StringResult parseToString( } } + /** + * Parses the given Url and returns its content as String + * + * @param urlString the url to be parsed + * @return StringResult + */ + public static StringResult parseUrlToString( + String urlString, + int maxLength, + PDFParserConfig pdfConfig, + OfficeParserConfig officeConfig, + TesseractOCRConfig tesseractConfig + ) { + try { + final URL url = new URI(urlString).toURL(); + final Metadata metadata = new Metadata(); + final TikaInputStream stream = TikaInputStream.get(url, metadata); + + String result = parseToStringWithConfig( + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + // No need to close the stream because parseToString does so + return new StringResult(result, metadata); + + } catch (MalformedURLException e) { + return new StringResult((byte) 2, "Malformed URL error occurred " + e.getMessage()); + } catch (URISyntaxException e) { + return new StringResult((byte) 2, "Malformed URI error occurred: " + e.getMessage()); + } catch (java.io.IOException e) { + return new StringResult((byte) 1, "IO error occurred: " + e.getMessage()); + } catch (TikaException e) { + return new StringResult((byte) 2, "Parse error occurred : " + e.getMessage()); + } + } + + /** + * Parses the given array of bytes and return its content as String. 
+ * + * @param data an array of bytes + * @return StringResult + */ + public static StringResult parseBytesToString( + ByteBuffer data, + int maxLength, + PDFParserConfig pdfConfig, + OfficeParserConfig officeConfig, + TesseractOCRConfig tesseractConfig + ) { + final Metadata metadata = new Metadata(); + final ByteBufferInputStream inStream = new ByteBufferInputStream(data); + final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata); + + try { + String result = parseToStringWithConfig( + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + // No need to close the stream because parseToString does so + return new StringResult(result, metadata); + } catch (java.io.IOException e) { + return new StringResult((byte) 1, "IO error occurred: " + e.getMessage()); + } catch (TikaException e) { + return new StringResult((byte) 2, "Parse error occurred : " + e.getMessage()); + } + } + + private static String parseToStringWithConfig( InputStream stream, Metadata metadata, diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json index 8037da5..a427673 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json @@ -153,6 +153,16 @@ "org.apache.tika.parser.ocr.TesseractOCRConfig" ] }, + { + "name": "parseBytesToString", + "parameterTypes": [ + "java.nio.ByteBuffer", + "int", + "org.apache.tika.parser.pdf.PDFParserConfig", + "org.apache.tika.parser.microsoft.OfficeParserConfig", + "org.apache.tika.parser.ocr.TesseractOCRConfig" + ] + }, { "name": "parseFile", "parameterTypes": [ @@ -164,7 +174,7 @@ ] }, { - "name": "parseToString", + "name": "parseFileToString", "parameterTypes": [ "java.lang.String", "int", @@ -182,6 +192,16 @@ 
"org.apache.tika.parser.microsoft.OfficeParserConfig", "org.apache.tika.parser.ocr.TesseractOCRConfig" ] + }, + { + "name": "parseUrlToString", + "parameterTypes": [ + "java.lang.String", + "int", + "org.apache.tika.parser.pdf.PDFParserConfig", + "org.apache.tika.parser.microsoft.OfficeParserConfig", + "org.apache.tika.parser.ocr.TesseractOCRConfig" + ] } ], "name": "ai.yobix.TikaNativeMain" From 377d4436e0e29bcb947908a53033e729ad00c443 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:34:29 +0100 Subject: [PATCH 2/4] feat: add extract_url_to_string and extract_bytes_to_string functions --- bindings/extractous-python/src/extractor.rs | 66 ++++++++++++++++----- extractous-core/src/extractor.rs | 30 ++++++++++ 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index 965510e..a63a0b1 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -160,22 +160,6 @@ impl Extractor { )) } - /// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length - /// of the extractor's `extract_string_max_length` and the metadata. - pub fn extract_file_to_string<'py>( - &self, - filename: &str, - py: Python<'py>, - ) -> PyResult<(String, PyObject)> { - let (content, metadata) = self - .0 - .extract_file_to_string(filename) - .map_err(|e| PyErr::new::(format!("{:?}", e)))?; - - let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; - Ok((content, py_metadata.into())) - } - /// Extracts text from a bytearray. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. pub fn extract_bytes<'py>( @@ -225,6 +209,56 @@ impl Extractor { )) } + + /// Extracts text from a file path. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. 
+ pub fn extract_file_to_string<'py>( + &self, + filename: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_file_to_string(filename) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts text from a bytearray. string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. + pub fn extract_bytes_to_string<'py>( + &self, + buffer: &Bound<'_, PyByteArray>, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_bytes_to_string(&buffer.to_vec()) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts text from a URL. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and the metadata as dict. + pub fn extract_url_to_string<'py>( + &self, + url: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_url_to_string(url) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + fn __repr__(&self) -> String { format!("{:?}", self.0) } diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index 82ae75b..bad6eb4 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -174,6 +174,36 @@ impl Extractor { &self.ocr_config, ) } + + /// Extracts text from a byte buffer. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. 
+ pub fn extract_bytes_to_string( + &self, + buffer: &[u8], + ) -> ExtractResult<(String, Metadata)> { + tika::parse_bytes_to_string( + buffer, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } + + /// Extracts text from a URL. Returns a tuple with string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. + pub fn extract_url_to_string( + &self, + url: &str, + ) -> ExtractResult<(String, Metadata)> { + tika::parse_url_to_string( + url, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } } #[cfg(test)] From 355bfbddb1505fc7b46d901cea64f5c121f6b740 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:41:54 +0100 Subject: [PATCH 3/4] tests: make test_utils as dev only code --- extractous-core/Cargo.toml | 3 +-- extractous-core/src/lib.rs | 1 - extractous-core/tests/extract_to_stream_tests.rs | 4 +++- extractous-core/tests/extract_to_string_tests.rs | 4 +++- extractous-core/{src => tests}/test_utils.rs | 4 +++- 5 files changed, 10 insertions(+), 6 deletions(-) rename extractous-core/{src => tests}/test_utils.rs (96%) diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 4bfc28d..6aa8280 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -30,13 +30,12 @@ bytemuck = { version = "1.17.1"} strum = { version = "0.26.2" } strum_macros = { version = "0.26.2" } -serde_json = "1.0" - [dev-dependencies] textdistance = "1.1.0" test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" [build-dependencies] fs_extra = { version = "1.3.0" } diff --git a/extractous-core/src/lib.rs b/extractous-core/src/lib.rs index 9c72cdb..26672c2 100644 --- a/extractous-core/src/lib.rs +++ b/extractous-core/src/lib.rs @@ -89,4 +89,3 @@ mod tika { pub use wrappers::JReaderInputStream; } -pub mod test_utils; diff --git 
a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs index fca6935..106ee6d 100644 --- a/extractous-core/tests/extract_to_stream_tests.rs +++ b/extractous-core/tests/extract_to_stream_tests.rs @@ -1,10 +1,12 @@ -use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; use std::fs; use std::io::Read; use test_case::test_case; use textdistance::nstr::cosine; +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + #[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] #[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] #[test_case("simple.odt", 0.8; "Test ODT file")] diff --git a/extractous-core/tests/extract_to_string_tests.rs b/extractous-core/tests/extract_to_string_tests.rs index 3689a41..c14c3f9 100644 --- a/extractous-core/tests/extract_to_string_tests.rs +++ b/extractous-core/tests/extract_to_string_tests.rs @@ -1,9 +1,11 @@ -use extractous::test_utils; use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; use std::fs; use test_case::test_case; use textdistance::nstr::cosine; +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + #[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] #[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] #[test_case("simple.odt", 0.8; "Test ODT file")] diff --git a/extractous-core/src/test_utils.rs b/extractous-core/tests/test_utils.rs similarity index 96% rename from extractous-core/src/test_utils.rs rename to extractous-core/tests/test_utils.rs index d2680ae..e2c7758 100644 --- a/extractous-core/src/test_utils.rs +++ b/extractous-core/tests/test_utils.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -#[cfg(test)] +#[allow(dead_code)] pub fn parse_metadata_file(file_path: &str) -> HashMap> { let expected_metadata_string = std::fs::read_to_string(file_path) .unwrap(); @@ -8,6 +8,7 @@ pub fn 
parse_metadata_file(file_path: &str) -> HashMap> { serde_json::from_str(&expected_metadata_string).expect("JSON was not well-formatted") } +#[allow(dead_code)] pub fn calculate_similarity_percent( expected: &HashMap>, current: &HashMap>, @@ -31,6 +32,7 @@ pub fn calculate_similarity_percent( (matches as f64) / (total as f64) } +#[allow(dead_code)] pub fn is_expected_metadata_contained( expected: &HashMap>, current: &HashMap>, From 5e32273a188c4f9961d71e19c186fce91a3767e7 Mon Sep 17 00:00:00 2001 From: nmammeri Date: Sat, 16 Nov 2024 18:52:30 +0100 Subject: [PATCH 4/4] tests: add extract_url_to_string and extract_bytes_to_string tests --- ...tes_to_stream.py => test_extract_bytes.py} | 31 +++++++++++++++++- ...file_to_string.py => test_extract_file.py} | 32 +++++++++++++++++-- .../tests/test_extract_url.py | 10 +++++- 3 files changed, 69 insertions(+), 4 deletions(-) rename bindings/extractous-python/tests/{test_extract_bytes_to_stream.py => test_extract_bytes.py} (57%) rename bindings/extractous-python/tests/{test_extract_file_to_string.py => test_extract_file.py} (55%) diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes.py similarity index 57% rename from bindings/extractous-python/tests/test_extract_bytes_to_stream.py rename to bindings/extractous-python/tests/test_extract_bytes.py index 87f0872..55eee31 100644 --- a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py +++ b/bindings/extractous-python/tests/test_extract_bytes.py @@ -2,7 +2,8 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray +from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \ + is_expected_metadata_contained TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9), @@ -20,6 +21,34 @@ ] +@pytest.mark.parametrize("file_name, target_dist", TEST_CASES) +def 
test_extract_bytes_to_string(file_name, target_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + result, metadata = extractor.extract_bytes_to_string(file_bytes) + + # Check Expected + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity > target_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + @pytest.mark.parametrize("file_name, target_dist", TEST_CASES) def test_extract_bytes_to_stream(file_name, target_dist): """Test the extraction from bytes of various file types.""" diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file.py similarity index 55% rename from bindings/extractous-python/tests/test_extract_file_to_string.py rename to bindings/extractous-python/tests/test_extract_file.py index 4b1aa89..6105eab 100644 --- a/bindings/extractous-python/tests/test_extract_file_to_string.py +++ b/bindings/extractous-python/tests/test_extract_file.py @@ -2,7 +2,7 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained +from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9), @@ -42,4 +42,32 @@ def test_extract_file_to_string(file_name, target_dist): # Check metadata #metadata.pop("dc:format") - assert is_expected_metadata_contained(expected_metadata, metadata) \ No newline at end of file + assert is_expected_metadata_contained(expected_metadata, metadata) + + +@pytest.mark.parametrize("file_name, target_dist", TEST_CASES) +def test_extract_file_to_stream(file_name, target_dist): + """Test the extraction from a file of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() 
+ reader, metadata = extractor.extract_file(original_filepath) + result = read_to_string(reader) + + # Check Expected + assert cosine_similarity(result, expected) > target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity > target_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index 32bb5fe..34e4acf 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -1,7 +1,7 @@ from extractous import Extractor from utils import read_to_string -def test_extract_url(): +def test_extract_url_to_stream(): extractor = Extractor() reader, metadata = extractor.extract_url("https://www.google.com") @@ -9,3 +9,11 @@ def test_extract_url(): assert "Google" in result assert len(metadata.keys()) > 0 + +def test_extract_url_to_string(): + extractor = Extractor() + + content, metadata = extractor.extract_url_to_string("https://www.google.com") + + assert "Google" in content + assert len(metadata.keys()) > 0 \ No newline at end of file