diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index 8a41640..529017a 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -1,6 +1,8 @@ -# This file was autogenerated by maturin v1.6.0 using: + +# +# This file was autogenerated by maturin v1.6.0 using: # maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest -# +# # Then adapted to the project # name: CI @@ -38,7 +40,7 @@ jobs: python-version: '3.8' # On linux we don't use graalvm/setup-graalvm@v1.2.5 action to install graalvm because it will install it - # on the runner machine and on linux the build will happen inside a manylinux docker. + # on the runner machine and on linux the build will happen inside a manylinux docker. # Instead, we use a script to install graalvm inside the docker container # the script is launched by setting the before-script-linux config option of the maturin action - name: Build wheels @@ -60,7 +62,7 @@ jobs: with: name: wheels-linux-${{ matrix.platform.target }} path: bindings/extractous-python/dist - + - name: pytest if: ${{ startsWith(matrix.platform.target, 'x86_64') }} shell: bash @@ -70,7 +72,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -85,7 +87,7 @@ jobs: apt-get update apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara apt-get install -y --no-install-recommends python3 python3-pip - pip3 install -U pip pytest scikit-learn + pip3 install -U pip pytest scikit-learn lxml run: | set -e pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall @@ -129,7 +131,7 @@ jobs: python -m venv .venv .venv\Scripts\activate.bat pip install extractous --find-links bindings/extractous-python/dist --no-index 
--force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings\extractous-python pytest -s . @@ -186,7 +188,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -206,7 +208,7 @@ jobs: name: wheels-sdist path: bindings/extractous-python/dist - # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # We use 2 actions one to publish on PyPi on tag pushes to main brnach and the other to publish on TestPyPi on any push publish-to-testpypi: name: Publish to TestPyPI diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index 3ccfdd6..269321e 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -258,6 +258,55 @@ impl Extractor { Ok((content, py_metadata.into())) } + /// Extracts a file's content as XML. Returns a tuple of the XML string, capped at the + /// extractor's `extract_string_max_length`, and the metadata as a dict. + pub fn extract_file_to_xml<'py>( + &self, + filename: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_file_to_xml(filename) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts a bytearray's content as XML. Returns a tuple of the XML string, capped at the + /// extractor's `extract_string_max_length`, and the metadata as a dict. 
+ pub fn extract_bytes_to_xml<'py>( + &self, + buffer: &Bound<'_, PyByteArray>, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_bytes_to_xml(&buffer.to_vec()) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + // Convert the extracted metadata into a Python dict before returning it + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + + /// Extracts a URL's content as XML. Returns a tuple of the XML string, capped at the + /// extractor's `extract_string_max_length`, and the metadata as a dict. + pub fn extract_url_to_xml<'py>( + &self, + url: &str, + py: Python<'py>, + ) -> PyResult<(String, PyObject)> { + let (content, metadata) = self + .0 + .extract_url_to_xml(url) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?; + Ok((content, py_metadata.into())) + } + fn __repr__(&self) -> String { format!("{:?}", self.0) } diff --git a/bindings/extractous-python/tests/test_extract_bytes.py b/bindings/extractous-python/tests/test_extract_bytes.py index df0ca07..ffc32dd 100644 --- a/bindings/extractous-python/tests/test_extract_bytes.py +++ b/bindings/extractous-python/tests/test_extract_bytes.py @@ -3,7 +3,7 @@ from extractous import Extractor from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \ - is_expected_metadata_contained + is_expected_metadata_contained, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -49,6 +49,35 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_bytes_to_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + result_xml, metadata = extractor.extract_bytes_to_xml(file_bytes) + result_text = extract_body_text(result_xml) + + # Check Expected + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): """Test the extraction from bytes of various file types.""" @@ -76,4 +105,4 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_file.py b/bindings/extractous-python/tests/test_extract_file.py index 15b97cc..d07b5dd 100644 --- a/bindings/extractous-python/tests/test_extract_file.py +++ b/bindings/extractous-python/tests/test_extract_file.py @@ -2,7 +2,7 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string +from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -49,6 +49,34 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_file_to_xml(file_name, target_dist, metadata_dist): + """Test the extraction and comparison of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() + result_xml, metadata = extractor.extract_file_to_xml(original_filepath) + result_text = extract_body_text(result_xml) + + # Check extracted + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_file_to_stream(file_name, target_dist, metadata_dist): @@ -75,4 +103,4 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index 34e4acf..51b1ce2 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -16,4 +16,12 @@ def test_extract_url_to_string(): content, metadata = extractor.extract_url_to_string("https://www.google.com") assert "Google" in content - assert len(metadata.keys()) > 0 \ No newline at end of file + assert len(metadata.keys()) > 0 + +def test_extract_url_to_xml(): + extractor = Extractor() + + content, metadata = extractor.extract_url_to_xml("https://www.google.com") + + assert "Google" in content + assert len(metadata.keys()) > 0 diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py index 8368db9..c72f825 100644 --- a/bindings/extractous-python/tests/utils.py +++ b/bindings/extractous-python/tests/utils.py @@ -1,6 +1,6 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity as cosine_sim - +from lxml import etree def cosine_similarity(text1, text2): """Calculate the cosine similarity between two texts.""" @@ -78,3 +78,20 @@ def calculate_similarity_percent(expected, current): # Return the similarity percentage return matches / total + + +def extract_body_text(xml: str) -> str: + """ + Extracts and returns the plain text inside the XHTML <body> element of an XML + string ("" if there is no <body>); raises ValueError if lxml cannot parse it. 
+ """ + try: + parser = etree.XMLParser(recover=True) + root = etree.fromstring(xml.encode(), parser=parser) + ns = {"ns": "http://www.w3.org/1999/xhtml"} + body = None if root is None else root.find(".//ns:body", namespaces=ns) + if body is None: + return "" + return "\n".join(body.itertext()).strip() + except etree.XMLSyntaxError as e: + raise ValueError(f"Invalid XML input: {e}") from e diff --git a/extractous-core/Cargo.lock b/extractous-core/Cargo.lock index c41bf9a..b74e466 100644 --- a/extractous-core/Cargo.lock +++ b/extractous-core/Cargo.lock @@ -473,6 +473,7 @@ dependencies = [ "fs_extra", "jni", "libc", + "quick-xml", "reqwest", "serde", "serde_json", @@ -1325,6 +1326,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index d1c4004..b31b545 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -36,6 +36,7 @@ test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +quick-xml = "0.37.1" [build-dependencies] fs_extra = { version = "1.3.0" } diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index b8f220d..0759bae 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -162,6 +162,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + false, ) } @@ -174,6 +175,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + false, ) } @@ -186,8 +188,49 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + false, ) } + + /// Extracts text from a file path. 
Returns a tuple with xml string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. + pub fn extract_file_to_xml(&self, file_path: &str) -> ExtractResult<(String, Metadata)> { + tika::parse_file_to_string( + file_path, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + true, + ) + } + + /// Extracts text from a byte buffer. Returns a tuple with xml string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. + pub fn extract_bytes_to_xml(&self, buffer: &[u8]) -> ExtractResult<(String, Metadata)> { + tika::parse_bytes_to_string( + buffer, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + true, + ) + } + + /// Extracts text from a URL. Returns a tuple with xml string that is of maximum length + /// of the extractor's `extract_string_max_length` and metadata. + pub fn extract_url_to_xml(&self, url: &str) -> ExtractResult<(String, Metadata)> { + tika::parse_url_to_string( + url, + self.extract_string_max_length, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + true, + ) + } + } #[cfg(test)] @@ -197,6 +240,7 @@ mod tests { use std::fs::File; use std::io::BufReader; use std::io::{self, Read}; + use std::str; const TEST_FILE: &str = "README.md"; @@ -292,4 +336,20 @@ mod tests { "Metadata should contain at least one entry" ); } + + #[test] + fn extract_file_to_xml_test() { + // Extract the test file as XML and check that both content and metadata came back + let extractor = Extractor::new(); + let result = extractor.extract_file_to_xml(TEST_FILE); + let (content, metadata) = result.unwrap(); + assert!( + !content.is_empty(), + "Extracted XML content should not be empty" + ); + assert!( + metadata.len() > 0, + "Metadata should contain at least one entry" + ); + } } diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index 4f941af..dbf5e76 100644 --- a/extractous-core/src/tika/parse.rs +++ 
b/extractous-core/src/tika/parse.rs @@ -157,6 +157,7 @@ pub fn parse_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, method_name: &str, signature: &str, ) -> ExtractResult<(String, Metadata)> { @@ -175,6 +176,7 @@ pub fn parse_to_string( (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), (&j_ocr_conf.internal).into(), + JValue::Bool(if as_xml { 1 } else { 0 }), ], ); let call_result_obj = call_result?.l()?; @@ -191,6 +193,7 @@ pub fn parse_file_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -202,12 +205,14 @@ pub fn parse_file_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseFileToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -219,6 +224,7 @@ pub fn parse_bytes_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -235,12 +241,14 @@ pub fn parse_bytes_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseBytesToString", "(Ljava/nio/ByteBuffer;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -252,6 +260,7 @@ pub fn parse_url_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -263,12 +272,14 @@ pub fn parse_url_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, 
"parseUrlToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } diff --git a/extractous-core/tests/extract_to_xml_tests.rs b/extractous-core/tests/extract_to_xml_tests.rs new file mode 100644 index 0000000..807f76c --- /dev/null +++ b/extractous-core/tests/extract_to_xml_tests.rs @@ -0,0 +1,100 @@ +use extractous::Extractor; +use std::fs; +use test_case::test_case; +use textdistance::nstr::cosine; +use quick_xml::reader::Reader; +// use quick_xml::events::{Event, BytesStart}; +use quick_xml::events::Event; + + +// Declarers the shared test_utils code as module in this integration test +mod test_utils; + +fn extract_p_tag_content(xml: &str) -> String { + let mut reader = Reader::from_str(xml); + reader.config_mut().trim_text(true); // Trim surrounding whitespace + let mut buf = Vec::new(); + let mut collected_content = String::new(); + let mut inside_body = false; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) if e.name().as_ref() == b"body" => { + inside_body = true; + } + Ok(Event::End(ref e)) if e.name().as_ref() == b"body" => { + inside_body = false; + } + Ok(Event::Text(e)) if inside_body => { + collected_content.push_str(&e.unescape().unwrap().into_owned()); + collected_content.push('\n'); // Separate paragraphs with newline + } + // Ok(Event::Start(e)) => { + // Check if this is a

tag + // if e.name().as_ref() == b"p" { + // if let Ok(Event::Text(e)) = reader.read_event_into(&mut buf) { + // // Append the text content of the

tag + // collected_content.push_str(&e.unescape().unwrap().into_owned()); + // collected_content.push('\n'); // Separate paragraphs with newline + // } + // } + // } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {}", e); + break; + } + _ => (), + } + buf.clear(); + } + + collected_content.trim_end().to_string() +} + +#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] +#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] +#[test_case("simple.odt", 0.8; "Test ODT file")] +#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")] +#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")] +#[test_case("category-level.docx", 0.8; "Test DOCX file")] +#[test_case("simple.doc", 0.8; "Test DOC file")] +#[test_case("simple.pptx", 0.9; "Test another PPTX file")] +#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")] +#[test_case("winter-sports.epub", 0.8; "Test EPUB file")] +#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] +//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] +fn test_extract_file_to_xml(file_name: &str, target_dist: f64) { + let extractor = Extractor::new().set_extract_string_max_length(1000000); + // extract file with extractor + let (extracted_xml, extracted_metadata) = extractor + .extract_file_to_xml(&format!("../test_files/documents/{}", file_name)) + .unwrap(); + println!("{}: {}", file_name, extracted_xml); + let extracted = extract_p_tag_content(&extracted_xml); + + // read expected string + let expected = + fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap(); + + let dist = cosine(&expected.trim(), &extracted.trim()); + assert!( + dist > target_dist, + "Cosine similarity is less than {} for file: {}, dist: {}", + target_dist, + file_name, + dist + ); + println!("{}: {}", file_name, dist); + + // read expected metadata + let expected_metadata = test_utils::parse_metadata_file(&format!( + 
"../test_files/expected_result/{}.metadata.json", + file_name + )); + + assert!(test_utils::is_expected_metadata_contained( + &expected_metadata, + &extracted_metadata + )); +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index f9f8e4a..327fbe7 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -16,12 +16,14 @@ import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.graalvm.nativeimage.IsolateThread; import org.graalvm.nativeimage.c.function.CEntryPoint; import org.graalvm.nativeimage.c.type.CCharPointer; import org.graalvm.nativeimage.c.type.CConst; import org.graalvm.nativeimage.c.type.CTypeConversion; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import java.io.IOException; @@ -74,7 +76,9 @@ public static StringResult parseFileToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML + // maybe replace with a single config class ) { try { final Path path = Paths.get(filePath); @@ -82,10 +86,9 @@ public static StringResult parseFileToString( final InputStream stream = TikaInputStream.get(path, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); - } catch (java.io.IOException e) { return new StringResult((byte) 1, "Could not open file: " + 
e.getMessage()); } catch (TikaException e) { @@ -104,7 +107,8 @@ public static StringResult parseUrlToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final URL url = new URI(urlString).toURL(); @@ -112,7 +116,7 @@ public static StringResult parseUrlToString( final TikaInputStream stream = TikaInputStream.get(url, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); @@ -138,7 +142,8 @@ public static StringResult parseBytesToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { final Metadata metadata = new Metadata(); final ByteBufferInputStream inStream = new ByteBufferInputStream(data); @@ -146,7 +151,7 @@ public static StringResult parseBytesToString( try { String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); } catch (java.io.IOException e) { @@ -156,16 +161,24 @@ public static StringResult parseBytesToString( } } - private static String parseToStringWithConfig( InputStream stream, Metadata metadata, int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) throws IOException, TikaException { - final WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); + ContentHandler handler; + 
ContentHandler handlerForParser; + if (asXML) { + handler = new WriteOutContentHandler(new ToXMLContentHandler(), maxLength); + handlerForParser = handler; + } else { + handler = new WriteOutContentHandler(maxLength); + handlerForParser = new BodyContentHandler(handler); + } try { final TikaConfig config = TikaConfig.getDefaultConfig(); @@ -177,8 +190,7 @@ private static String parseToStringWithConfig( parsecontext.set(OfficeParserConfig.class, officeConfig); parsecontext.set(TesseractOCRConfig.class, tesseractConfig); - parser.parse(stream, new BodyContentHandler(handler), metadata, parsecontext); - + parser.parse(stream, handlerForParser, metadata, parsecontext); } catch (SAXException e) { if (!WriteLimitReachedException.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... @@ -336,4 +348,4 @@ private static CCharPointer cParseToString(IsolateThread thread, @CConst CCharPo } } -} \ No newline at end of file +} diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json index bd46a15..08962c8 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json @@ -113,7 +113,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +134,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +155,8 @@ "int", 
"org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], @@ -5940,4 +5943,4 @@ "type": "java.util.concurrent.locks.ReentrantLock$Sync" } ] -} \ No newline at end of file +} diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json index 8285d7d..93757ce 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json @@ -119,7 +119,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -139,7 +140,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -159,7 +161,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], @@ -6053,4 +6056,4 @@ "type": "java.util.concurrent.locks.ReentrantLock$Sync" } ] -} \ No newline at end of file +} diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json index f703e05..3cbb0e1 100644 --- 
a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json @@ -113,7 +113,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +134,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +155,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], @@ -6142,4 +6145,4 @@ "type": "java.util.concurrent.locks.ReentrantLock$Sync" } ] -} \ No newline at end of file +}