-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adapted the python binding to work with the new extractor api
- Loading branch information
Showing
6 changed files
with
138 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
use crate::ecore; | ||
use pyo3::exceptions::PyTypeError; | ||
use pyo3::prelude::*; | ||
use pyo3::types::PyBytes; | ||
use std::io::Read; | ||
|
||
// PyO3 supports unit-only enums (which contain only unit variants) | ||
// These simple enums behave similarly to Python's enumerations (enum.Enum) | ||
#[pyclass(eq, eq_int)] | ||
#[derive(Clone, PartialEq)] | ||
#[allow(non_camel_case_types)] | ||
pub enum CharSet { | ||
UTF_8, | ||
US_ASCII, | ||
UTF_16BE, | ||
} | ||
|
||
impl From<CharSet> for ecore::CharSet { | ||
fn from(charset: CharSet) -> Self { | ||
match charset { | ||
CharSet::UTF_8 => ecore::CharSet::UTF_8, | ||
CharSet::US_ASCII => ecore::CharSet::US_ASCII, | ||
CharSet::UTF_16BE => ecore::CharSet::UTF_16BE, | ||
} | ||
} | ||
} | ||
|
||
#[pyclass] | ||
pub struct StreamReader(ecore::StreamReader); | ||
|
||
#[pymethods] | ||
impl StreamReader { | ||
// Expose the `read` method as `read` in Python | ||
pub fn read<'py>(&mut self, py: Python<'py>, size: usize) -> PyResult<Bound<'py, PyBytes>> { | ||
let mut buf = vec![0u8; size]; | ||
match self.0.read(&mut buf) { | ||
Ok(bytes_read) => { | ||
buf.truncate(bytes_read); // Truncate buffer to actual read size | ||
let py_bytes = PyBytes::new_bound(py, &buf); | ||
Ok(py_bytes) | ||
} | ||
Err(e) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(format!( | ||
"{}", | ||
e | ||
))), | ||
} | ||
} | ||
} | ||
|
||
/// `Extractor` is the entry for all extract APIs | ||
/// | ||
/// Create a new `Extractor` with the default configuration. | ||
#[pyclass] | ||
pub struct Extractor(ecore::Extractor); | ||
|
||
#[pymethods] | ||
impl Extractor { | ||
#[new] | ||
pub fn new() -> Self { | ||
Extractor(ecore::Extractor::new()) | ||
} | ||
|
||
pub fn set_extract_string_max_length(&self, max_length: i32) -> Self { | ||
let inner = self.0.clone().set_extract_string_max_length(max_length); | ||
Self(inner) | ||
} | ||
|
||
pub fn set_encoding(&self, encoding: CharSet) -> PyResult<Self> { | ||
let inner = self.0.clone().set_encoding(encoding.into()); | ||
Ok(Self(inner)) | ||
} | ||
|
||
// pub fn set_pdf_config(&self, config: ecore::PdfParserConfig) -> PyResult<Self> { | ||
// let inner = self.0.clone().set_pdf_config(config); | ||
// Ok(Self(inner)) | ||
// } | ||
|
||
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> { | ||
self.0 | ||
.extract_file_to_string(filename) | ||
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e))) | ||
} | ||
|
||
pub fn extract_file(&self, filename: &str) -> PyResult<StreamReader> { | ||
let reader = self | ||
.0 | ||
.extract_file(filename) | ||
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?; | ||
|
||
Ok(StreamReader(reader)) | ||
} | ||
|
||
fn __repr__(&self) -> String { | ||
format!("{:?}", self.0) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,25 @@ | ||
use pyo3::exceptions::PyTypeError; | ||
// Expose the extract-rs rust core as `ecore`. | ||
// We will use `ecore::Xxx` to represent all types from the extract-rs Rust core. | ||
pub use ::extract_rs as ecore; | ||
use pyo3::prelude::*; | ||
|
||
/// extract content of file | ||
#[pyfunction] | ||
fn extract(filename: &str) -> PyResult<String> { | ||
match extract_rs::extract(filename) { | ||
Ok(content) => Ok(content), | ||
Err(e) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", e))) | ||
} | ||
} | ||
//use pyo3::exceptions::PyTypeError; | ||
|
||
// Modules | ||
mod errors; | ||
//pub use errors::*; | ||
mod extractor; | ||
pub use extractor::*; | ||
mod config; | ||
//pub use config::*; | ||
|
||
/// A Python module implemented in Rust. The name of this function must match | ||
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to | ||
/// import the module. | ||
#[pymodule] | ||
fn _extractrs(m: &Bound<'_, PyModule>) -> PyResult<()> { | ||
m.add_function(wrap_pyfunction!(extract, m)?)?; | ||
m.add_class::<CharSet>()?; | ||
m.add_class::<StreamReader>()?; | ||
m.add_class::<Extractor>()?; | ||
Ok(()) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,27 @@ | ||
from extractrs import Extractor | ||
|
||
from extractrs import extract | ||
|
||
def expected_result():
    """Return the text the tests expect from extracting ``tests/quarkus.pdf``."""
    expected_text = "\nHello Quarkus\n\n\n"
    return expected_text
|
||
|
||
def test_extract_file_to_string():
    """Extracting the sample PDF directly to a string yields the expected text."""
    text = Extractor().extract_file_to_string("tests/quarkus.pdf")

    print(text)
    assert text == expected_result()
|
||
|
||
def test_extract_file():
    """Reading the sample PDF through a stream reader yields the expected text."""
    extractor = Extractor()
    reader = extractor.extract_file("tests/quarkus.pdf")

    # Collect the raw bytes and decode once at the end: a 4096-byte read can
    # stop in the middle of a multi-byte UTF-8 sequence, and decoding such a
    # chunk on its own raises UnicodeDecodeError.
    data = bytearray()
    chunk = reader.read(4096)
    while len(chunk) > 0:
        data += chunk
        chunk = reader.read(4096)

    result = data.decode("utf-8")
    assert result == expected_result()
|
||
def test_pdf():
    """The module-level ``extract`` helper returns the sample PDF's text."""
    extracted = extract("tests/quarkus.pdf")
    print("")
    print(extracted)
    # Checked against both the literal and the shared helper, as in the original.
    assert extracted == "\nHello Quarkus\n\n\n"
    assert extracted == expected_result()