Skip to content

Commit

Permalink
feat: adapted the python binding to work with the new extractor api
Browse files Browse the repository at this point in the history
  • Loading branch information
nmammeri committed Sep 11, 2024
1 parent b20209a commit 0f78452
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 17 deletions.
2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
pyo3 = { version = "0.21.2", features = ["abi3", "abi3-py38"] }
pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
extract-rs = { path = "../../extract-core" }
1 change: 1 addition & 0 deletions bindings/python/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions bindings/python/src/errors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

96 changes: 96 additions & 0 deletions bindings/python/src/extractor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use crate::ecore;
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyBytes;
use std::io::Read;

/// Character sets that can be selected from Python for extracted text.
///
/// PyO3 supports unit-only enums (enums that contain only unit variants).
/// These simple enums behave similarly to Python's enumerations (`enum.Enum`).
/// The `eq` / `eq_int` options expose equality to Python, based on the derived
/// `PartialEq` and on the integer discriminant respectively.
#[pyclass(eq, eq_int)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
// Variant names deliberately match the `ecore::CharSet` variants one-to-one,
// which is why they are not written in CamelCase.
#[allow(non_camel_case_types)]
pub enum CharSet {
    UTF_8,
    US_ASCII,
    UTF_16BE,
}

impl From<CharSet> for ecore::CharSet {
    /// Converts the Python-facing variant into the core variant of the same name.
    fn from(value: CharSet) -> Self {
        match value {
            CharSet::UTF_8 => Self::UTF_8,
            CharSet::US_ASCII => Self::US_ASCII,
            CharSet::UTF_16BE => Self::UTF_16BE,
        }
    }
}

/// Python-visible wrapper around the core `ecore::StreamReader`.
#[pyclass]
pub struct StreamReader(ecore::StreamReader);

#[pymethods]
impl StreamReader {
    /// Read up to `size` bytes from the wrapped reader.
    ///
    /// Returns a `bytes` object containing only the bytes that were actually
    /// read, so it may be shorter than `size`.
    ///
    /// Raises `IOError` carrying the reader's error message if the read fails.
    pub fn read<'py>(&mut self, py: Python<'py>, size: usize) -> PyResult<Bound<'py, PyBytes>> {
        let mut buf = vec![0u8; size];
        let bytes_read = self
            .0
            .read(&mut buf)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
        // Drop the unused zero-filled tail so Python only sees the bytes read.
        buf.truncate(bytes_read);
        Ok(PyBytes::new_bound(py, &buf))
    }
}

/// `Extractor` is the entry for all extract APIs
///
/// Create a new `Extractor` with the default configuration.
#[pyclass]
pub struct Extractor(ecore::Extractor);

#[pymethods]
impl Extractor {
#[new]
pub fn new() -> Self {
Extractor(ecore::Extractor::new())
}

pub fn set_extract_string_max_length(&self, max_length: i32) -> Self {
let inner = self.0.clone().set_extract_string_max_length(max_length);
Self(inner)
}

pub fn set_encoding(&self, encoding: CharSet) -> PyResult<Self> {
let inner = self.0.clone().set_encoding(encoding.into());
Ok(Self(inner))
}

// pub fn set_pdf_config(&self, config: ecore::PdfParserConfig) -> PyResult<Self> {
// let inner = self.0.clone().set_pdf_config(config);
// Ok(Self(inner))
// }

pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
self.0
.extract_file_to_string(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

pub fn extract_file(&self, filename: &str) -> PyResult<StreamReader> {
let reader = self
.0
.extract_file(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

Ok(StreamReader(reader))
}

fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
}
27 changes: 16 additions & 11 deletions bindings/python/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
use pyo3::exceptions::PyTypeError;
// Expose the extract-rs rust core as `ecore`.
// We will use `ecore::Xxx` to represent all types from the extract-rs rust core.
pub use ::extract_rs as ecore;
use pyo3::prelude::*;

/// Extract the content of the file at `filename` and return it as a string.
///
/// Errors returned by `extract_rs::extract` are rendered with `Debug` and
/// raised in Python as `TypeError`.
#[pyfunction]
fn extract(filename: &str) -> PyResult<String> {
    extract_rs::extract(filename).map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}
//use pyo3::exceptions::PyTypeError;

// Modules
mod errors;
//pub use errors::*;
mod extractor;
pub use extractor::*;
mod config;
//pub use config::*;

/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
#[pymodule]
fn _extractrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
// NOTE(review): this body appears to interleave lines from both sides of a
// diff (the `extract` function registration and the class registrations) —
// confirm which registrations belong to the current version.
// Registers the module-level `extract` function.
m.add_function(wrap_pyfunction!(extract, m)?)?;
// Registers the classes made visible through `pub use extractor::*`.
m.add_class::<CharSet>()?;
m.add_class::<StreamReader>()?;
m.add_class::<Extractor>()?;
Ok(())
}
}
28 changes: 23 additions & 5 deletions bindings/python/tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,27 @@
from extractrs import Extractor

from extractrs import extract

# Returns the text the tests in this file compare their extraction output against.
def expected_result():
return "\nHello Quarkus\n\n\n"


# Extracts tests/quarkus.pdf into a single string with a default `Extractor`
# and checks it equals `expected_result()`.
def test_extract_file_to_string():
extractor = Extractor()
result = extractor.extract_file_to_string("tests/quarkus.pdf")

# Printed so the extracted text is visible in the test output.
print(result)
assert result == expected_result()


def test_extract_file():
    """Read ``tests/quarkus.pdf`` through the streaming API into ``result``.

    The reader is drained in 4096-byte reads until it returns an empty
    ``bytes`` object; the collected bytes are then decoded as UTF-8.
    """
    extractor = Extractor()
    reader = extractor.extract_file("tests/quarkus.pdf")

    # Collect the raw bytes and decode once at the end: decoding every chunk
    # on its own raises UnicodeDecodeError whenever a multi-byte UTF-8
    # sequence happens to straddle a 4096-byte read boundary.
    chunks = []
    b = reader.read(4096)
    while len(b) > 0:
        chunks.append(b)
        b = reader.read(4096)
    result = b"".join(chunks).decode("utf-8")

# Extracts tests/quarkus.pdf with the module-level `extract` function, prints
# the text, and asserts on it.
# NOTE(review): this span appears to interleave removed and added diff lines;
# the two assertions may come from different versions of the file — confirm.
def test_pdf():
result = extract("tests/quarkus.pdf")
print("")
print(result)
assert result == "\nHello Quarkus\n\n\n"
assert result == expected_result()

0 comments on commit 0f78452

Please sign in to comment.