Skip to content

Commit

Permalink
Merge pull request #29 from yobix-ai/3-return-metadata-with-extractio…
Browse files Browse the repository at this point in the history
…n-result

change extractor api to return tuple of result and metadata
  • Loading branch information
nmammeri authored Nov 17, 2024
2 parents ef326fa + 5e32273 commit 6ee0cd6
Show file tree
Hide file tree
Showing 19 changed files with 426 additions and 350 deletions.
135 changes: 56 additions & 79 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,32 +136,16 @@ impl Extractor {
Ok(Self(inner))
}

/// Extracts text from a file path. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_file(&self, filename: &str) -> PyResult<StreamReader> {
let reader = self
.0
.extract_file(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Ok(StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
})
}

/// Extracts text from a file path. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_file_with_metadata<'py>(
pub fn extract_file<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_file_with_metadata(filename)
.extract_file(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Expand All @@ -176,58 +160,17 @@ impl Extractor {
))
}

/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
self.0
.extract_file_to_string(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from a file path. Returns a tuple with string and dict that is of maximum length
/// of the extractor's `extract_string_max_length` and the metadata.
pub fn extract_file_to_string_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_file_to_string_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
let slice = buffer.to_vec();
let reader = self
.0
.extract_bytes(&slice)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Ok(StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
})
}

/// Extracts text from a bytearray. Returns a tuple with stream of the extracted text
/// the stream is decoded using the extractor's `encoding` and tika metadata.
pub fn extract_bytes_with_metadata<'py>(
pub fn extract_bytes<'py>(
&self,
buffer: &Bound<'_, PyByteArray>,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let slice = buffer.to_vec();
let (reader, metadata) = self
.0
.extract_bytes_with_metadata(&slice)
.extract_bytes(&slice)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Expand All @@ -242,32 +185,16 @@ impl Extractor {
))
}

/// Extracts text from a url. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
let reader = self
.0
.extract_url(&url)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Ok(StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
})
}

/// Extracts text from a url. Returns a tuple with a stream of the extracted text
/// (the stream is decoded using the extractor's `encoding`) and the tika metadata.
pub fn extract_url_with_metadata<'py>(
pub fn extract_url<'py>(
&self,
url: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_url_with_metadata(&url)
.extract_url(&url)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
Expand All @@ -282,6 +209,56 @@ impl Extractor {
))
}


/// Extracts the text of the file at `filename` into a single string.
///
/// Returns a `(content, metadata)` tuple: the content string is of maximum length
/// of the extractor's `extract_string_max_length`, and the metadata is returned as a dict.
///
/// # Errors
/// An extraction failure is raised as a `PyTypeError` whose message is the debug
/// representation of the underlying error.
pub fn extract_file_to_string<'py>(
    &self,
    filename: &str,
    py: Python<'py>,
) -> PyResult<(String, PyObject)> {
    match self.0.extract_file_to_string(filename) {
        Ok((text, meta)) => {
            // Hand the metadata map over to Python as a dict
            let meta_dict = metadata_hashmap_to_pydict(py, &meta)?;
            Ok((text, meta_dict.into()))
        }
        Err(err) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", err))),
    }
}

/// Extracts text from a bytearray. Returns a tuple with a string that is of maximum length
/// of the extractor's `extract_string_max_length` and the metadata as dict.
///
/// # Errors
/// An extraction failure is raised as a `PyTypeError` whose message is the debug
/// representation of the underlying error.
pub fn extract_bytes_to_string<'py>(
    &self,
    buffer: &Bound<'_, PyByteArray>,
    py: Python<'py>,
) -> PyResult<(String, PyObject)> {
    // Copy the bytearray contents into an owned vector before handing them to the extractor
    let slice = buffer.to_vec();
    let (content, metadata) = self
        .0
        .extract_bytes_to_string(&slice)
        .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

    // Convert the extracted metadata into a Python dict
    let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
    Ok((content, py_metadata.into()))
}

/// Extracts the text found at `url` into a single string.
///
/// Returns a `(content, metadata)` tuple: the content string is of maximum length
/// of the extractor's `extract_string_max_length`, and the metadata is returned as a dict.
///
/// # Errors
/// An extraction failure is raised as a `PyTypeError` whose message is the debug
/// representation of the underlying error.
pub fn extract_url_to_string<'py>(
    &self,
    url: &str,
    py: Python<'py>,
) -> PyResult<(String, PyObject)> {
    let outcome = self.0.extract_url_to_string(url);
    let (text, meta) =
        outcome.map_err(|err| PyErr::new::<PyTypeError, _>(format!("{:?}", err)))?;

    // Hand the metadata map over to Python as a dict
    let meta_dict = metadata_hashmap_to_pydict(py, &meta)?;
    Ok((text, meta_dict.into()))
}

/// Python `repr()` hook: returns the debug representation of the wrapped extractor.
fn __repr__(&self) -> String {
    let inner = &self.0;
    format!("{:?}", inner)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray
from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \
is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand All @@ -21,50 +22,58 @@


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist):
def test_extract_bytes_to_string(file_name, target_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
reader = extractor.extract_bytes(file_bytes)
result = read_to_string(reader)
result, metadata = extractor.extract_bytes_to_string(file_bytes)

# Expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > target_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"

TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]


@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_to_stream(file_name, similarity_percent):
@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
file_bytes = read_file_to_bytearray(original_filepath)
extractor = Extractor()
_reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
file_bytes = read_file_to_bytearray(original_filepath)

extractor = Extractor()
reader, metadata = extractor.extract_bytes(file_bytes)
result = read_to_string(reader)

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > similarity_percent, \
assert percent_similarity > target_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

from extractous import Extractor
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained
from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand All @@ -19,44 +19,55 @@
#("eng-ocr.pdf", 0.9),
]


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist):
"""Test the extraction and comparison of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
extractor = Extractor()
result = extractor.extract_file_to_string(original_filepath)
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()

with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

# Extract
extractor = Extractor()
result, metadata = extractor.extract_file_to_string(original_filepath)

# Check extracted
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
#metadata.pop("dc:format")
assert is_expected_metadata_contained(expected_metadata, metadata)

TEST_CASES_METADATA = [
"2022_Q3_AAPL.pdf",
"science-exploration-1p.pptx",
"simple.odt",
"table-multi-row-column-cells-actual.csv",
"vodafone.xlsx",
"category-level.docx",
"simple.doc",
"simple.pptx",
"table-multi-row-column-cells.png",
"winter-sports.epub",
"bug_16.docx",
]

@pytest.mark.parametrize("file_name", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name):
"""Test the extraction and comparison of various file types."""
@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_stream(file_name, target_dist):
"""Test the extraction from a file path of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
extractor = Extractor()
_result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)

# Read expected
with open(expected_result_filepath, "r", encoding="utf8") as file:
expected = file.read()
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

#metadata.pop("dc:format")
assert is_expected_metadata_contained(expected_metadata, metadata)
# Extract
extractor = Extractor()
reader, metadata = extractor.extract_file(original_filepath)
result = read_to_string(reader)

# Check Expected
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

# Check metadata
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > target_dist, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
Loading

0 comments on commit 6ee0cd6

Please sign in to comment.