Skip to content

Commit

Permalink
x
Browse files: browse the repository at this point in the history
  • Loading branch information
efriis committed Dec 13, 2024
1 parent 416e9f8 commit 1db4def
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -243,16 +243,32 @@ def get_image(layout_object: Any) -> Any:
else:
filter_names = [img_filter.name]

if any(name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names):
without_loss = any(
name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
)
with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
non_matching = {name for name in filter_names} - {
*_PDF_FILTER_WITHOUT_LOSS,
*_PDF_FILTER_WITH_LOSS,
}

if without_loss and with_loss:
warnings.warn(
"Image has both lossy and lossless filters. Defaulting to lossless"
)

if non_matching:
warnings.warn(f"Unknown PDF Filter(s): {non_matching}")

if without_loss:
images.append(
np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
img.stream["Height"], img.stream["Width"], -1
)
)
elif any(name in _PDF_FILTER_WITH_LOSS for name in filter_names):
elif with_loss:
images.append(img.stream.get_data())
else:
warnings.warn("Unknown PDF Filter!")

return extract_from_images_with_rapidocr(images)


Expand Down

0 comments on commit 1db4def

Please sign in to comment.