From 1db4def2df60edd08785e2ad6797c9f6d98edb39 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Fri, 13 Dec 2024 15:21:58 -0800 Subject: [PATCH] x --- .../document_loaders/parsers/pdf.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 4ff9a05d9e12a..c603dde71eb32 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -243,16 +243,32 @@ def get_image(layout_object: Any) -> Any: else: filter_names = [img_filter.name] - if any(name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names): + without_loss = any( + name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names + ) + with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names) + non_matching = {name for name in filter_names} - { + *_PDF_FILTER_WITHOUT_LOSS, + *_PDF_FILTER_WITH_LOSS, + } + + if without_loss and with_loss: + warnings.warn( + "Image has both lossy and lossless filters. Defaulting to lossless" + ) + + if non_matching: + warnings.warn(f"Unknown PDF Filter(s): {non_matching}") + + if without_loss: images.append( np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape( img.stream["Height"], img.stream["Width"], -1 ) ) - elif any(name in _PDF_FILTER_WITH_LOSS for name in filter_names): + elif with_loss: images.append(img.stream.get_data()) - else: - warnings.warn("Unknown PDF Filter!") + return extract_from_images_with_rapidocr(images)