From 1db4def2df60edd08785e2ad6797c9f6d98edb39 Mon Sep 17 00:00:00 2001
From: Erick Friis <erick@langchain.dev>
Date: Fri, 13 Dec 2024 15:21:58 -0800
Subject: [PATCH] community: warn on unknown or mixed lossy/lossless PDF image filters

---
 .../document_loaders/parsers/pdf.py           | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 4ff9a05d9e12a..c603dde71eb32 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -243,16 +243,32 @@ def get_image(layout_object: Any) -> Any:
             else:
                 filter_names = [img_filter.name]
 
-            if any(name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names):
+            without_loss = any(
+                name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
+            )
+            with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
+            non_matching = {name for name in filter_names} - {
+                *_PDF_FILTER_WITHOUT_LOSS,
+                *_PDF_FILTER_WITH_LOSS,
+            }
+
+            if without_loss and with_loss:
+                warnings.warn(
+                    "Image has both lossy and lossless filters. Defaulting to lossless"
+                )
+
+            if non_matching:
+                warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
+
+            if without_loss:
                 images.append(
                     np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                         img.stream["Height"], img.stream["Width"], -1
                     )
                 )
-            elif any(name in _PDF_FILTER_WITH_LOSS for name in filter_names):
+            elif with_loss:
                 images.append(img.stream.get_data())
-            else:
-                warnings.warn("Unknown PDF Filter!")
+
         return extract_from_images_with_rapidocr(images)