From acf4358fa04c35a03f407bac583307bc7e859a77 Mon Sep 17 00:00:00 2001
From: Philippe Prados <github@prados.fr>
Date: Tue, 14 Jan 2025 10:23:32 +0100
Subject: [PATCH] Fix deprecated load() with kwargs

---
 .../document_loaders/parsers/pdf.py           | 20 ++++++++++++++++---
 .../document_loaders/pdf.py                   | 15 ++++++++++++--
 .../document_loaders/test_pdf.py              |  8 ++++++++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 27547fde44336..d421352881900 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -531,6 +531,15 @@ def __init__(
         self.extract_tables_settings = extract_tables_settings
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        return self._lazy_parse(
+            blob,
+        )
+
+    def _lazy_parse(
+        self,
+        blob: Blob,
+        text_kwargs: Optional[dict[str, Any]] = None,  # deprecated
+    ) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.
@@ -547,6 +556,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
         try:
             import pymupdf
 
+            if not text_kwargs:
+                text_kwargs = {}
             if not self.extract_tables_settings:
                 from pymupdf.table import (
                     DEFAULT_JOIN_TOLERANCE,
@@ -597,7 +608,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
                 doc_metadata = self._extract_metadata(doc, blob)
                 full_content = []
                 for page in doc:
-                    all_text = self._get_page_content(doc, page, blob).strip()
+                    all_text = self._get_page_content(doc, page, text_kwargs).strip()
                     if self.mode == "page":
                         yield Document(
                             page_content=all_text,
@@ -615,7 +626,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
                     )
 
     def _get_page_content(
-        self, doc: pymupdf.Document, page: pymupdf.Page, blob: Blob
+        self,
+        doc: pymupdf.Document,
+        page: pymupdf.Page,
+        text_kwargs: dict[str, Any],
     ) -> str:
         """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
         if it is empty.
@@ -628,7 +642,7 @@ def _get_page_content(
         Returns:
             str: The text content of the page.
         """
-        text_from_page = page.get_text(**self.text_kwargs)
+        text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs})
         images_from_page = self._extract_images_from_page(doc, page)
         tables_from_page = self._extract_tables_from_page(page)
         extras = []
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index 8a688068431cc..252016f58595d 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -544,17 +544,28 @@ def __init__(
             extract_tables_settings=extract_tables_settings,
         )
 
-    def lazy_load(self) -> Iterator[Document]:
+    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
         """Lazy load given path as pages or single document (see `mode`).
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.
         """
+        if kwargs:
+            logger.warning(
+                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
+                f" is deprecated. Please pass arguments during initialization instead."
+            )
         parser = self.parser
         if self.web_path:
             blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
         else:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from parser.lazy_parse(blob)
+        yield from parser._lazy_parse(blob, text_kwargs=kwargs)
+
+    def load(self, **kwargs: Any) -> list[Document]:
+        return list(self._lazy_load(**kwargs))
+
+    def lazy_load(self) -> Iterator[Document]:
+        yield from self._lazy_load()
 
 
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index c1b8e43caa7b3..a681dce8c59c0 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -237,3 +237,11 @@ def test_standard_parameters(
     assert loader.web_path == web_path
     assert loader.file_path != web_path
     assert len(docs) == 1
+
+
+def test_pymupdf_deprecated_kwards() -> None:
+    from langchain_community.document_loaders import PyMuPDFLoader
+
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PyMuPDFLoader(file_path=file_path)
+    loader.load(sort=True)  # deprecated runtime kwarg; passes if no exception