From acf4358fa04c35a03f407bac583307bc7e859a77 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 14 Jan 2025 10:23:32 +0100 Subject: [PATCH] Fix deprecated load() with kwargs --- .../document_loaders/parsers/pdf.py | 20 ++++++++++++++++--- .../document_loaders/pdf.py | 15 ++++++++++++-- .../document_loaders/test_pdf.py | 8 ++++++++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 27547fde44336..d421352881900 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -531,6 +531,15 @@ def __init__( self.extract_tables_settings = extract_tables_settings def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] + return self._lazy_parse( + blob, + ) + + def _lazy_parse( + self, + blob: Blob, + text_kwargs: Optional[dict[str, Any]] = None, # deprecated + ) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. 
@@ -547,6 +556,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty try: import pymupdf + if not text_kwargs: + text_kwargs = {} if not self.extract_tables_settings: from pymupdf.table import ( DEFAULT_JOIN_TOLERANCE, @@ -597,7 +608,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty doc_metadata = self._extract_metadata(doc, blob) full_content = [] for page in doc: - all_text = self._get_page_content(doc, page, blob).strip() + all_text = self._get_page_content(doc, page, text_kwargs).strip() if self.mode == "page": yield Document( page_content=all_text, @@ -615,7 +626,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty ) def _get_page_content( - self, doc: pymupdf.Document, page: pymupdf.Page, blob: Blob + self, + doc: pymupdf.Document, + page: pymupdf.Page, + text_kwargs: dict[str, Any], ) -> str: """Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. @@ -628,7 +642,7 @@ def _get_page_content( Returns: str: The text content of the page. """ - text_from_page = page.get_text(**self.text_kwargs) + text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs}) images_from_page = self._extract_images_from_page(doc, page) tables_from_page = self._extract_tables_from_page(page) extras = [] diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 8a688068431cc..252016f58595d 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -544,17 +544,28 @@ def __init__( extract_tables_settings=extract_tables_settings, ) - def lazy_load(self) -> Iterator[Document]: + def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: """Lazy load given path as pages or single document (see `mode`). Insert image, if possible, between two paragraphs. 
In this way, a paragraph can be continued on the next page. """ + if kwargs: + logger.warning( + f"Received runtime arguments {kwargs}. Passing runtime args to `load`" + f" is deprecated. Please pass arguments during initialization instead." + ) parser = self.parser if self.web_path: blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] else: blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] - yield from parser.lazy_parse(blob) + yield from parser._lazy_parse(blob, text_kwargs=kwargs) + + def load(self, **kwargs: Any) -> list[Document]: + return list(self._lazy_load(**kwargs)) + + def lazy_load(self) -> Iterator[Document]: + yield from self._lazy_load() # MathpixPDFLoader implementation taken largely from Daniel Gross's: diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index c1b8e43caa7b3..a681dce8c59c0 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -237,3 +237,11 @@ def test_standard_parameters( assert loader.web_path == web_path assert loader.file_path != web_path assert len(docs) == 1 + + +def test_pymupdf_deprecated_kwards() -> None: + from langchain_community.document_loaders import PyMuPDFLoader + + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = PyMuPDFLoader(file_path=file_path) + loader.load(sort=True)