From 4e0a6ebe7d6a3f404077817e483a1fb56b021d88 Mon Sep 17 00:00:00 2001 From: Erik Date: Thu, 19 Sep 2024 07:22:09 +0200 Subject: [PATCH] community: Add warning when page_content is empty (#25955) Page content is sometimes empty when PyMuPDF cannot find text on pages. For example, this can happen when the text of the PDF is not copyable "by hand". In that case an OCR solution is needed - which is not integrated here. This warning should accurately warn the user that some pages are lost during this process. Thank you for contributing to LangChain! - [ ] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [ ] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. 
- If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis --- .../document_loaders/parsers/pdf.py | 55 ++++++++++++++----- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 2013c948fb8af..d0329b7cc1bea 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -267,6 +267,7 @@ def __init__( def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" + import fitz with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] @@ -277,25 +278,49 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty yield from [ Document( - page_content=page.get_text(**self.text_kwargs) - + self._extract_images_from_page(doc, page), - metadata=dict( - { - "source": blob.source, # type: ignore[attr-defined] - "file_path": blob.source, # type: ignore[attr-defined] - "page": page.number, - "total_pages": len(doc), - }, - **{ - k: doc.metadata[k] - for k in doc.metadata - if type(doc.metadata[k]) in [str, int] - }, - ), + page_content=self._get_page_content(doc, page, blob), + metadata=self._extract_metadata(doc, page, blob), ) for page in doc ] + def _get_page_content( + self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob + ) -> str: + """ + Get the text of the page using PyMuPDF and RapidOCR and issue a warning + if it is empty. 
+ """ + content = page.get_text(**self.text_kwargs) + self._extract_images_from_page( + doc, page + ) + + if not content: + warnings.warn( + f"Warning: Empty content on page " + f"{page.number} of document {blob.source}" + ) + + return content + + def _extract_metadata( + self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob + ) -> dict: + """Extract metadata from the document and page.""" + return dict( + { + "source": blob.source, # type: ignore[attr-defined] + "file_path": blob.source, # type: ignore[attr-defined] + "page": page.number, + "total_pages": len(doc), + }, + **{ + k: doc.metadata[k] + for k in doc.metadata + if isinstance(doc.metadata[k], (str, int)) + }, + ) + def _extract_images_from_page( self, doc: fitz.fitz.Document, page: fitz.fitz.Page ) -> str: