diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py index 1eb6bc6d307c7..2da542d9cc552 100644 --- a/libs/community/langchain_community/document_loaders/parsers/__init__.py +++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py @@ -36,7 +36,7 @@ _module_lookup = { - "AdobePDFExtractionParser": "langchain_community.document_loaders.parsers.adobe_pdf_services", # noqa: E501 + "AdobePDFExtractParser": "langchain_community.document_loaders.parsers.adobe_pdf_extract", # noqa: E501 "AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501 "BS4HTMLParser": "langchain_community.document_loaders.parsers.html", "DocAIParser": "langchain_community.document_loaders.parsers.docai", diff --git a/libs/community/langchain_community/document_loaders/parsers/adobe_pdf_extract.py b/libs/community/langchain_community/document_loaders/parsers/adobe_pdf_extract.py index e283b9d9e4fdb..0c02a88651c7c 100644 --- a/libs/community/langchain_community/document_loaders/parsers/adobe_pdf_extract.py +++ b/libs/community/langchain_community/document_loaders/parsers/adobe_pdf_extract.py @@ -54,10 +54,16 @@ def __init__( mode: Literal["chunks", "json", "data"] = "chunks", embed_figures: bool = True, ): - from adobe.pdfservices.operation.auth.service_principal_credentials import ( - ServicePrincipalCredentials, - ) - from adobe.pdfservices.operation.pdf_services import PDFServices + try: + from adobe.pdfservices.operation.auth.service_principal_credentials import ( + ServicePrincipalCredentials, + ) + from adobe.pdfservices.operation.pdf_services import PDFServices + except (ImportError, ModuleNotFoundError): + raise ImportError( + "The Adobe PDF Services SDK is not installed. Please install it using " + "`pip install pdfservices-sdk`." + ) self.credentials = ServicePrincipalCredentials( client_id=client_id, client_secret=client_secret @@ -85,27 +91,11 @@ def _generate_docs_chunks( ) -> Iterator[Document]: headers: list = [] current_paragraphs: list = [] - header_page = None + header_page = 0 paragraph_page = None last_header_level = None figures: dict = {} - def yield_chunk() -> Iterator[Document]: - if current_paragraphs: - merged_paragraphs = "".join([p for p in current_paragraphs]).strip() - yield Document( - page_content=merged_paragraphs, - metadata={ - "headers": headers.copy(), - "page": header_page + 1 - if header_page - else (paragraph_page or 0) + 1, - "figures": figures, - }, - ) - current_paragraphs.clear() - figures.clear() - for element in json_data["elements"]: text = element.get("Text", "") path = element.get("Path", "") @@ -117,12 +107,25 @@ def yield_chunk() -> Iterator[Document]: if "Figure" in path: continue + if current_paragraphs: + merged_paragraphs = "".join([p for p in current_paragraphs]).strip() + yield Document( + page_content=merged_paragraphs, + metadata={ + "headers": headers.copy(), + "page": header_page + 1 + if header_page + else (paragraph_page or 0) + 1, + "figures": figures, + }, + ) + current_paragraphs.clear() + figures.clear() + if last_header_level and last_header_level < header_level: - yield from yield_chunk() headers.append(text) else: - yield from yield_chunk() headers = headers[: header_level - 1] headers.append(text) @@ -158,7 +161,18 @@ def yield_chunk() -> Iterator[Document]: elif "Table" not in path: current_paragraphs.append(text + "\n") - yield from yield_chunk() + if current_paragraphs: + merged_paragraphs = "".join([p for p in current_paragraphs]).strip() + yield Document( + page_content=merged_paragraphs, + metadata={ + "headers": headers.copy(), + "page": header_page + 1 + if header_page + else (paragraph_page or 0) + 1, + "figures": figures, + }, + ) def _generate_docs_data(self, archive: zipfile.ZipFile) -> Iterator[Document]: from mimetypes import guess_type diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_adobe_pdf_extract.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_adobe_pdf_extract.py index 5c01e85bf638d..49c21a3a6f25e 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/test_adobe_pdf_extract.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_adobe_pdf_extract.py @@ -4,7 +4,6 @@ import pytest -from langchain_community.document_loaders import AdobePDFExtractLoader from langchain_community.document_loaders.parsers import ( AdobePDFExtractParser, ) @@ -41,26 +40,14 @@ def test_adobe_pdf_services( def test_adobe_pdf_services_loader_modes( mock_pdf_services: MagicMock, mock_credentials: MagicMock, mode: str ) -> None: - loader = AdobePDFExtractLoader( + blob_parser = AdobePDFExtractParser( client_id="client_id", client_secret="client_secret", - file_path="path/to/file.pdf", - mode=mode, + mode=mode, # type: ignore embed_figures=True, ) mock_credentials.assert_called_once_with( client_id="client_id", client_secret="client_secret" ) mock_pdf_services.assert_called_once_with(credentials=mock_credentials()) - assert loader.parser.mode == mode - - -def test_adobe_pdf_services_loader_invalid_file_path() -> None: - with pytest.raises(AssertionError): - AdobePDFExtractLoader( - client_id="client_id", - client_secret="client_secret", - file_path=None, # type: ignore[arg-type] - mode="chunks", - embed_figures=True, - ) + assert blob_parser.mode == mode