Skip to content

Commit

Permalink
x
Browse files: browse the repository at this point in the history
  • Loading branch information
efriis committed Dec 14, 2024
1 parent 826c313 commit c6fbe24
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@


_module_lookup = {
"AdobePDFExtractionParser": "langchain_community.document_loaders.parsers.adobe_pdf_services", # noqa: E501
"AdobePDFExtractParser": "langchain_community.document_loaders.parsers.adobe_pdf_extract", # noqa: E501
"AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501
"BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,16 @@ def __init__(
mode: Literal["chunks", "json", "data"] = "chunks",
embed_figures: bool = True,
):
from adobe.pdfservices.operation.auth.service_principal_credentials import (
ServicePrincipalCredentials,
)
from adobe.pdfservices.operation.pdf_services import PDFServices
try:
from adobe.pdfservices.operation.auth.service_principal_credentials import (
ServicePrincipalCredentials,
)
from adobe.pdfservices.operation.pdf_services import PDFServices
except (ImportError, ModuleNotFoundError):
raise ImportError(
"The Adobe PDF Services SDK is not installed. Please install it using "
"`pip install pdfservices-sdk`."
)

self.credentials = ServicePrincipalCredentials(
client_id=client_id, client_secret=client_secret
Expand Down Expand Up @@ -85,27 +91,11 @@ def _generate_docs_chunks(
) -> Iterator[Document]:
headers: list = []
current_paragraphs: list = []
header_page = None
header_page = 0
paragraph_page = None
last_header_level = None
figures: dict = {}

def yield_chunk() -> Iterator[Document]:
if current_paragraphs:
merged_paragraphs = "".join([p for p in current_paragraphs]).strip()
yield Document(
page_content=merged_paragraphs,
metadata={
"headers": headers.copy(),
"page": header_page + 1
if header_page
else (paragraph_page or 0) + 1,
"figures": figures,
},
)
current_paragraphs.clear()
figures.clear()

for element in json_data["elements"]:
text = element.get("Text", "")
path = element.get("Path", "")
Expand All @@ -117,12 +107,25 @@ def yield_chunk() -> Iterator[Document]:
if "Figure" in path:
continue

if current_paragraphs:
merged_paragraphs = "".join([p for p in current_paragraphs]).strip()
yield Document(
page_content=merged_paragraphs,
metadata={
"headers": headers.copy(),
"page": header_page + 1
if header_page
else (paragraph_page or 0) + 1,
"figures": figures,
},
)
current_paragraphs.clear()
figures.clear()

if last_header_level and last_header_level < header_level:
yield from yield_chunk()
headers.append(text)

else:
yield from yield_chunk()
headers = headers[: header_level - 1]
headers.append(text)

Expand Down Expand Up @@ -158,7 +161,18 @@ def yield_chunk() -> Iterator[Document]:

elif "Table" not in path:
current_paragraphs.append(text + "\n")
yield from yield_chunk()
if current_paragraphs:
merged_paragraphs = "".join([p for p in current_paragraphs]).strip()
yield Document(
page_content=merged_paragraphs,
metadata={
"headers": headers.copy(),
"page": header_page + 1
if header_page
else (paragraph_page or 0) + 1,
"figures": figures,
},
)

def _generate_docs_data(self, archive: zipfile.ZipFile) -> Iterator[Document]:
from mimetypes import guess_type
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import pytest

from langchain_community.document_loaders import AdobePDFExtractLoader
from langchain_community.document_loaders.parsers import (
AdobePDFExtractParser,
)
Expand Down Expand Up @@ -41,26 +40,14 @@ def test_adobe_pdf_services(
def test_adobe_pdf_services_loader_modes(
mock_pdf_services: MagicMock, mock_credentials: MagicMock, mode: str
) -> None:
loader = AdobePDFExtractLoader(
blob_parser = AdobePDFExtractParser(
client_id="client_id",
client_secret="client_secret",
file_path="path/to/file.pdf",
mode=mode,
mode=mode, # type: ignore
embed_figures=True,
)
mock_credentials.assert_called_once_with(
client_id="client_id", client_secret="client_secret"
)
mock_pdf_services.assert_called_once_with(credentials=mock_credentials())
assert loader.parser.mode == mode


def test_adobe_pdf_services_loader_invalid_file_path() -> None:
with pytest.raises(AssertionError):
AdobePDFExtractLoader(
client_id="client_id",
client_secret="client_secret",
file_path=None, # type: ignore[arg-type]
mode="chunks",
embed_figures=True,
)
assert blob_parser.mode == mode

0 comments on commit c6fbe24

Please sign in to comment.