Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core: fix ChatPromptTemplate doesn't accept PDF data as bytes #28011

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions libs/community/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/community/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ dataclasses-json = ">= 0.5.7, < 0.7"
pydantic-settings = "^2.4.0"
langsmith = "^0.1.125"
httpx-sse = "^0.4.0"
pypdf = "^5.1.0"
[[tool.poetry.dependencies.numpy]]
version = ">=1.22.4,<2"
python = "<3.12"
Expand Down
1 change: 1 addition & 0 deletions libs/community/tests/unit_tests/test_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"langchain-core",
"langsmith",
"numpy",
"pypdf",
"python",
"requests",
"pydantic-settings",
Expand Down
1 change: 1 addition & 0 deletions libs/core/extended_testing_deps.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
jinja2>=3,<4
pypdf>=5,<6
29 changes: 29 additions & 0 deletions libs/core/langchain_core/prompts/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import base64
import io
from abc import ABC, abstractmethod
from collections.abc import Sequence
from pathlib import Path
Expand All @@ -22,6 +24,7 @@
SkipValidation,
model_validator,
)
from pypdf import PdfReader

from langchain_core._api import deprecated
from langchain_core.load import Serializable
Expand All @@ -48,6 +51,22 @@
from langchain_core.utils.interactive_env import is_interactive_env


def extract_pdf_text(pdf_data: str) -> str:
    """Extract the text of a base64-encoded PDF.

    Args:
        pdf_data: The PDF file contents, encoded as a base64 string.

    Returns:
        The text of every page in page order, with a newline appended after
        each page's text.

    Raises:
        binascii.Error: If ``pdf_data`` is not correctly padded base64.
        Errors raised by ``PdfReader`` for unreadable PDF data propagate
        unchanged.
    """
    # Decode the base64 payload back into the raw PDF bytes.
    pdf_bytes = base64.b64decode(pdf_data)

    # PdfReader is given a file-like object, so wrap the bytes in an
    # in-memory buffer that is closed as soon as extraction finishes.
    with io.BytesIO(pdf_bytes) as pdf_file:
        reader = PdfReader(pdf_file)
        # Join once instead of growing a string with ``+=`` for every page;
        # the generator is fully consumed while the buffer is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)


class BaseMessagePromptTemplate(Serializable, ABC):
"""Base class for message prompt templates."""

Expand Down Expand Up @@ -468,6 +487,10 @@ class _ImageTemplateParam(TypedDict, total=False):
image_url: Union[str, dict]


# Shape of an inline PDF template part. ``data`` carries the PDF contents as a
# base64-encoded string: from_template reads this key and hands it to
# extract_pdf_text, which base64-decodes it. ``total=False`` makes the key
# optional for type-checking purposes.
class _PdfTemplateParam(TypedDict, total=False):
data: str


class _StringImageMessagePromptTemplate(BaseMessagePromptTemplate):
"""Human message prompt template. This is a message sent from the user."""

Expand Down Expand Up @@ -571,6 +594,12 @@ def from_template(
msg = f"Invalid image template: {tmpl}"
raise ValueError(msg)
prompt.append(img_template_obj)
elif isinstance(tmpl, dict) and "data" in tmpl:
if tmpl.get("mime_type") == "application/pdf":
pdf_template = cast(_PdfTemplateParam, tmpl)["data"]
pdf_text = extract_pdf_text(pdf_template)

prompt.append(PromptTemplate.from_template(pdf_text))
else:
msg = f"Invalid template: {tmpl}"
raise ValueError(msg)
Expand Down
26 changes: 24 additions & 2 deletions libs/core/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ jsonpatch = "^1.33"
PyYAML = ">=5.3"
packaging = ">=23.2,<25"
typing-extensions = ">=4.7"
pypdf = "^5.1.0"
[[tool.poetry.dependencies.pydantic]]
version = "^2.5.2"
python = "<3.12.4"
Expand Down
30 changes: 30 additions & 0 deletions libs/core/tests/unit_tests/prompts/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,36 @@
from tests.unit_tests.pydantic_utils import _normalize_schema


def test_create_chat_prompt_template() -> None:
"""Test chat prompt with pdf data as bytes."""
file_path = (
Path(__file__).parent.parent.parent.parent.parent
/ "community/tests/examples/hello.pdf"
)

with open(file_path, "rb") as file:
file_data = file.read()

pdf_data = base64.b64encode(file_data).decode("utf-8")
prompt = ChatPromptTemplate(
[
(
"human",
[
{"type": "media", "mime_type": "application/pdf", "data": pdf_data},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which chat model supports this?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Efriis, I just wanted to make sure that we understood the question correctly! If you’re asking which chat models this implementation would be compatible with, it should work with all chat models because it is sending the pdf data as text. But please let me know if you were looking for something else!

],
)
]
)

expected_prompt = PromptTemplate(template="Hello world!\n1\n", input_variables=[])

assert len(prompt.messages) == 1
output_prompt = prompt.messages[0]
assert isinstance(output_prompt, HumanMessagePromptTemplate)
assert output_prompt.prompt == [expected_prompt]


@pytest.fixture
def messages() -> list[BaseMessagePromptTemplate]:
"""Create messages."""
Expand Down
Loading
Loading