Skip to content

Commit

Permalink
Merge branch 'master' into langchain-pipeshift
Browse files Browse the repository at this point in the history
  • Loading branch information
efriis authored Jan 22, 2025
2 parents 31435af + e723882 commit 47ff6e5
Show file tree
Hide file tree
Showing 186 changed files with 6,369 additions and 3,232 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First we prepare an example table with non-default schema, and populate it with some arbitary data."
"First we prepare an example table with non-default schema, and populate it with some arbitrary data."
]
},
{
Expand Down
1,188 changes: 1,154 additions & 34 deletions docs/docs/integrations/document_loaders/pymupdf.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/docs/integrations/text_embedding/huggingfacehub.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install --upgrade --quiet langchain sentence_transformers"
"%pip install --upgrade --quiet langchain langchain-huggingface sentence_transformers"
]
},
{
Expand Down
4 changes: 4 additions & 0 deletions docs/vercel.json
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@
{
"source": "/docs/integrations/retrievers/weaviate-hybrid(/?)",
"destination": "/docs/integrations/vectorstores/weaviate/#search-mechanism"
},
{
"source": "/api_reference/mongodb/:path(.*/?)*",
"destination": "https://langchain-mongodb.readthedocs.io/en/latest/langchain_mongodb/api_docs.html"
}
]
}
2 changes: 2 additions & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
pdfminer-six>=20221105,<20240706
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from langchain_community.document_loaders.parsers.html import (
BS4HTMLParser,
)
from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
LLMImageBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.language import (
LanguageParser,
)
Expand All @@ -35,15 +41,19 @@
# Maps each parser class name to the dotted path of the module that defines it.
# NOTE(review): presumably consulted by the module-level `__getattr__` to import
# parsers on demand -- its body is not visible in this view; confirm.
_module_lookup = {
    "AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence",  # noqa: E501
    "BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
    "BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
    "DocAIParser": "langchain_community.document_loaders.parsers.docai",
    "GrobidParser": "langchain_community.document_loaders.parsers.grobid",
    "LanguageParser": "langchain_community.document_loaders.parsers.language",
    "LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
    "OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
    "PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
    "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
    "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
    "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
    "VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
}

Expand All @@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:

# Names exported by `from <this package> import *`; each entry has a matching
# key in `_module_lookup`.
__all__ = [
    "AzureAIDocumentIntelligenceParser",
    "BaseImageBlobParser",
    "BS4HTMLParser",
    "DocAIParser",
    "GrobidParser",
    "LanguageParser",
    "LLMImageBlobParser",
    "OpenAIWhisperParser",
    "PDFMinerParser",
    "PDFPlumberParser",
    "PyMuPDFParser",
    "PyPDFParser",
    "PyPDFium2Parser",
    "RapidOCRBlobParser",
    "TesseractBlobParser",
    "VsdxParser",
]
220 changes: 220 additions & 0 deletions libs/community/langchain_community/document_loaders/parsers/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import base64
import io
import logging
from abc import abstractmethod
from typing import TYPE_CHECKING, Iterable, Iterator

import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
from PIL.Image import Image

logger = logging.getLogger(__name__)


class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement ``_analyze_image``; ``lazy_parse`` decodes the blob
    into a PIL image, delegates to ``_analyze_image`` and wraps the returned
    text in a ``Document``.
    """

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a blob and yield a Document with the extracted text.

        A blob whose mimetype is ``application/x-npy`` is loaded with
        ``numpy.load`` and converted through ``PIL.Image.fromarray``; any other
        blob is opened with ``PIL.Image.open``.

        Args:
            blob: The blob to be parsed.

        Yields:
            A single document whose ``page_content`` is the text returned by
            ``_analyze_image`` and whose metadata is the blob metadata plus a
            ``source`` entry taken from ``blob.source``.

        Raises:
            ImportError: If the ``Pillow`` package is not installed.
        """
        # Guard only the Pillow import. Wrapping the whole body would catch an
        # ImportError raised by a subclass's `_analyze_image` (e.g. a missing
        # OCR backend) and misreport it as a missing Pillow installation.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        # The image is analyzed while the buffer is still open, because
        # `Image.open` reads pixel data lazily from the underlying stream.
        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            content = self._analyze_image(img)
            # Escape newlines so the extracted text stays on one log line.
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={**blob.metadata, **{"source": blob.source}},
            )


class RapidOCRBlobParser(BaseImageBlobParser):
    """Image blob parser that extracts text with the RapidOCR library.

    Attributes:
        ocr: The RapidOCR engine. It is ``None`` after construction and is
            created the first time an image is analyzed.
    """

    def __init__(self) -> None:
        """Create the parser; the OCR engine is instantiated on first use."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        Args:
            img: The image to be analyzed.

        Returns:
            The recognized text fragments joined with newlines and stripped of
            surrounding whitespace, or an empty string when nothing is
            recognized.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` is not installed.
        """
        if not self.ocr:
            # Import and build the engine only when it is actually needed.
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )

        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        # The text of each detection is read from index 1 of its entry.
        recognized = [entry[1] for entry in detections]
        return "\n".join(recognized).strip()


class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library.

    Attributes:
        langs: The list of language codes passed to Tesseract.
    """

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ):
        """Initialize the TesseractBlobParser.

        Args:
            langs: The languages to use for OCR. A single string is accepted
                and treated as one language code.
        """
        super().__init__()
        # A bare string is itself an iterable of characters: `list("eng")`
        # would give ["e", "n", "g"] and an invalid "e+n+g" language spec.
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content, stripped of surrounding whitespace.

        Raises:
            ImportError: If ``pytesseract`` is not installed.
        """
        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            ) from err
        # The configured language codes are joined with "+" into one spec.
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()


# Default prompt for `LLMImageBlobParser` (the default value of its `prompt`
# argument). The adjacent literals below are concatenated into one string; it
# asks the model for a retrieval-oriented summary plus all text found in the
# image, formatted as markdown.
_PROMPT_IMAGES_TO_DESCRIPTION: str = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "1. These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for retrieval\n"
    "2. extract all the text from the image. "
    "Do not exclude any content from the page.\n"
    "Format answer in markdown without explanatory text "
    "and without markdown delimiter ``` at the beginning. "
)


class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model: The chat model used to analyze the image.
        prompt: The prompt sent to the model together with the image.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ):
        """Initialize the LLMImageBlobParser.

        Args:
            model: The language model to use for analysis.
            prompt: The prompt to provide to the language model.
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is re-encoded as PNG, base64-encoded and sent to the model
        as a data URL alongside the text prompt in a single human message.

        Args:
            img: The image to be analyzed.

        Returns:
            The textual content returned by the model.

        Raises:
            AssertionError: If the model's response content is not a string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # NOTE(review): `format` here is the Python
                            # builtin, kept for backward compatibility. The
                            # prompt goes through `str.format`, so literal
                            # braces in a custom prompt must be doubled.
                            "text": self.prompt.format(format=format),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The bytes above are PNG-encoded, so the data
                                # URL must declare image/png, not image/jpeg.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        assert isinstance(result, str)
        return result
Loading

0 comments on commit 47ff6e5

Please sign in to comment.