From 60123bef67287992bbf014dc991e3a64c8e2c60e Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:25:13 -0800 Subject: [PATCH 01/11] docs: fix trim_messages docstring (#27948) --- libs/core/langchain_core/messages/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/core/langchain_core/messages/utils.py b/libs/core/langchain_core/messages/utils.py index 0ea03e40d00d4..2fce9f7dbcaf7 100644 --- a/libs/core/langchain_core/messages/utils.py +++ b/libs/core/langchain_core/messages/utils.py @@ -590,7 +590,7 @@ def trim_messages( include_system: bool = False, text_splitter: Optional[Union[Callable[[str], list[str]], TextSplitter]] = None, ) -> list[BaseMessage]: - """Trim messages to be below a token count. + r"""Trim messages to be below a token count. trim_messages can be used to reduce the size of a chat history to a specified token count or specified message count. From 0f85dea8c89f00a72ef52043a6d2abad1cd0d1ae Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 7 Nov 2024 03:35:39 +0500 Subject: [PATCH 02/11] langchain-huggingface: use separate kwargs for queries and docs (#27857) Currently `encode_kwargs` is used for both documents and queries, which leads to wrong embeddings. E.g.: ```python model_kwargs = {"device": "cuda", "trust_remote_code": True} encode_kwargs = {"normalize_embeddings": False, "prompt_name": "s2p_query"} model = HuggingFaceEmbeddings( model_name="dunzhang/stella_en_400M_v5", model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, ) query_embedding = np.array( model.embed_query("What are some ways to reduce stress?",) ) document_embedding = np.array( model.embed_documents( [ "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.", "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.", ] ) ) print(model._client.similarity(query_embedding, document_embedding)) # output: tensor([[0.8421, 0.3317]], dtype=torch.float64) ``` But according to the [model card](https://huggingface.co/dunzhang/stella_en_400M_v5#sentence-transformers), it is expected to be used like this: ```python model_kwargs = {"device": "cuda", "trust_remote_code": True} encode_kwargs = {"normalize_embeddings": False} query_encode_kwargs = {"normalize_embeddings": False, "prompt_name": "s2p_query"} model = HuggingFaceEmbeddings( model_name="dunzhang/stella_en_400M_v5", model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_encode_kwargs=query_encode_kwargs, ) query_embedding = np.array( model.embed_query("What are some ways to reduce stress?", ) ) document_embedding = np.array( model.embed_documents( [ "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. 
Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.", "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.", ] ) ) print(model._client.similarity(query_embedding, document_embedding)) # tensor([[0.8398, 0.2990]], dtype=torch.float64) ``` --- .../embeddings/huggingface.py | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py index 180a9ed3b5e79..2bbc551f4e0b1 100644 --- a/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py +++ b/libs/partners/huggingface/langchain_huggingface/embeddings/huggingface.py @@ -36,9 +36,14 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): `prompts`, `default_prompt_name`, `revision`, `trust_remote_code`, or `token`. See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer""" encode_kwargs: Dict[str, Any] = Field(default_factory=dict) - """Keyword arguments to pass when calling the `encode` method of the Sentence - Transformer model, such as `prompt_name`, `prompt`, `batch_size`, `precision`, - `normalize_embeddings`, and more. + """Keyword arguments to pass when calling the `encode` method for the documents of + the Sentence Transformer model, such as `prompt_name`, `prompt`, `batch_size`, + `precision`, `normalize_embeddings`, and more. + See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode""" + query_encode_kwargs: Dict[str, Any] = Field(default_factory=dict) + """Keyword arguments to pass when calling the `encode` method for the query of + the Sentence Transformer model, such as `prompt_name`, `prompt`, `batch_size`, + `precision`, `normalize_embeddings`, and more. See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode""" multi_process: bool = False """Run encode() on multiple GPUs.""" @@ -65,11 +70,17 @@ def __init__(self, **kwargs: Any): protected_namespaces=(), ) - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. + def _embed( + self, texts: list[str], encode_kwargs: Dict[str, Any] + ) -> List[List[float]]: + """ + Embed texts using the HuggingFace transformer model. Args: texts: The list of texts to embed. + encode_kwargs: Keyword arguments to pass when calling the + `encode` method of the SentenceTransformer model. Returns: List of embeddings, one for each text. 
@@ -85,7 +96,7 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: embeddings = self._client.encode( texts, show_progress_bar=self.show_progress, - **self.encode_kwargs, # type: ignore + **encode_kwargs, # type: ignore ) if isinstance(embeddings, list): @@ -96,6 +107,17 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: return embeddings.tolist() + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Compute doc embeddings using a HuggingFace transformer model. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + return self._embed(texts, self.encode_kwargs) + def embed_query(self, text: str) -> List[float]: """Compute query embeddings using a HuggingFace transformer model. @@ -105,4 +127,9 @@ def embed_query(self, text: str) -> List[float]: Returns: Embeddings for the text. """ - return self.embed_documents([text])[0] + embed_kwargs = ( + self.query_encode_kwargs + if len(self.query_encode_kwargs) > 0 + else self.encode_kwargs + ) + return self._embed([text], embed_kwargs)[0] From 482c168b3e8ac982130bacc89fc9a32b67f765a3 Mon Sep 17 00:00:00 2001 From: takahashi <7567050+MasaYan24@users.noreply.github.com> Date: Thu, 7 Nov 2024 07:37:07 +0900 Subject: [PATCH 03/11] langchain_core: add `file_type` option to make file type default as `png` (#27855) langchain_core.runnables.graph_mermaid.draw_mermaid_png calls this function, but the Mermaid API returns JPEG by default. To be consistent, add the option `file_type` with the default `png` type. With this small change, I didn't add tests and docs.
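For illustration, a minimal sketch of how the new parameter is exercised. The helper is private (`draw_mermaid_png` is the usual entry point), so this direct call is hypothetical usage, not part of the patch:

```python
from langchain_core.runnables.graph_mermaid import _render_mermaid_using_api

# Render a tiny graph as WebP instead of the default PNG. The file_type value
# is forwarded to the Mermaid.INK API as the `type` query parameter.
image_bytes = _render_mermaid_using_api(
    "graph TD;\n    A-->B;",
    output_file_path="graph.webp",  # optional; the rendered bytes are also returned
    file_type="webp",
)
```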
--- libs/core/langchain_core/runnables/graph_mermaid.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/core/langchain_core/runnables/graph_mermaid.py b/libs/core/langchain_core/runnables/graph_mermaid.py index b0e7f3dd5de5e..c9b3025bc857d 100644 --- a/libs/core/langchain_core/runnables/graph_mermaid.py +++ b/libs/core/langchain_core/runnables/graph_mermaid.py @@ -2,7 +2,7 @@ import base64 import re from dataclasses import asdict -from typing import Optional +from typing import Literal, Optional from langchain_core.runnables.graph import ( CurveStyle, @@ -306,6 +306,7 @@ def _render_mermaid_using_api( mermaid_syntax: str, output_file_path: Optional[str] = None, background_color: Optional[str] = "white", + file_type: Optional[Literal["jpeg", "png", "webp"]] = "png", ) -> bytes: """Renders Mermaid graph using the Mermaid.INK API.""" try: @@ -329,7 +330,8 @@ def _render_mermaid_using_api( background_color = f"!{background_color}" image_url = ( - f"https://mermaid.ink/img/{mermaid_syntax_encoded}?bgColor={background_color}" + f"https://mermaid.ink/img/{mermaid_syntax_encoded}" + f"?type={file_type}&bgColor={background_color}" ) response = requests.get(image_url, timeout=10) if response.status_code == 200: From 90189f5639560d1c4923e39885d3c8eb3c94caf2 Mon Sep 17 00:00:00 2001 From: Martin Triska Date: Wed, 6 Nov 2024 23:44:34 +0100 Subject: [PATCH 04/11] community: Allow other than default parsers in SharePointLoader and OneDriveLoader (#27716) ## What this PR does ### Currently `O365BaseLoader` (and consequently both derived loaders) are limited to `pdf`, `doc`, `docx` files. - **Solution: here we introduce a _handlers_ attribute that allows for custom handlers to be passed in. This is done in _dict_ form:** **Example:** ```python from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser # PR for DocumentLoaderAsParser here: https://github.com/langchain-ai/langchain/pull/27749 from langchain_community.document_loaders.excel import UnstructuredExcelLoader xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged") # create dictionary mapping file types to handlers (parsers) handlers = { "doc": MsWordParser(), "pdf": PDFMinerParser(), "txt": TextParser(), "xlsx": xlsx_parser } loader = SharePointLoader(document_library_id="...", handlers=handlers # pass handlers to SharePointLoader ) documents = loader.load() # works the same in OneDriveLoader loader = OneDriveLoader(document_library_id="...", handlers=handlers ) ``` This dictionary is then passed to `MimeTypeBasedParser`, the same as in the [current implementation](https://github.com/langchain-ai/langchain/blob/5a2cfb49e045988d290a1c7e3a0c589d6b371694/libs/community/langchain_community/document_loaders/parsers/registry.py#L13). ### Currently `SharePointLoader` and `OneDriveLoader` are separate loaders that both inherit from `O365BaseLoader`. However, both of these implement the same functionality. The only differences are: - `SharePointLoader` requires argument `document_library_id` whereas `OneDriveLoader` requires `drive_id`. These are just different names for the same thing. - `SharePointLoader` implements significantly more features. 
- **Solution: `OneDriveLoader` is replaced with an empty shell just renaming `drive_id` to `document_library_id` and inheriting from `SharePointLoader`** **Dependencies:** None **Twitter handle:** @martintriska1 If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --- .../document_loaders/microsoft_onedrive.ipynb | 67 ++++++++-- .../microsoft_sharepoint.ipynb | 60 ++++++++- .../document_loaders/base_o365.py | 116 ++++++++++++++---- .../document_loaders/onedrive.py | 97 ++------------- .../document_loaders/sharepoint.py | 23 +--- 5 files changed, 227 insertions(+), 136 deletions(-) diff --git a/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb b/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb index b42c141f8fea4..20feef0f9cc3c 100644 --- a/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_onedrive.ipynb @@ -8,7 +8,7 @@ "\n", ">[Microsoft OneDrive](https://en.wikipedia.org/wiki/OneDrive) (formerly `SkyDrive`) is a file hosting service operated by Microsoft.\n", "\n", - "This notebook covers how to load documents from `OneDrive`. Currently, only docx, doc, and pdf files are supported.\n", + "This notebook covers how to load documents from `OneDrive`. By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n", "\n", "## Prerequisites\n", "1. Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n", @@ -77,15 +77,64 @@ "\n", "loader = OneDriveLoader(drive_id=\"YOUR DRIVE ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n", "documents = loader.load()\n", - "```\n" + "```\n", + "\n", + "#### 📑 Choosing supported file types and preferred parsers\n", + "By default `OneDriveLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n", + "```python\n", + "def _get_default_parser() -> BaseBlobParser:\n", + " \"\"\"Get default mime-type based parser.\"\"\"\n", + " return MimeTypeBasedParser(\n", + " handlers={\n", + " \"application/pdf\": PyMuPDFParser(),\n", + " \"text/plain\": TextParser(),\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n", + " MsWordParser()\n", + " ),\n", + " },\n", + " fallback_parser=None,\n", + " )\n", + "```\n", + "You can override this behavior by passing the `handlers` argument to `OneDriveLoader`. \n", + "Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n", + "or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. 
\n", + "Note that you must use either file extensions or MIME types exclusively and \n", + "cannot mix them.\n", + "\n", + "Do not include the leading dot for file extensions.\n", + "\n", + "```python\n", + "# using file extensions:\n", + "handlers = {\n", + " \"doc\": MsWordParser(),\n", + " \"pdf\": PDFMinerParser(),\n", + " \"mp3\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "# using MIME types:\n", + "handlers = {\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/pdf\": PDFMinerParser(),\n", + " \"audio/mpeg\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "loader = OneDriveLoader(document_library_id=\"...\",\n", + " handlers=handlers # pass handlers to OneDriveLoader\n", + " )\n", + "```\n", + "In case multiple file extensions map to the same MIME type, the last dictionary item will\n", + "apply.\n", + "Example:\n", + "```python\n", + "# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. SecondParser() will be used \n", + "# to parse all jpg/jpeg files.\n", + "handlers = {\n", + " \"jpg\": FirstParser(),\n", + " \"jpeg\": SecondParser()\n", + "}\n", + "```" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb index 930346675dfd9..b49abe39f5158 100644 --- a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb @@ -9,7 +9,7 @@ "\n", "> [Microsoft SharePoint](https://en.wikipedia.org/wiki/SharePoint) is a website-based collaboration system that uses workflow applications, “list” databases, and other web parts and security features to empower business teams to work together developed by Microsoft.\n", "\n", - "This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). Currently, only docx, doc, and pdf files are supported.\n", + "This notebook covers how to load documents from the [SharePoint Document Library](https://support.microsoft.com/en-us/office/what-is-a-document-library-3b5976dd-65cf-4c9e-bf5a-713c10ca2872). By default the document loader loads `pdf`, `doc`, `docx` and `txt` files. You can load other file types by providing appropriate parsers (see more below).\n", "\n", "## Prerequisites\n", "1. 
Register an application with the [Microsoft identity platform](https://learn.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) instructions.\n", @@ -100,7 +100,63 @@ "\n", "loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", object_ids=[\"ID_1\", \"ID_2\"], auth_with_token=True)\n", "documents = loader.load()\n", - "```\n" + "```\n", + "\n", + "#### 📑 Choosing supported file types and preferred parsers\n", + "By default `SharePointLoader` loads file types defined in [`document_loaders/parsers/registry`](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/parsers/registry.py#L10-L22) using the default parsers (see below).\n", + "```python\n", + "def _get_default_parser() -> BaseBlobParser:\n", + " \"\"\"Get default mime-type based parser.\"\"\"\n", + " return MimeTypeBasedParser(\n", + " handlers={\n", + " \"application/pdf\": PyMuPDFParser(),\n", + " \"text/plain\": TextParser(),\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\": (\n", + " MsWordParser()\n", + " ),\n", + " },\n", + " fallback_parser=None,\n", + " )\n", + "```\n", + "You can override this behavior by passing the `handlers` argument to `SharePointLoader`. \n", + "Pass a dictionary mapping either file extensions (like `\"doc\"`, `\"pdf\"`, etc.) \n", + "or MIME types (like `\"application/pdf\"`, `\"text/plain\"`, etc.) to parsers. \n", + "Note that you must use either file extensions or MIME types exclusively and \n", + "cannot mix them.\n", + "\n", + "Do not include the leading dot for file extensions.\n", + "\n", + "```python\n", + "# using file extensions:\n", + "handlers = {\n", + " \"doc\": MsWordParser(),\n", + " \"pdf\": PDFMinerParser(),\n", + " \"mp3\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "# using MIME types:\n", + "handlers = {\n", + " \"application/msword\": MsWordParser(),\n", + " \"application/pdf\": PDFMinerParser(),\n", + " \"audio/mpeg\": OpenAIWhisperParser()\n", + "}\n", + "\n", + "loader = SharePointLoader(document_library_id=\"...\",\n", + " handlers=handlers # pass handlers to SharePointLoader\n", + " )\n", + "```\n", + "In case multiple file extensions map to the same MIME type, the last dictionary item will\n", + "apply.\n", + "Example:\n", + "```python\n", + "# 'jpg' and 'jpeg' both map to 'image/jpeg' MIME type. 
SecondParser() will be used \n", + "# to parse all jpg/jpeg files.\n", + "handlers = {\n", + " \"jpg\": FirstParser(),\n", + " \"jpeg\": SecondParser()\n", + "}\n", + "```" ] } ], diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py index 44002842bf25d..5f89d0794fccd 100644 --- a/libs/community/langchain_community/document_loaders/base_o365.py +++ b/libs/community/langchain_community/document_loaders/base_o365.py @@ -3,26 +3,29 @@ from __future__ import annotations import logging +import mimetypes import os import tempfile from abc import abstractmethod -from enum import Enum from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union from pydantic import ( BaseModel, Field, FilePath, + PrivateAttr, SecretStr, ) from pydantic_settings import BaseSettings, SettingsConfigDict -from langchain_community.document_loaders.base import BaseLoader +from langchain_community.document_loaders.base import BaseBlobParser, BaseLoader from langchain_community.document_loaders.blob_loaders.file_system import ( FileSystemBlobLoader, ) from langchain_community.document_loaders.blob_loaders.schema import Blob +from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser +from langchain_community.document_loaders.parsers.registry import get_parser if TYPE_CHECKING: from O365 import Account @@ -46,24 +49,27 @@ class _O365TokenStorage(BaseSettings): token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt" -class _FileType(str, Enum): - DOC = "doc" - DOCX = "docx" - PDF = "pdf" +def fetch_mime_types(file_types: Sequence[str]) -> Dict[str, str]: + """Fetch the mime types for the specified file types.""" + mime_types_mapping = {} + for ext in file_types: + mime_type, _ = mimetypes.guess_type(f"file.{ext}") + if mime_type: + mime_types_mapping[ext] = mime_type + else: + raise ValueError(f"Unknown mimetype of extension {ext}") + return mime_types_mapping -def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]: +def fetch_extensions(mime_types: Sequence[str]) -> Dict[str, str]: """Fetch the file extensions for the specified mime types.""" mime_types_mapping = {} - for file_type in file_types: - if file_type.value == "doc": - mime_types_mapping[file_type.value] = "application/msword" - elif file_type.value == "docx": - mime_types_mapping[file_type.value] = ( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" # noqa: E501 - ) - elif file_type.value == "pdf": - mime_types_mapping[file_type.value] = "application/pdf" + for mime_type in mime_types: + ext = mimetypes.guess_extension(mime_type) + if ext: + mime_types_mapping[ext[1:]] = mime_type # ignore leading `.` + else: + raise ValueError(f"Unknown mimetype {mime_type}") return mime_types_mapping @@ -78,16 +84,82 @@ class O365BaseLoader(BaseLoader, BaseModel): """Number of bytes to retrieve from each api call to the server. int or 'auto'.""" recursive: bool = False """Should the loader recursively load subfolders?""" + handlers: Optional[Dict[str, Any]] = {} + """ + Provide custom handlers for MimeTypeBasedParser. + + Pass a dictionary mapping either file extensions (like "doc", "pdf", etc.) + or MIME types (like "application/pdf", "text/plain", etc.) to parsers. + Note that you must use either file extensions or MIME types exclusively and + cannot mix them. 
+ + Do not include the leading dot for file extensions. + + Example using file extensions: + ```python + handlers = { + "doc": MsWordParser(), + "pdf": PDFMinerParser(), + "txt": TextParser() + } + ``` + + Example using MIME types: + ```python + handlers = { + "application/msword": MsWordParser(), + "application/pdf": PDFMinerParser(), + "text/plain": TextParser() + } + ``` + """ + + _blob_parser: BaseBlobParser = PrivateAttr() + _file_types: Sequence[str] = PrivateAttr() + _mime_types: Dict[str, str] = PrivateAttr() + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + if self.handlers: + handler_keys = list(self.handlers.keys()) + try: + # assume handlers.keys() are file extensions + self._mime_types = fetch_mime_types(handler_keys) + self._file_types = list(set(handler_keys)) + mime_handlers = { + self._mime_types[extension]: handler + for extension, handler in self.handlers.items() + } + except ValueError: + try: + # assume handlers.keys() are mime types + self._mime_types = fetch_extensions(handler_keys) + self._file_types = list(set(self._mime_types.keys())) + mime_handlers = self.handlers + except ValueError: + raise ValueError( + "`handlers` keys must be either file extensions or mimetypes.\n" + f"{handler_keys} could not be interpreted as either.\n" + "File extensions and mimetypes cannot mix. " + "Use either one or the other" + ) - @property - @abstractmethod - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types.""" + self._blob_parser = MimeTypeBasedParser( + handlers=mime_handlers, fallback_parser=None + ) + else: + self._blob_parser = get_parser("default") + if not isinstance(self._blob_parser, MimeTypeBasedParser): + raise TypeError( + 'get_parser("default") was supposed to return MimeTypeBasedParser. ' + f"It returned {type(self._blob_parser)}" + ) + self._mime_types = fetch_extensions(list(self._blob_parser.handlers.keys())) @property def _fetch_mime_types(self) -> Dict[str, str]: """Return a dict of supported file types to corresponding mime types.""" - return fetch_mime_types(self._file_types) + return self._mime_types @property @abstractmethod diff --git a/libs/community/langchain_community/document_loaders/onedrive.py b/libs/community/langchain_community/document_loaders/onedrive.py index ecc1d232bfe4c..e0369233c22bf 100644 --- a/libs/community/langchain_community/document_loaders/onedrive.py +++ b/libs/community/langchain_community/document_loaders/onedrive.py @@ -1,94 +1,19 @@ -"""Loads data from OneDrive""" +from typing import Any -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence, Union - -from langchain_core.documents import Document from pydantic import Field -from langchain_community.document_loaders.base_o365 import ( - O365BaseLoader, - _FileType, -) -from langchain_community.document_loaders.parsers.registry import get_parser - -if TYPE_CHECKING: - from O365.drive import Drive, Folder - -logger = logging.getLogger(__name__) +from langchain_community.document_loaders import SharePointLoader -class OneDriveLoader(O365BaseLoader): - """Load from `Microsoft OneDrive`.""" +class OneDriveLoader(SharePointLoader): + """ + Load documents from Microsoft OneDrive. + Uses `SharePointLoader` under the hood. + """ drive_id: str = Field(...) 
- """ The ID of the OneDrive drive to load data from.""" - folder_path: Optional[str] = None - """ The path to the folder to load data from.""" - object_ids: Optional[List[str]] = None - """ The IDs of the objects to load data from.""" - - @property - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types.""" - return _FileType.DOC, _FileType.DOCX, _FileType.PDF - - @property - def _scopes(self) -> List[str]: - """Return required scopes.""" - return ["offline_access", "Files.Read.All"] - - def _get_folder_from_path(self, drive: Drive) -> Union[Folder, Drive]: - """ - Returns the folder or drive object located at the - specified path relative to the given drive. - - Args: - drive (Drive): The root drive from which the folder path is relative. - - Returns: - Union[Folder, Drive]: The folder or drive object - located at the specified path. - - Raises: - FileNotFoundError: If the path does not exist. - """ - - subfolder_drive = drive - if self.folder_path is None: - return subfolder_drive - - subfolders = [f for f in self.folder_path.split("/") if f != ""] - if len(subfolders) == 0: - return subfolder_drive - - items = subfolder_drive.get_items() - for subfolder in subfolders: - try: - subfolder_drive = list(filter(lambda x: subfolder in x.name, items))[0] - items = subfolder_drive.get_items() - except (IndexError, AttributeError): - raise FileNotFoundError("Path {} not exist.".format(self.folder_path)) - return subfolder_drive + """The ID of the OneDrive drive to load data from.""" - def lazy_load(self) -> Iterator[Document]: - """Load documents lazily. Use this when working at a large scale.""" - try: - from O365.drive import Drive - except ImportError: - raise ImportError( - "O365 package not found, please install it with `pip install o365`" - ) - drive = self._auth().storage().get_drive(self.drive_id) - if not isinstance(drive, Drive): - raise ValueError(f"There isn't a Drive with id {self.drive_id}.") - blob_parser = get_parser("default") - if self.folder_path: - folder = self._get_folder_from_path(drive) - for blob in self._load_from_folder(folder): - yield from blob_parser.lazy_parse(blob) - if self.object_ids: - for blob in self._load_from_object_ids(drive, self.object_ids): - yield from blob_parser.lazy_parse(blob) + def __init__(self, **kwargs: Any) -> None: + kwargs["document_library_id"] = kwargs["drive_id"] + super().__init__(**kwargs) diff --git a/libs/community/langchain_community/document_loaders/sharepoint.py b/libs/community/langchain_community/document_loaders/sharepoint.py index 06426a7038fdd..6d5a820248e5f 100644 --- a/libs/community/langchain_community/document_loaders/sharepoint.py +++ b/libs/community/langchain_community/document_loaders/sharepoint.py @@ -4,7 +4,7 @@ import json from pathlib import Path -from typing import Any, Iterator, List, Optional, Sequence +from typing import Any, Dict, Iterator, List, Optional import requests # type: ignore from langchain_core.document_loaders import BaseLoader @@ -13,9 +13,7 @@ from langchain_community.document_loaders.base_o365 import ( O365BaseLoader, - _FileType, ) -from langchain_community.document_loaders.parsers.registry import get_parser class SharePointLoader(O365BaseLoader, BaseLoader): @@ -36,14 +34,6 @@ class SharePointLoader(O365BaseLoader, BaseLoader): load_extended_metadata: Optional[bool] = False """ Whether to load extended metadata. Size, Owner and full_path.""" - @property - def _file_types(self) -> Sequence[_FileType]: - """Return supported file types. 
- Returns: - A sequence of supported file types. - """ - return _FileType.DOC, _FileType.DOCX, _FileType.PDF - @property def _scopes(self) -> List[str]: """Return required scopes. @@ -67,7 +57,6 @@ def lazy_load(self) -> Iterator[Document]: drive = self._auth().storage().get_drive(self.document_library_id) if not isinstance(drive, Drive): raise ValueError(f"There isn't a Drive with id {self.document_library_id}.") - blob_parser = get_parser("default") if self.folder_path: target_folder = drive.get_item_by_path(self.folder_path) if not isinstance(target_folder, Folder): @@ -79,7 +68,7 @@ def lazy_load(self) -> Iterator[Document]: if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) extended_metadata.update({"source_full_url": target_folder.web_url}) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ -96,7 +85,7 @@ def lazy_load(self) -> Iterator[Document]: if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) extended_metadata.update({"source_full_url": target_folder.web_url}) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ -109,7 +98,7 @@ def lazy_load(self) -> Iterator[Document]: auth_identities = self.authorized_identities(file_id) if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) - for parsed_blob in blob_parser.lazy_parse(blob): + for parsed_blob in self._blob_parser.lazy_parse(blob): if self.load_auth is True: parsed_blob.metadata["authorized_identities"] = auth_identities if self.load_extended_metadata is True: @@ -126,7 +115,7 @@ def lazy_load(self) -> Iterator[Document]: auth_identities = self.authorized_identities(file_id) if self.load_extended_metadata is True: extended_metadata = self.get_extended_metadata(file_id) - for blob_part in blob_parser.lazy_parse(blob): + for blob_part in self._blob_parser.lazy_parse(blob): blob_part.metadata.update(blob.metadata) if self.load_auth is True: blob_part.metadata["authorized_identities"] = auth_identities @@ -182,7 +171,7 @@ def _fetch_access_token(self) -> Any: data = json.loads(s) return data - def get_extended_metadata(self, file_id: str) -> dict: + def get_extended_metadata(self, file_id: str) -> Dict: """ Retrieve extended metadata for a file in SharePoint. 
As of today, following fields are supported in the extended metadata: From 2494deb2a4bac23888cfa3763567336018660636 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Wed, 6 Nov 2024 16:31:10 -0800 Subject: [PATCH 05/11] infra: remove google creds from release and integration test workflows (#27950) --- .github/workflows/_integration_test.yml | 6 ------ .github/workflows/_release.yml | 6 ------ 2 files changed, 12 deletions(-) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 60a74ff08734d..d2157eb495630 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -41,12 +41,6 @@ jobs: shell: bash run: poetry run pip install "boto3<2" "google-cloud-aiplatform<2" - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: google-github-actions/auth@v2 - with: - credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' - - name: Run integration tests shell: bash env: diff --git a/.github/workflows/_release.yml b/.github/workflows/_release.yml index fd6a53fee85cb..c36607f62fee7 100644 --- a/.github/workflows/_release.yml +++ b/.github/workflows/_release.yml @@ -267,12 +267,6 @@ jobs: make tests working-directory: ${{ inputs.working-directory }} - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: google-github-actions/auth@v2 - with: - credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' - - name: Import integration test dependencies run: poetry install --with test,test_integration working-directory: ${{ inputs.working-directory }} From 81f7daa4580aaee45314e31b3461f23520f79bd1 Mon Sep 17 00:00:00 2001 From: Baptiste Pasquier <50556298+baptiste-pasquier@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:26:30 +0100 Subject: [PATCH 06/11] community: add InfinityRerank (#27043) **Description:** - Add a Reranker for Infinity server. **Dependencies:** This wrapper uses [infinity_client](https://github.com/michaelfeil/infinity/tree/main/libs/client_infinity/infinity_client) to connect to an Infinity server. **Tests and docs** - integration test: test_infinity_rerank.py - example notebook: infinity_rerank.ipynb [here](https://github.com/baptiste-pasquier/langchain/blob/feat/infinity-rerank/docs/docs/integrations/document_transformers/infinity_rerank.ipynb) --------- Co-authored-by: Erick Friis --- .../infinity_rerank.ipynb | 405 ++++++++++++++++++ .../document_compressors/__init__.py | 5 + .../document_compressors/infinity_rerank.py | 135 ++++++ .../test_infinity_rerank.py | 32 ++ .../document_compressors/test_imports.py | 1 + 5 files changed, 578 insertions(+) create mode 100644 docs/docs/integrations/document_transformers/infinity_rerank.ipynb create mode 100644 libs/community/langchain_community/document_compressors/infinity_rerank.py create mode 100644 libs/community/tests/integration_tests/document_compressors/test_infinity_rerank.py diff --git a/docs/docs/integrations/document_transformers/infinity_rerank.ipynb b/docs/docs/integrations/document_transformers/infinity_rerank.ipynb new file mode 100644 index 0000000000000..34368a28a0235 --- /dev/null +++ b/docs/docs/integrations/document_transformers/infinity_rerank.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Infinity Reranker\n", + "\n", + "`Infinity` is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. 
\n", + "For more info, please visit [here](https://github.com/michaelfeil/infinity?tab=readme-ov-file#reranking).\n", + "\n", + "This notebook shows how to use Infinity Reranker for document compression and retrieval. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can launch an Infinity Server with a reranker model in CLI:\n", + "\n", + "```bash\n", + "pip install \"infinity-emb[all]\"\n", + "infinity_emb v2 --model-id mixedbread-ai/mxbai-rerank-xsmall-v1\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet infinity_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet faiss\n", + "\n", + "# OR (depending on Python version)\n", + "\n", + "%pip install --upgrade --quiet faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function for printing docs\n", + "def pretty_print_docs(docs):\n", + " print(\n", + " f\"\\n{'-' * 100}\\n\".join(\n", + " [f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up the base vector store retriever\n", + "Let's start by initializing a simple vector store retriever and storing the 2023 State of the Union speech (in chunks). We can set up the retriever to retrieve a high number (20) of docs." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "We cannot let this happen. \n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 4:\n", + "\n", + "He will never extinguish their love of freedom. 
He will never weaken the resolve of the free world. \n", + "\n", + "We meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \n", + "\n", + "The pandemic has been punishing. \n", + "\n", + "And so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. \n", + "\n", + "I understand.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 5:\n", + "\n", + "As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n", + "\n", + "It’s time. \n", + "\n", + "But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. \n", + "\n", + "Inflation is robbing them of the gains they might otherwise feel. \n", + "\n", + "I get it. That’s why my top priority is getting prices under control.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 6:\n", + "\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 7:\n", + "\n", + "It’s not only the right thing to do—it’s the economically smart thing to do. \n", + "\n", + "That’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce. \n", + "\n", + "Let’s get it done once and for all. \n", + "\n", + "Advancing liberty and justice also requires protecting the rights of women. \n", + "\n", + "The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 8:\n", + "\n", + "I understand. \n", + "\n", + "I remember when my Dad had to leave our home in Scranton, Pennsylvania to find work. I grew up in a family where if the price of food went up, you felt it. \n", + "\n", + "That’s why one of the first things I did as President was fight to pass the American Rescue Plan. \n", + "\n", + "Because people were hurting. We needed to act, and we did. \n", + "\n", + "Few pieces of legislation have done more in a critical moment in our history to lift us out of crisis.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 9:\n", + "\n", + "Third – we can end the shutdown of schools and businesses. We have the tools we need. \n", + "\n", + "It’s time for Americans to get back to work and fill our great downtowns again. People working from home can feel safe to begin to return to the office. \n", + "\n", + "We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. \n", + "\n", + "Our schools are open. Let’s keep it that way. 
Our kids need to be in school.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 10:\n", + "\n", + "He met the Ukrainian people. \n", + "\n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n", + "\n", + "Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n", + "\n", + "In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 11:\n", + "\n", + "The widow of Sergeant First Class Heath Robinson. \n", + "\n", + "He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n", + "\n", + "Stationed near Baghdad, just yards from burn pits the size of football fields. \n", + "\n", + "Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. \n", + "\n", + "But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n", + "\n", + "Danielle says Heath was a fighter to the very end.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 12:\n", + "\n", + "Danielle says Heath was a fighter to the very end. \n", + "\n", + "He didn’t know how to stop fighting, and neither did she. \n", + "\n", + "Through her pain she found purpose to demand we do better. \n", + "\n", + "Tonight, Danielle—we are. \n", + "\n", + "The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n", + "\n", + "And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 13:\n", + "\n", + "We can do all this while keeping lit the torch of liberty that has led generations of immigrants to this land—my forefathers and so many of yours. \n", + "\n", + "Provide a pathway to citizenship for Dreamers, those on temporary status, farm workers, and essential workers. \n", + "\n", + "Revise our laws so businesses have the workers they need and families don’t wait decades to reunite. \n", + "\n", + "It’s not only the right thing to do—it’s the economically smart thing to do.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 14:\n", + "\n", + "He rejected repeated efforts at diplomacy. \n", + "\n", + "He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did. \n", + "\n", + "We prepared extensively and carefully. \n", + "\n", + "We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 15:\n", + "\n", + "As I’ve told Xi Jinping, it is never a good bet to bet against the American people. 
\n", + "\n", + "We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. \n", + "\n", + "And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 16:\n", + "\n", + "Tonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more. \n", + "\n", + "The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs. \n", + "\n", + "We are joining with our European allies to find and seize your yachts your luxury apartments your private jets. We are coming for your ill-begotten gains.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 17:\n", + "\n", + "Look at cars. \n", + "\n", + "Last year, there weren’t enough semiconductors to make all the cars that people wanted to buy. \n", + "\n", + "And guess what, prices of automobiles went up. \n", + "\n", + "So—we have a choice. \n", + "\n", + "One way to fight inflation is to drive down wages and make Americans poorer. \n", + "\n", + "I have a better plan to fight inflation. \n", + "\n", + "Lower your costs, not your wages. \n", + "\n", + "Make more cars and semiconductors in America. \n", + "\n", + "More infrastructure and innovation in America. \n", + "\n", + "More goods moving faster and cheaper in America.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 18:\n", + "\n", + "So that’s my plan. It will grow the economy and lower costs for families. \n", + "\n", + "So what are we waiting for? Let’s get this done. And while you’re at it, confirm my nominees to the Federal Reserve, which plays a critical role in fighting inflation. \n", + "\n", + "My plan will not only lower costs to give families a fair shot, it will lower the deficit.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 19:\n", + "\n", + "Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \n", + "\n", + "Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. \n", + "\n", + "Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. \n", + "\n", + "They keep moving. \n", + "\n", + "And the costs and the threats to America and the world keep rising.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 20:\n", + "\n", + "It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n", + "\n", + "ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n", + "\n", + "A unity agenda for the nation. \n", + "\n", + "We can do this. \n", + "\n", + "My fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. 
\n", + "\n", + "In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things.\n" + ] + } + ], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.vectorstores.faiss import FAISS\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "texts = text_splitter.split_documents(documents)\n", + "retriever = FAISS.from_documents(\n", + " texts, HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + ").as_retriever(search_kwargs={\"k\": 20})\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = retriever.invoke(query)\n", + "pretty_print_docs(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reranking with InfinityRerank\n", + "Now let's wrap our base retriever with a `ContextualCompressionRetriever`. We'll use the `InfinityRerank` to rerank the returned results." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n", + "\n", + "It’s time. \n", + "\n", + "But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. \n", + "\n", + "Inflation is robbing them of the gains they might otherwise feel. \n", + "\n", + "I get it. That’s why my top priority is getting prices under control.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. 
\n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.\n" + ] + } + ], + "source": [ + "from infinity_client import Client\n", + "from langchain.retrievers import ContextualCompressionRetriever\n", + "from langchain_community.document_compressors.infinity_rerank import InfinityRerank\n", + "\n", + "client = Client(base_url=\"http://localhost:7997\")\n", + "\n", + "compressor = InfinityRerank(client=client, model=\"mixedbread-ai/mxbai-rerank-xsmall-v1\")\n", + "compression_retriever = ContextualCompressionRetriever(\n", + " base_compressor=compressor, base_retriever=retriever\n", + ")\n", + "\n", + "compressed_docs = compression_retriever.invoke(\n", + " \"What did the president say about Ketanji Jackson Brown\"\n", + ")\n", + "pretty_print_docs(compressed_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/document_compressors/__init__.py b/libs/community/langchain_community/document_compressors/__init__.py index 1d7fbb62dcf66..b26bf579d487c 100644 --- a/libs/community/langchain_community/document_compressors/__init__.py +++ b/libs/community/langchain_community/document_compressors/__init__.py @@ -8,6 +8,9 @@ from langchain_community.document_compressors.flashrank_rerank import ( FlashrankRerank, ) + from langchain_community.document_compressors.infinity_rerank import ( + InfinityRerank, + ) from langchain_community.document_compressors.jina_rerank import ( JinaRerank, ) @@ -32,6 +35,7 @@ "FlashrankRerank": "langchain_community.document_compressors.flashrank_rerank", "DashScopeRerank": "langchain_community.document_compressors.dashscope_rerank", "VolcengineRerank": "langchain_community.document_compressors.volcengine_rerank", + "InfinityRerank": "langchain_community.document_compressors.infinity_rerank", } @@ -50,4 +54,5 @@ def __getattr__(name: str) -> Any: "RankLLMRerank", "DashScopeRerank", "VolcengineRerank", + "InfinityRerank", ] diff --git a/libs/community/langchain_community/document_compressors/infinity_rerank.py b/libs/community/langchain_community/document_compressors/infinity_rerank.py new file mode 100644 index 0000000000000..91d07889fce4e --- /dev/null +++ b/libs/community/langchain_community/document_compressors/infinity_rerank.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union + +from langchain.retrievers.document_compressors.base import BaseDocumentCompressor +from langchain_core.callbacks.manager import Callbacks +from langchain_core.documents import Document +from pydantic import ConfigDict, model_validator + +if TYPE_CHECKING: + from infinity_client.api.default import rerank + from infinity_client.client import Client + from infinity_client.models import RerankInput +else: + # Avoid pydantic annotation issues when actually instantiating + # while keeping this import optional + try: + from infinity_client.api.default import rerank + from infinity_client.client 
import Client
+        from infinity_client.models import RerankInput
+    except ImportError:
+        pass
+
+DEFAULT_MODEL_NAME = "BAAI/bge-reranker-base"
+DEFAULT_BASE_URL = "http://localhost:7997"
+
+
+class InfinityRerank(BaseDocumentCompressor):
+    """Document compressor that uses the `Infinity Rerank API`."""
+
+    client: Optional[Client] = None
+    """Infinity client to use for compressing documents."""
+
+    model: Optional[str] = None
+    """Model to use for reranking."""
+
+    top_n: Optional[int] = 3
+    """Number of documents to return."""
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+        arbitrary_types_allowed=True,
+        extra="forbid",
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_environment(cls, values: Dict) -> Any:
+        """Validate that the python package exists in the environment."""
+        if "client" in values:
+            return values
+        else:
+            try:
+                from infinity_client.client import Client
+            except ImportError:
+                raise ImportError(
+                    "Could not import infinity_client python package. "
+                    "Please install it with `pip install infinity_client`."
+                )
+
+            values["model"] = values.get("model", DEFAULT_MODEL_NAME)
+            values["client"] = Client(base_url=DEFAULT_BASE_URL)
+            return values
+
+    def rerank(
+        self,
+        documents: Sequence[Union[str, Document, dict]],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        top_n: Optional[int] = -1,
+    ) -> List[Dict[str, Any]]:
+        """Return a list of documents ordered by their relevance to the provided query.
+
+        Args:
+            query: The query to use for reranking.
+            documents: A sequence of documents to rerank.
+            model: The model to use for re-ranking. Defaults to self.model.
+            top_n: The number of results to return. If None, returns all results.
+                Defaults to self.top_n.
+        """  # noqa: E501
+        if len(documents) == 0:  # avoid an empty API call
+            return []
+        docs = [
+            doc.page_content if isinstance(doc, Document) else doc for doc in documents
+        ]
+        model = model or self.model
+
+        input = RerankInput(
+            query=query,
+            documents=docs,
+            model=model,
+        )
+        results = rerank.sync(client=self.client, body=input)
+
+        if hasattr(results, "results"):
+            results = getattr(results, "results")
+
+        result_dicts = []
+        for res in results:
+            result_dicts.append(
+                {"index": res.index, "relevance_score": res.relevance_score}
+            )
+
+        result_dicts.sort(key=lambda x: x["relevance_score"], reverse=True)
+        top_n = top_n if (top_n is None or top_n > 0) else self.top_n
+
+        return result_dicts[:top_n]
+
+    def compress_documents(
+        self,
+        documents: Sequence[Document],
+        query: str,
+        callbacks: Optional[Callbacks] = None,
+    ) -> Sequence[Document]:
+        """
+        Compress documents using Infinity's rerank API.
+
+        Args:
+            documents: A sequence of documents to compress.
+            query: The query to use for compressing the documents.
+            callbacks: Callbacks to run during the compression process.
+
+        Returns:
+            A sequence of compressed documents.
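+
+        Example (an illustrative sketch; assumes an Infinity server running
+        locally on the default port and the ``infinity_client`` package
+        installed):
+
+            .. code-block:: python
+
+                from infinity_client import Client
+                from langchain_core.documents import Document
+
+                from langchain_community.document_compressors.infinity_rerank import (
+                    InfinityRerank,
+                )
+
+                compressor = InfinityRerank(
+                    client=Client(base_url="http://localhost:7997"),
+                    model="mixedbread-ai/mxbai-rerank-xsmall-v1",
+                )
+                reranked = compressor.compress_documents(
+                    documents=[
+                        Document(page_content="Paris is in France!"),
+                        Document(page_content="Berlin is in Germany!"),
+                    ],
+                    query="Where is Paris?",
+                )
+                # Each returned Document keeps its metadata plus a
+                # "relevance_score" entry added by the compressor.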
+ """ + compressed = [] + for res in self.rerank(documents, query): + doc = documents[res["index"]] + doc_copy = Document(doc.page_content, metadata=deepcopy(doc.metadata)) + doc_copy.metadata["relevance_score"] = res["relevance_score"] + compressed.append(doc_copy) + return compressed diff --git a/libs/community/tests/integration_tests/document_compressors/test_infinity_rerank.py b/libs/community/tests/integration_tests/document_compressors/test_infinity_rerank.py new file mode 100644 index 0000000000000..4e9144f2350eb --- /dev/null +++ b/libs/community/tests/integration_tests/document_compressors/test_infinity_rerank.py @@ -0,0 +1,32 @@ +from langchain_core.documents import Document + +from langchain_community.document_compressors.infinity_rerank import ( + InfinityRerank, +) + + +def test_rerank() -> None: + reranker = InfinityRerank() + docs = [ + Document( + page_content=( + "This is a document not related to the python package infinity_emb, " + "hence..." + ) + ), + Document(page_content="Paris is in France!"), + Document( + page_content=( + "infinity_emb is a package for sentence embeddings and rerankings using" + " transformer models in Python!" + ) + ), + Document(page_content="random text for nothing"), + ] + compressed = reranker.compress_documents( + query="What is the python package infinity_emb?", + documents=docs, + ) + + assert len(compressed) == 3, "default top_n is 3" + assert compressed[0].page_content == docs[2].page_content, "rerank works" diff --git a/libs/community/tests/unit_tests/document_compressors/test_imports.py b/libs/community/tests/unit_tests/document_compressors/test_imports.py index 37ed3d69c6078..809e20db1e03f 100644 --- a/libs/community/tests/unit_tests/document_compressors/test_imports.py +++ b/libs/community/tests/unit_tests/document_compressors/test_imports.py @@ -8,6 +8,7 @@ "FlashrankRerank", "DashScopeRerank", "VolcengineRerank", + "InfinityRerank", ] From c2072d909a8c186cbac7972c4c39e1bf476f7811 Mon Sep 17 00:00:00 2001 From: ZhangShenao <15201440436@163.com> Date: Thu, 7 Nov 2024 10:42:41 +0800 Subject: [PATCH 07/11] Improvement[Partner] Improve qdrant vector store (#27251) - Add static method decorator - Add args for api doc - Fix word spelling Co-authored-by: Erick Friis --- libs/partners/qdrant/langchain_qdrant/qdrant.py | 2 +- libs/partners/qdrant/langchain_qdrant/vectorstores.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/partners/qdrant/langchain_qdrant/qdrant.py b/libs/partners/qdrant/langchain_qdrant/qdrant.py index 165ca66cf8875..b21dae2efe298 100644 --- a/libs/partners/qdrant/langchain_qdrant/qdrant.py +++ b/libs/partners/qdrant/langchain_qdrant/qdrant.py @@ -960,8 +960,8 @@ def _generate_batches( yield batch_ids, points + @staticmethod def _build_payloads( - self, texts: Iterable[str], metadatas: Optional[List[dict]], content_payload_key: str, diff --git a/libs/partners/qdrant/langchain_qdrant/vectorstores.py b/libs/partners/qdrant/langchain_qdrant/vectorstores.py index 9b8af6f7273e6..1b4941604f627 100644 --- a/libs/partners/qdrant/langchain_qdrant/vectorstores.py +++ b/libs/partners/qdrant/langchain_qdrant/vectorstores.py @@ -57,7 +57,7 @@ async def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: except NotImplementedError: # If the async method is not implemented, call the synchronous method # by removing the first letter from the method name. 
For example, - # if the async method is called ``aaad_texts``, the synchronous method + # if the async method is called ``aadd_texts``, the synchronous method # will be called ``aad_texts``. return await run_in_executor( None, getattr(self, method.__name__[1:]), *args, **kwargs @@ -921,7 +921,7 @@ async def amax_marginal_relevance_search_by_vector( Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. Args: - query: Text to look up documents similar to. + embedding: Embedding vector to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. @@ -984,7 +984,7 @@ def max_marginal_relevance_search_with_score_by_vector( Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. Args: - query: Text to look up documents similar to. + embedding: Embedding vector to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. @@ -1072,7 +1072,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. Args: - query: Text to look up documents similar to. + embedding: Embedding vector to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. Defaults to 20. From cfff2a057e9e91002099177f9da6d4a616db0796 Mon Sep 17 00:00:00 2001 From: Siddharth Murching Date: Wed, 6 Nov 2024 18:47:41 -0800 Subject: [PATCH 08/11] community: Update UC toolkit documentation to use LangGraph APIs (#26778) - **Description:** Update UC toolkit documentation to show an example of using recommended LangGraph agent APIs before the existing LangChain AgentExecutor example. Tested by manually running the updated example notebook - **Dependencies:** No new dependencies --------- Signed-off-by: Sid Murching Co-authored-by: Erick Friis --- docs/docs/integrations/tools/databricks.ipynb | 112 ++++++++++++++---- 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/docs/docs/integrations/tools/databricks.ipynb b/docs/docs/integrations/tools/databricks.ipynb index bb44a716f587e..fafb14cd97357 100644 --- a/docs/docs/integrations/tools/databricks.ipynb +++ b/docs/docs/integrations/tools/databricks.ipynb @@ -6,7 +6,7 @@ "source": [ "# Databricks Unity Catalog (UC)\n", "\n", - "This notebook shows how to use UC functions as LangChain tools.\n", + "This notebook shows how to use UC functions as LangChain tools, with both LangChain and LangGraph agent APIs.\n", "\n", "See Databricks documentation ([AWS](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-sql-function.html)|[Azure](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/sql-ref-syntax-ddl-create-sql-function)|[GCP](https://docs.gcp.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-sql-function.html)) to learn how to create SQL or Python functions in UC. 
Do not skip function and parameter comments, which are critical for LLMs to call functions properly.\n", "\n", @@ -34,11 +34,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "%pip install --upgrade --quiet databricks-sdk langchain-community mlflow" + "%pip install --upgrade --quiet databricks-sdk langchain-community langchain-databricks langgraph mlflow" ] }, { @@ -47,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.chat_models.databricks import ChatDatabricks\n", + "from langchain_databricks import ChatDatabricks\n", "\n", "llm = ChatDatabricks(endpoint=\"databricks-meta-llama-3-70b-instruct\")" ] @@ -58,6 +66,7 @@ "metadata": {}, "outputs": [], "source": [ + "from databricks.sdk import WorkspaceClient\n", "from langchain_community.tools.databricks import UCFunctionToolkit\n", "\n", "tools = (\n", @@ -76,9 +85,16 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ - "(Optional) To increase the retry time for getting a function execution response, set environment variable UC_TOOL_CLIENT_EXECUTION_TIMEOUT. Default retry time value is 120s." + "(Optional) To increase the retry time for getting a function execution response, set environment variable UC_TOOL_CLIENT_EXECUTION_TIMEOUT. Default retry time value is 120s.", + + "## LangGraph agent example" ] }, { @@ -92,9 +108,68 @@ "os.environ[\"UC_TOOL_CLIENT_EXECUTION_TIMEOUT\"] = \"200\"" ] }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## LangGraph agent example" + ] + }, { "cell_type": "code", "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'messages': [HumanMessage(content='36939 * 8922.4', additional_kwargs={}, response_metadata={}, id='1a10b10b-8e37-48c7-97a1-cac5006228d5'),\n", + " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_a8f3986f-4b91-40a3-8d6d-39f431dab69b', 'type': 'function', 'function': {'name': 'main__tools__python_exec', 'arguments': '{\"code\": \"print(36939 * 8922.4)\"}'}}]}, response_metadata={'prompt_tokens': 771, 'completion_tokens': 29, 'total_tokens': 800}, id='run-865c3613-20ba-4e80-afc8-fde1cfb26e5a-0', tool_calls=[{'name': 'main__tools__python_exec', 'args': {'code': 'print(36939 * 8922.4)'}, 'id': 'call_a8f3986f-4b91-40a3-8d6d-39f431dab69b', 'type': 'tool_call'}]),\n", + " ToolMessage(content='{\"format\": \"SCALAR\", \"value\": \"329584533.59999996\\\\n\", \"truncated\": false}', name='main__tools__python_exec', id='8b63d4c8-1a3d-46a5-a719-393b2ef36770', tool_call_id='call_a8f3986f-4b91-40a3-8d6d-39f431dab69b'),\n", + " AIMessage(content='The result of the multiplication is:\\n\\n329584533.59999996', additional_kwargs={}, response_metadata={'prompt_tokens': 846, 'completion_tokens': 22, 'total_tokens': 868}, id='run-22772404-611b-46e4-9956-b85e4a385f0f-0')]}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langgraph.prebuilt import create_react_agent\n", + "\n", + "agent = create_react_agent(\n", + " llm,\n", + " tools,\n", + " state_modifier=\"You are a 
helpful assistant. Make sure to use tool for information.\",\n", + ")\n", + "agent.invoke({\"messages\": [{\"role\": \"user\", \"content\": \"36939 * 8922.4\"}]})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## LangChain agent example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -118,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -132,7 +207,9 @@ "Invoking: `main__tools__python_exec` with `{'code': 'print(36939 * 8922.4)'}`\n", "\n", "\n", - "\u001b[0m\u001b[36;1m\u001b[1;3m{\"format\": \"SCALAR\", \"value\": \"329584533.59999996\\n\", \"truncated\": false}\u001b[0m\u001b[32;1m\u001b[1;3mThe result of the multiplication 36939 * 8922.4 is 329,584,533.60.\u001b[0m\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m{\"format\": \"SCALAR\", \"value\": \"329584533.59999996\\n\", \"truncated\": false}\u001b[0m\u001b[32;1m\u001b[1;3mThe result of the multiplication is:\n", + "\n", + "329584533.59999996\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -141,10 +218,10 @@ "data": { "text/plain": [ "{'input': '36939 * 8922.4',\n", - " 'output': 'The result of the multiplication 36939 * 8922.4 is 329,584,533.60.'}" + " 'output': 'The result of the multiplication is:\\n\\n329584533.59999996'}" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -153,18 +230,11 @@ "agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)\n", "agent_executor.invoke({\"input\": \"36939 * 8922.4\"})" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "llm", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -178,9 +248,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 53b0a99f3769d83e8aaf2b0382937bc91eae79b7 Mon Sep 17 00:00:00 2001 From: Dmitriy Prokopchuk <87666671+prokopchukdim@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:07:59 -0500 Subject: [PATCH 09/11] community: Memcached LLM Cache Integration (#27323) ## Description This PR adds support for Memcached as a usable LLM model cache by adding the ```MemcachedCache``` implementation relying on the [pymemcache](https://github.com/pinterest/pymemcache) client. Unit test-wise, the new integration is generally covered under existing import testing. All new functionality depends on pymemcache if instantiated and used, so to comply with the other cache implementations the PR also adds optional integration tests for ```MemcachedCache```. Since this is a new integration, documentation is added for Memcached as an integration and as an LLM Cache. ## Issue This PR closes #27275 which was originally raised as a discussion in #27035 ## Dependencies There are no new required dependencies for langchain, but [pymemcache](https://github.com/pinterest/pymemcache) is required to instantiate the new ```MemcachedCache```. 
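To run the example below against a locally running Memcached instance, the client library installs straight from PyPI (the same command the new docs page uses):

```bash
pip install pymemcache
```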
## Example Usage ```python3 from langchain.globals import set_llm_cache from langchain_openai import OpenAI from langchain_community.cache import MemcachedCache from pymemcache.client.base import Client llm = OpenAI(model="gpt-3.5-turbo-instruct", n=2, best_of=2) set_llm_cache(MemcachedCache(Client('localhost'))) # The first time, it is not yet in cache, so it should take longer llm.invoke("Which city is the most crowded city in the USA?") # The second time it is, so it goes faster llm.invoke("Which city is the most crowded city in the USA?") ``` --------- Co-authored-by: Erick Friis --- docs/docs/integrations/llm_caching.ipynb | 96 +++++++++++++++++++ .../docs/integrations/providers/memcached.mdx | 34 +++++++ libs/community/langchain_community/cache.py | 94 ++++++++++++++++++ .../cache/test_memcached_cache.py | 61 ++++++++++++ 4 files changed, 285 insertions(+) create mode 100644 docs/docs/integrations/providers/memcached.mdx create mode 100644 libs/community/tests/integration_tests/cache/test_memcached_cache.py diff --git a/docs/docs/integrations/llm_caching.ipynb b/docs/docs/integrations/llm_caching.ipynb index ee5152e023ff2..4ba1901613ac2 100644 --- a/docs/docs/integrations/llm_caching.ipynb +++ b/docs/docs/integrations/llm_caching.ipynb @@ -2368,6 +2368,102 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "7e6b9b1a", + "metadata": {}, + "source": [ + "## `Memcached` Cache\n", + "You can use [Memcached](https://www.memcached.org/) as a cache to cache prompts and responses through [pymemcache](https://github.com/pinterest/pymemcache).\n", + "\n", + "This cache requires the pymemcache dependency to be installed:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2e5e0b1", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU pymemcache" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c7ffe37", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.cache import MemcachedCache\n", + "from pymemcache.client.base import Client\n", + "\n", + "set_llm_cache(MemcachedCache(Client(\"localhost\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a4cfc48a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 32.8 ms, sys: 21 ms, total: 53.8 ms\n", + "Wall time: 343 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "# The first time, it is not yet in cache, so it should take longer\n", + "llm.invoke(\"Tell me a joke\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cb3b2bf5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.31 ms, sys: 850 µs, total: 3.16 ms\n", + "Wall time: 6.43 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "# The second time it is, so it goes faster\n", + "llm.invoke(\"Tell me a joke\")" + ] + }, { "cell_type": "markdown", "id": "7019c991-0101-4f9c-b212-5729a5471293", diff --git a/docs/docs/integrations/providers/memcached.mdx b/docs/docs/integrations/providers/memcached.mdx new file mode 100644 index 
0000000000000..f7719deda4031 --- /dev/null +++ b/docs/docs/integrations/providers/memcached.mdx @@ -0,0 +1,34 @@ +# Memcached + +> [Memcached](https://www.memcached.org/) is a free & open source, high-performance, distributed memory object caching system, +> generic in nature, but intended for use in speeding up dynamic web applications by alleviating database load. + +This page covers how to use Memcached with langchain, using [pymemcache](https://github.com/pinterest/pymemcache) as +a client to connect to an already running Memcached instance. + +## Installation and Setup +```bash +pip install pymemcache +``` + +## LLM Cache + +To integrate a Memcached Cache into your application: +```python3 +from langchain.globals import set_llm_cache +from langchain_openai import OpenAI + +from langchain_community.cache import MemcachedCache +from pymemcache.client.base import Client + +llm = OpenAI(model="gpt-3.5-turbo-instruct", n=2, best_of=2) +set_llm_cache(MemcachedCache(Client('localhost'))) + +# The first time, it is not yet in cache, so it should take longer +llm.invoke("Which city is the most crowded city in the USA?") + +# The second time it is, so it goes faster +llm.invoke("Which city is the most crowded city in the USA?") +``` + +Learn more in the [example notebook](/docs/integrations/llm_caching#memcached-cache) \ No newline at end of file diff --git a/libs/community/langchain_community/cache.py b/libs/community/langchain_community/cache.py index c074747370c96..697c26ed8725f 100644 --- a/libs/community/langchain_community/cache.py +++ b/libs/community/langchain_community/cache.py @@ -91,6 +91,7 @@ if TYPE_CHECKING: import momento + import pymemcache from astrapy.db import AstraDB, AsyncAstraDB from cassandra.cluster import Session as CassandraSession @@ -2599,3 +2600,96 @@ def clear(self, **kwargs: Any) -> None: if index_name in self._cache_dict: self._cache_dict[index_name].drop() del self._cache_dict[index_name] + + +class MemcachedCache(BaseCache): + """Cache that uses Memcached backend through pymemcache client lib""" + + def __init__(self, client_: Any): + """ + Initialize an instance of MemcachedCache. + + Args: + client_ (str): An instance of any of pymemcache's Clients + (Client, PooledClient, HashClient) + Example: + .. code-block:: python + ifrom langchain.globals import set_llm_cache + from langchain_openai import OpenAI + + from langchain_community.cache import MemcachedCache + from pymemcache.client.base import Client + + llm = OpenAI(model="gpt-3.5-turbo-instruct", n=2, best_of=2) + set_llm_cache(MemcachedCache(Client('localhost'))) + + # The first time, it is not yet in cache, so it should take longer + llm.invoke("Which city is the most crowded city in the USA?") + + # The second time it is, so it goes faster + llm.invoke("Which city is the most crowded city in the USA?") + """ + + try: + from pymemcache.client import ( + Client, + HashClient, + PooledClient, + RetryingClient, + ) + except (ImportError, ModuleNotFoundError): + raise ImportError( + "Could not import pymemcache python package. " + "Please install it with `pip install -U pymemcache`." 
+ ) + + if not ( + isinstance(client_, Client) + or isinstance(client_, PooledClient) + or isinstance(client_, HashClient) + or isinstance(client_, RetryingClient) + ): + raise ValueError("Please pass a valid pymemcached client") + + self.client = client_ + + def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]: + """Look up based on prompt and llm_string.""" + key = _hash(prompt + llm_string) + try: + result = self.client.get(key) + except pymemcache.MemcacheError: + return None + + return _loads_generations(result) if result is not None else None + + def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None: + """Update cache based on prompt and llm_string.""" + key = _hash(prompt + llm_string) + + # Validate input is made of standard LLM generations + for gen in return_val: + if not isinstance(gen, Generation): + raise ValueError( + "Memcached only supports caching of normal LLM generations, " + + f"got {type(gen)}" + ) + + # Deserialize return_val into string and update cache + value = _dumps_generations(return_val) + self.client.set(key, value) + + def clear(self, **kwargs: Any) -> None: + """ + Clear the entire cache. Takes optional kwargs: + + delay: optional int, the number of seconds to wait before flushing, + or zero to flush immediately (the default). NON-BLOCKING, returns + immediately. + noreply: optional bool, True to not wait for the reply (defaults to + client.default_noreply). + """ + delay = kwargs.get("delay", 0) + noreply = kwargs.get("noreply", None) + + self.client.flush_all(delay, noreply) diff --git a/libs/community/tests/integration_tests/cache/test_memcached_cache.py b/libs/community/tests/integration_tests/cache/test_memcached_cache.py new file mode 100644 index 0000000000000..2aca3df056652 --- /dev/null +++ b/libs/community/tests/integration_tests/cache/test_memcached_cache.py @@ -0,0 +1,61 @@ +""" +Test Memcached llm cache functionality. 
Requires running instance of Memcached on +localhost default port (11211) and pymemcache +""" + +import pytest +from langchain.globals import get_llm_cache, set_llm_cache +from langchain_core.outputs import Generation, LLMResult + +from langchain_community.cache import MemcachedCache +from tests.unit_tests.llms.fake_llm import FakeLLM + +DEFAULT_MEMCACHED_URL = "localhost" + + +@pytest.mark.requires("pymemcache") +def test_memcached_cache() -> None: + """Test general Memcached caching""" + from pymemcache import Client + + set_llm_cache(MemcachedCache(Client(DEFAULT_MEMCACHED_URL))) + llm = FakeLLM() + + params = llm.dict() + params["stop"] = None + llm_string = str(sorted([(k, v) for k, v in params.items()])) + get_llm_cache().update("foo", llm_string, [Generation(text="fizz")]) + output = llm.generate(["foo"]) + expected_output = LLMResult( + generations=[[Generation(text="fizz")]], + llm_output={}, + ) + assert output == expected_output + # clear the cache + get_llm_cache().clear() + + +@pytest.mark.requires("pymemcache") +def test_memcached_cache_flush() -> None: + """Test flushing Memcached cache""" + from pymemcache import Client + + set_llm_cache(MemcachedCache(Client(DEFAULT_MEMCACHED_URL))) + llm = FakeLLM() + + params = llm.dict() + params["stop"] = None + llm_string = str(sorted([(k, v) for k, v in params.items()])) + get_llm_cache().update("foo", llm_string, [Generation(text="fizz")]) + output = llm.generate(["foo"]) + expected_output = LLMResult( + generations=[[Generation(text="fizz")]], + llm_output={}, + ) + assert output == expected_output + # clear the cache + get_llm_cache().clear(delay=0, noreply=False) + + # After cache has been cleared, the result shouldn't be the same + output = llm.generate(["foo"]) + assert output != expected_output From 7a9149f5ddff1093a3c48dc9b8eae07cd98583ca Mon Sep 17 00:00:00 2001 From: Martin Triska Date: Thu, 7 Nov 2024 04:14:57 +0100 Subject: [PATCH 10/11] community: ZeroxPDFLoader (#27800) # OCR-based PDF loader This implements [Zerox](https://github.com/getomni-ai/zerox) PDF document loader. Zerox utilizes simple but very powerful (even though slower and more costly) approach to parsing PDF documents: it converts PDF to series of images and passes it to a vision model requesting the contents in markdown. It is especially suitable for complex PDFs that are not parsed well by other alternatives. ## Example use: ```python from langchain_community.document_loaders.pdf import ZeroxPDFLoader os.environ["OPENAI_API_KEY"] = "" ## your-api-key model = "gpt-4o-mini" ## openai model pdf_url = "https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf" loader = ZeroxPDFLoader(file_path=pdf_url, model=model) docs = loader.load() ``` The Zerox library supports wide range of provides/models. See Zerox documentation for details. - **Dependencies:** `zerox` - **Twitter handle:** @martintriska1 If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
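One usage note beyond the snippet above: since pages are parsed individually, the loader can also stream results. A minimal sketch, assuming the same `loader` and environment as the example above (`lazy_load` yields one `Document` per page, per the implementation in this PR):

```python
# Stream pages one at a time instead of materializing the whole list
for doc in loader.lazy_load():
    print(f"page {doc.metadata['page']} of {doc.metadata['num_pages']}")
    print(doc.page_content[:200])  # first 200 chars of the page's Markdown
```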
--------- Co-authored-by: Erick Friis --- .../document_loaders/zeroxpdfloader.ipynb | 277 ++++++++++++++++++ .../document_loaders/pdf.py | 77 +++++ 2 files changed, 354 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb diff --git a/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb b/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb new file mode 100644 index 0000000000000..ffaf82e68973f --- /dev/null +++ b/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ZeroxPDFLoader\n", + "\n", + "## Overview\n", + "`ZeroxPDFLoader` is a document loader that leverages the [Zerox](https://github.com/getomni-ai/zerox) library. Zerox converts PDF documents into images, processes them using a vision-capable language model, and generates a structured Markdown representation. This loader allows for asynchronous operations and provides page-level document extraction.\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | JS support|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [ZeroxPDFLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.ZeroxPDFLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ❌ | ❌ | ❌ | \n", + "\n", + "### Loader features\n", + "| Source | Document Lazy Loading | Native Async Support\n", + "| :---: | :---: | :---: | \n", + "| ZeroxPDFLoader | ✅ | ❌ | \n", + "\n", + "## Setup\n", + "\n", + "### Credentials\n", + "Appropriate credentials need to be set up in environment variables. The loader supports number of different models and model providers. See _Usage_ header below to see few examples or [Zerox documentation](https://github.com/getomni-ai/zerox) for a full list of supported models.\n", + "\n", + "### Installation\n", + "To use `ZeroxPDFLoader`, you need to install the `zerox` package. Also make sure to have `langchain-community` installed.\n", + "\n", + "```bash\n", + "pip install zerox langchain-community\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization\n", + "\n", + "`ZeroxPDFLoader` enables PDF text extraction using vision-capable language models by converting each page into an image and processing it asynchronously. To use this loader, you need to specify a model and configure any necessary environment variables for Zerox, such as API keys.\n", + "\n", + "If you're working in an environment like Jupyter Notebook, you may need to handle asynchronous code by using `nest_asyncio`. 
You can set this up as follows:\n", + "\n", + "```python\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# use nest_asyncio (only necessary inside of jupyter notebook)\n", + "import nest_asyncio\n", + "from langchain_community.document_loaders.pdf import ZeroxPDFLoader\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "# Specify the url or file path for the PDF you want to process\n", + "# In this case let's use pdf from web\n", + "file_path = \"https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf\"\n", + "\n", + "# Set up necessary env vars for a vision model\n", + "os.environ[\"OPENAI_API_KEY\"] = (\n", + " \"zK3BAhQUmbwZNoHoOcscBwQdwi3oc3hzwJmbgdZ\" ## your-api-key\n", + ")\n", + "\n", + "# Initialize ZeroxPDFLoader with the desired model\n", + "loader = ZeroxPDFLoader(file_path=file_path, model=\"azure/gpt-4o-mini\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(metadata={'source': 'https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf', 'page': 1, 'num_pages': 5}, page_content='# OpenAI\\n\\nOpenAI is an AI research laboratory.\\n\\n#ai-models #ai\\n\\n## Revenue\\n- **$1,000,000,000** \\n 2023\\n\\n## Valuation\\n- **$28,000,000,000** \\n 2023\\n\\n## Growth Rate (Y/Y)\\n- **400%** \\n 2023\\n\\n## Funding\\n- **$11,300,000,000** \\n 2023\\n\\n---\\n\\n## Details\\n- **Headquarters:** San Francisco, CA\\n- **CEO:** Sam Altman\\n\\n[Visit Website](#)\\n\\n---\\n\\n## Revenue\\n### ARR ($M) | Growth\\n--- | ---\\n$1000M | 456%\\n$750M | \\n$500M | \\n$250M | $36M\\n$0 | $200M\\n\\nis on track to hit $1B in annual recurring revenue by the end of 2023, up about 400% from an estimated $200M at the end of 2022.\\n\\nOpenAI overall lost about $540M last year while developing ChatGPT, and those losses are expected to increase dramatically in 2023 with the growth in popularity of their consumer tools, with CEO Sam Altman remarking that OpenAI is likely to be \"the most capital-intensive startup in Silicon Valley history.\"\\n\\nThe reason for that is operating ChatGPT is massively expensive. One analysis of ChatGPT put the running cost at about $700,000 per day taking into account the underlying costs of GPU hours and hardware. 
That amount—derived from the 175 billion parameter-large architecture of GPT-3—would be even higher with the 100 trillion parameters of GPT-4.\\n\\n---\\n\\n## Valuation\\nIn April 2023, OpenAI raised its latest round of $300M at a roughly $29B valuation from Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global.\\n\\nAssuming OpenAI was at roughly $300M in ARR at the time, that would have given them a 96x forward revenue multiple.\\n\\n---\\n\\n## Product\\n\\n### ChatGPT\\n| Examples | Capabilities | Limitations |\\n|---------------------------------|-------------------------------------|------------------------------------|\\n| \"Explain quantum computing in simple terms\" | \"Remember what users said earlier in the conversation\" | May occasionally generate incorrect information |\\n| \"What can you give me for my dad\\'s birthday?\" | \"Allows users to follow-up questions\" | Limited knowledge of world events after 2021 |\\n| \"How do I make an HTTP request in JavaScript?\" | \"Trained to provide harmless requests\" | |')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the document and look at the first page:\n", + "documents = loader.load()\n", + "documents[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# OpenAI\n", + "\n", + "OpenAI is an AI research laboratory.\n", + "\n", + "#ai-models #ai\n", + "\n", + "## Revenue\n", + "- **$1,000,000,000** \n", + " 2023\n", + "\n", + "## Valuation\n", + "- **$28,000,000,000** \n", + " 2023\n", + "\n", + "## Growth Rate (Y/Y)\n", + "- **400%** \n", + " 2023\n", + "\n", + "## Funding\n", + "- **$11,300,000,000** \n", + " 2023\n", + "\n", + "---\n", + "\n", + "## Details\n", + "- **Headquarters:** San Francisco, CA\n", + "- **CEO:** Sam Altman\n", + "\n", + "[Visit Website](#)\n", + "\n", + "---\n", + "\n", + "## Revenue\n", + "### ARR ($M) | Growth\n", + "--- | ---\n", + "$1000M | 456%\n", + "$750M | \n", + "$500M | \n", + "$250M | $36M\n", + "$0 | $200M\n", + "\n", + "is on track to hit $1B in annual recurring revenue by the end of 2023, up about 400% from an estimated $200M at the end of 2022.\n", + "\n", + "OpenAI overall lost about $540M last year while developing ChatGPT, and those losses are expected to increase dramatically in 2023 with the growth in popularity of their consumer tools, with CEO Sam Altman remarking that OpenAI is likely to be \"the most capital-intensive startup in Silicon Valley history.\"\n", + "\n", + "The reason for that is operating ChatGPT is massively expensive. One analysis of ChatGPT put the running cost at about $700,000 per day taking into account the underlying costs of GPU hours and hardware. 
That amount—derived from the 175 billion parameter-large architecture of GPT-3—would be even higher with the 100 trillion parameters of GPT-4.\n", + "\n", + "---\n", + "\n", + "## Valuation\n", + "In April 2023, OpenAI raised its latest round of $300M at a roughly $29B valuation from Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global.\n", + "\n", + "Assuming OpenAI was at roughly $300M in ARR at the time, that would have given them a 96x forward revenue multiple.\n", + "\n", + "---\n", + "\n", + "## Product\n", + "\n", + "### ChatGPT\n", + "| Examples | Capabilities | Limitations |\n", + "|---------------------------------|-------------------------------------|------------------------------------|\n", + "| \"Explain quantum computing in simple terms\" | \"Remember what users said earlier in the conversation\" | May occasionally generate incorrect information |\n", + "| \"What can you give me for my dad's birthday?\" | \"Allows users to follow-up questions\" | Limited knowledge of world events after 2021 |\n", + "| \"How do I make an HTTP request in JavaScript?\" | \"Trained to provide harmless requests\" | |\n" + ] + } + ], + "source": [ + "# Let's look at parsed first page\n", + "print(documents[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lazy Load\n", + "The loader always fetches results lazily. `.load()` method is equivalent to `.lazy_load()` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "### `ZeroxPDFLoader`\n", + "\n", + "This loader class initializes with a file path and model type, and supports custom configurations via `zerox_kwargs` for handling Zerox-specific parameters.\n", + "\n", + "**Arguments**:\n", + "- `file_path` (Union[str, Path]): Path to the PDF file.\n", + "- `model` (str): Vision-capable model to use for processing in format `/`.\n", + "Some examples of valid values are: \n", + " - `model = \"gpt-4o-mini\" ## openai model`\n", + " - `model = \"azure/gpt-4o-mini\"`\n", + " - `model = \"gemini/gpt-4o-mini\"`\n", + " - `model=\"claude-3-opus-20240229\"`\n", + " - `model = \"vertex_ai/gemini-1.5-flash-001\"`\n", + " - See more details in [Zerox documentation](https://github.com/getomni-ai/zerox)\n", + " - Defaults to `\"gpt-4o-mini\".`\n", + "- `**zerox_kwargs` (dict): Additional Zerox-specific parameters such as API key, endpoint, etc.\n", + " - See [Zerox documentation](https://github.com/getomni-ai/zerox)\n", + "\n", + "**Methods**:\n", + "- `lazy_load`: Generates an iterator of `Document` instances, each representing a page of the PDF, along with metadata including page number and source.\n", + "\n", + "See full API documentaton [here](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.ZeroxPDFLoader.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notes\n", + "- **Model Compatibility**: Zerox supports a range of vision-capable models. 
Refer to [Zerox's GitHub documentation](https://github.com/getomni-ai/zerox) for a list of supported models and configuration details.\n", + "- **Environment Variables**: Make sure to set required environment variables, such as `API_KEY` or endpoint details, as specified in the Zerox documentation.\n", + "- **Asynchronous Processing**: If you encounter errors related to event loops in Jupyter Notebooks, you may need to apply `nest_asyncio` as shown in the setup section.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Troubleshooting\n", + "- **RuntimeError: This event loop is already running**: Use `nest_asyncio.apply()` to prevent asynchronous loop conflicts in environments like Jupyter.\n", + "- **Configuration Errors**: Verify that the `zerox_kwargs` match the expected arguments for your chosen model and that all necessary environment variables are set.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Resources\n", + "- **Zerox Documentation**: [Zerox GitHub Repository](https://github.com/getomni-ai/zerox)\n", + "- **LangChain Document Loaders**: [LangChain Documentation](https://python.langchain.com/docs/integrations/document_loaders/)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sharepoint_chatbot", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 10261dd408b6b..8e7d0152d3dc0 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -945,5 +945,82 @@ def lazy_load( yield from self.parser.parse(blob) +class ZeroxPDFLoader(BasePDFLoader): + """ + Document loader utilizing Zerox library: + https://github.com/getomni-ai/zerox + + Zerox converts PDF document to serties of images (page-wise) and + uses vision-capable LLM model to generate Markdown representation. + + Zerox utilizes anyc operations. Therefore when using this loader + inside Jupyter Notebook (or any environment running async) + you will need to: + ```python + import nest_asyncio + nest_asyncio.apply() + ``` + """ + + def __init__( + self, + file_path: Union[str, Path], + model: str = "gpt-4o-mini", + **zerox_kwargs: Any, + ) -> None: + super().__init__(file_path=file_path) + """ + Initialize the parser with arguments to be passed to the zerox function. + Make sure to set necessary environmnet variables such as API key, endpoint, etc. + Check zerox documentation for list of necessary environment variables for + any given model. + + Args: + file_path: + Path or url of the pdf file + model: + Vision capable model to use. Defaults to "gpt-4o-mini". + Hosted models are passed in format "/" + Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001" + See more details in zerox documentation. + **zerox_kwargs: + Arguments specific to the zerox function. 
+                See the detailed list of arguments in the zerox repository:
+                https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25
+        """  # noqa: E501
+        self.zerox_kwargs = zerox_kwargs
+        self.model = model
+
+    def lazy_load(self) -> Iterator[Document]:
+        """
+        Load documents from a PDF using the zerox library:
+        https://github.com/getomni-ai/zerox
+
+        Returns:
+            Iterator[Document]: An iterator over parsed Document instances.
+        """
+        import asyncio
+
+        from pyzerox import zerox
+
+        # Directly call asyncio.run to execute zerox synchronously
+        zerox_output = asyncio.run(
+            zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
+        )
+
+        # Convert zerox output to Document instances and yield them
+        if len(zerox_output.pages) > 0:
+            num_pages = zerox_output.pages[-1].page
+            for page in zerox_output.pages:
+                yield Document(
+                    page_content=page.content,
+                    metadata={
+                        "source": self.source,
+                        "page": page.page,
+                        "num_pages": num_pages,
+                    },
+                )
+
+
 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader

From 2cb39270ecd920adb93451c6edecc6e5b2efab30 Mon Sep 17 00:00:00 2001
From: Aksel Joonas Reedi <125026660+akseljoonas@users.noreply.github.com>
Date: Thu, 7 Nov 2024 04:40:21 +0100
Subject: [PATCH 11/11] community: bytes as a source to
 `AzureAIDocumentIntelligenceLoader` (#26618)

- **Description:** This PR adds functionality to pass in in-memory bytes
as a source to `AzureAIDocumentIntelligenceLoader`.
- **Issue:** I needed the functionality, so I added it.
- **Dependencies:** NA
- **Twitter handle:** @akseljoonas if this is a big enough change :)

---------

Co-authored-by: Aksel Joonas Reedi <aksel@klippa.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
---
 .../document_loaders/doc_intelligence.py      | 21 +++++++++++++------
 .../parsers/doc_intelligence.py               | 18 ++++++++++++++++
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/doc_intelligence.py b/libs/community/langchain_community/document_loaders/doc_intelligence.py
index 68a3eb44bd380..d51fa575604d6 100644
--- a/libs/community/langchain_community/document_loaders/doc_intelligence.py
+++ b/libs/community/langchain_community/document_loaders/doc_intelligence.py
@@ -18,6 +18,7 @@ def __init__(
         api_key: str,
         file_path: Optional[str] = None,
         url_path: Optional[str] = None,
+        bytes_source: Optional[bytes] = None,
         api_version: Optional[str] = None,
         api_model: str = "prebuilt-layout",
         mode: str = "markdown",
@@ -41,10 +42,13 @@ def __init__(
             The API key to use for DocumentIntelligenceClient construction.
         file_path : Optional[str]
             The path to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
         url_path : Optional[str]
             The URL to the file that needs to be loaded.
-            Either file_path or url_path must be specified.
+            Either file_path, url_path or bytes_source must be specified.
+        bytes_source : Optional[bytes]
+            The bytes array of the file that needs to be loaded.
+            Either file_path, url_path or bytes_source must be specified.
         api_version: Optional[str]
             The API version for DocumentIntelligenceClient.
             Setting None to use the default value from
             `azure-ai-documentintelligence` package.
@@ -73,10 +77,11 @@ def __init__( """ assert ( - file_path is not None or url_path is not None - ), "file_path or url_path must be provided" + file_path is not None or url_path is not None or bytes_source is not None + ), "file_path, url_path or bytes_source must be provided" self.file_path = file_path self.url_path = url_path + self.bytes_source = bytes_source self.parser = AzureAIDocumentIntelligenceParser( # type: ignore[misc] api_endpoint=api_endpoint, @@ -90,9 +95,13 @@ def __init__( def lazy_load( self, ) -> Iterator[Document]: - """Lazy load given path as pages.""" + """Lazy load the document as pages.""" if self.file_path is not None: blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from self.parser.parse(blob) - else: + elif self.url_path is not None: yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type] + elif self.bytes_source is not None: + yield from self.parser.parse_bytes(self.bytes_source) + else: + raise ValueError("No data source provided.") diff --git a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py index 0be8e7583e7b1..2d77fcd1f87a7 100644 --- a/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py +++ b/libs/community/langchain_community/document_loaders/parsers/doc_intelligence.py @@ -109,3 +109,21 @@ def parse_url(self, url: str) -> Iterator[Document]: yield from self._generate_docs_page(result) else: raise ValueError(f"Invalid mode: {self.mode}") + + def parse_bytes(self, bytes_source: bytes) -> Iterator[Document]: + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest + + poller = self.client.begin_analyze_document( + self.api_model, + analyze_request=AnalyzeDocumentRequest(bytes_source=bytes_source), + # content_type="application/octet-stream", + output_content_format="markdown" if self.mode == "markdown" else "text", + ) + result = poller.result() + + if self.mode in ["single", "markdown"]: + yield from self._generate_docs_single(result) + elif self.mode in ["page"]: + yield from self._generate_docs_page(result) + else: + raise ValueError(f"Invalid mode: {self.mode}")
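For reference, a minimal usage sketch of the new in-memory path (the endpoint, key, and file name below are placeholders, not values taken from this PR):

```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Read a local PDF into memory and hand the raw bytes to the loader.
with open("example.pdf", "rb") as f:  # placeholder file
    pdf_bytes = f.read()

loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<your-endpoint>",  # placeholder
    api_key="<your-key>",  # placeholder
    bytes_source=pdf_bytes,
    api_model="prebuilt-layout",
)
docs = loader.load()
```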