From 32917a0b98cb8edcfb8d0e84f0878434e1c3f192 Mon Sep 17 00:00:00 2001 From: Darien Schettler <50381286+darien-schettler@users.noreply.github.com> Date: Sun, 22 Dec 2024 19:16:16 -0500 Subject: [PATCH 1/6] Update dataframe.py (#28871) community: optimize DataFrame document loader **Description:** Simplify the `lazy_load` method in the DataFrame document loader by combining text extraction and metadata cleanup into a single operation. This makes the code more concise while maintaining the same functionality. **Issue:** N/A **Dependencies:** None **Twitter handle:** N/A --- .../langchain_community/document_loaders/dataframe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/dataframe.py b/libs/community/langchain_community/document_loaders/dataframe.py index 1b508533f8d93..74ad56b53f783 100644 --- a/libs/community/langchain_community/document_loaders/dataframe.py +++ b/libs/community/langchain_community/document_loaders/dataframe.py @@ -21,9 +21,8 @@ def lazy_load(self) -> Iterator[Document]: """Lazy load records from dataframe.""" for _, row in self.data_frame.iterrows(): - text = row[self.page_content_column] metadata = row.to_dict() - metadata.pop(self.page_content_column) + text = metadata.pop(self.page_content_column) yield Document(page_content=text, metadata=metadata) From 41b6a86bbe030291cf8ee284ed0cd70dd493152b Mon Sep 17 00:00:00 2001 From: Mohammad Mohtashim <45242107+keenborder786@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:50:22 +0500 Subject: [PATCH 2/6] Community: LlamaCppEmbeddings `embed_documents` and `embed_query` (#28827) - **Description:** `embed_documents` and `embed_query` was throwing off the error as stated in the issue. The issue was that `Llama` client is returning the embeddings in a nested list which is not being accounted for in the current implementation and therefore the stated error is being raised. 
- **Issue:** #28813 --------- Co-authored-by: Chester Curme --- .../embeddings/llamacpp.py | 50 ++++++++++++------- .../unit_tests/embeddings/test_llamacpp.py | 40 +++++++++++++++ 2 files changed, 72 insertions(+), 18 deletions(-) create mode 100644 libs/community/tests/unit_tests/embeddings/test_llamacpp.py diff --git a/libs/community/langchain_community/embeddings/llamacpp.py b/libs/community/langchain_community/embeddings/llamacpp.py index 6487312fd31d0..4adfeb0e52774 100644 --- a/libs/community/langchain_community/embeddings/llamacpp.py +++ b/libs/community/langchain_community/embeddings/llamacpp.py @@ -20,7 +20,7 @@ class LlamaCppEmbeddings(BaseModel, Embeddings): """ client: Any = None #: :meta private: - model_path: str + model_path: str = Field(default="") n_ctx: int = Field(512, alias="n_ctx") """Token context window.""" @@ -88,21 +88,22 @@ def validate_environment(self) -> Self: if self.n_gpu_layers is not None: model_params["n_gpu_layers"] = self.n_gpu_layers - try: - from llama_cpp import Llama - - self.client = Llama(model_path, embedding=True, **model_params) - except ImportError: - raise ImportError( - "Could not import llama-cpp-python library. " - "Please install the llama-cpp-python library to " - "use this embedding model: pip install llama-cpp-python" - ) - except Exception as e: - raise ValueError( - f"Could not load Llama model from path: {model_path}. " - f"Received error {e}" - ) + if not self.client: + try: + from llama_cpp import Llama + + self.client = Llama(model_path, embedding=True, **model_params) + except ImportError: + raise ImportError( + "Could not import llama-cpp-python library. " + "Please install the llama-cpp-python library to " + "use this embedding model: pip install llama-cpp-python" + ) + except Exception as e: + raise ValueError( + f"Could not load Llama model from path: {model_path}. 
" + f"Received error {e}" + ) return self @@ -116,7 +117,17 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: List of embeddings, one for each text. """ embeddings = self.client.create_embedding(texts) - return [list(map(float, e["embedding"])) for e in embeddings["data"]] + final_embeddings = [] + for e in embeddings["data"]: + try: + if isinstance(e["embedding"][0], list): + for data in e["embedding"]: + final_embeddings.append(list(map(float, data))) + else: + final_embeddings.append(list(map(float, e["embedding"]))) + except (IndexError, TypeError): + final_embeddings.append(list(map(float, e["embedding"]))) + return final_embeddings def embed_query(self, text: str) -> List[float]: """Embed a query using the Llama model. @@ -128,4 +139,7 @@ def embed_query(self, text: str) -> List[float]: Embeddings for the text. """ embedding = self.client.embed(text) - return list(map(float, embedding)) + if not isinstance(embedding, list): + return list(map(float, embedding)) + else: + return list(map(float, embedding[0])) diff --git a/libs/community/tests/unit_tests/embeddings/test_llamacpp.py b/libs/community/tests/unit_tests/embeddings/test_llamacpp.py new file mode 100644 index 0000000000000..ca2bd758216cf --- /dev/null +++ b/libs/community/tests/unit_tests/embeddings/test_llamacpp.py @@ -0,0 +1,40 @@ +from typing import Generator +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_community.embeddings.llamacpp import LlamaCppEmbeddings + + +@pytest.fixture +def mock_llama_client() -> Generator[MagicMock, None, None]: + with patch( + "langchain_community.embeddings.llamacpp.LlamaCppEmbeddings" + ) as MockLlama: + mock_client = MagicMock() + MockLlama.return_value = mock_client + yield mock_client + + +def test_initialization(mock_llama_client: MagicMock) -> None: + embeddings = LlamaCppEmbeddings(client=mock_llama_client) # type: ignore[call-arg] + assert embeddings.client is not None + + +def 
test_embed_documents(mock_llama_client: MagicMock) -> None: + mock_llama_client.create_embedding.return_value = { + "data": [{"embedding": [[0.1, 0.2, 0.3]]}, {"embedding": [[0.4, 0.5, 0.6]]}] + } + embeddings = LlamaCppEmbeddings(client=mock_llama_client) # type: ignore[call-arg] + texts = ["Hello world", "Test document"] + result = embeddings.embed_documents(texts) + expected = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] + assert result == expected + + +def test_embed_query(mock_llama_client: MagicMock) -> None: + mock_llama_client.embed.return_value = [[0.1, 0.2, 0.3]] + embeddings = LlamaCppEmbeddings(client=mock_llama_client) # type: ignore[call-arg] + result = embeddings.embed_query("Sample query") + expected = [0.1, 0.2, 0.3] + assert result == expected From 4b4d09f82b5451df34568096c2d8906c8f5a0a6d Mon Sep 17 00:00:00 2001 From: ZhangShenao <15201440436@163.com> Date: Mon, 23 Dec 2024 22:51:44 +0800 Subject: [PATCH 3/6] [Doc] Improvement: Fix docs of `ChatMLX` (#28884) - `ChatMLX` doesn't support the system role. - Fix https://github.com/langchain-ai/langchain/issues/28532 #28532 --- docs/docs/integrations/chat/mlx.ipynb | 46 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/docs/integrations/chat/mlx.ipynb b/docs/docs/integrations/chat/mlx.ipynb index a5945dffae408..dc852da549d55 100644 --- a/docs/docs/integrations/chat/mlx.ipynb +++ b/docs/docs/integrations/chat/mlx.ipynb @@ -155,8 +155,48 @@ "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", "\n", "# setup ReAct style prompt\n", - "prompt = hub.pull(\"hwchase17/react-json\")\n", - "prompt = prompt.partial(\n", + "# Based on 'hwchase17/react' prompt modification, cause mlx does not support the `System` role\n", + "human_prompt = \"\"\"\n", + "Answer the following questions as best you can. 
You have access to the following tools:\n", + "\n", + "{tools}\n", + "\n", + "The way you use the tools is by specifying a json blob.\n", + "Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).\n", + "\n", + "The only values that should be in the \"action\" field are: {tool_names}\n", + "\n", + "The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:\n", + "\n", + "```\n", + "{{\n", + " \"action\": $TOOL_NAME,\n", + " \"action_input\": $INPUT\n", + "}}\n", + "```\n", + "\n", + "ALWAYS use the following format:\n", + "\n", + "Question: the input question you must answer\n", + "Thought: you should always think about what to do\n", + "Action:\n", + "```\n", + "$JSON_BLOB\n", + "```\n", + "Observation: the result of the action\n", + "... (this Thought/Action/Observation can repeat N times)\n", + "Thought: I now know the final answer\n", + "Final Answer: the final answer to the original input question\n", + "\n", + "Begin! Reminder to always use the exact characters `Final Answer` when responding.\n", + "\n", + "{input}\n", + "\n", + "{agent_scratchpad}\n", + "\n", + "\"\"\"\n", + "\n", + "prompt = human_prompt.partial(\n", " tools=render_text_description(tools),\n", " tool_names=\", \".join([t.name for t in tools]),\n", ")\n", @@ -207,7 +247,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.7" } }, "nbformat": 4, From 8d9907088b843756b5aa3f49f11f51b451567fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Panella?= Date: Mon, 23 Dec 2024 10:05:48 -0500 Subject: [PATCH 4/6] community(azuresearch): allow to use any valid credential (#28873) Add option to use any valid credential type. Differentiates async cases needed by Azure Search. 
This could replace the use of a static token --- .../vectorstores/azuresearch.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 6930c8319e4d9..d0aa15e2acbd1 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -42,6 +42,8 @@ logger = logging.getLogger() if TYPE_CHECKING: + from azure.core.credentials import TokenCredential + from azure.core.credentials_async import AsyncTokenCredential from azure.search.documents import SearchClient, SearchItemPaged from azure.search.documents.aio import ( AsyncSearchItemPaged, @@ -96,10 +98,13 @@ def _get_search_client( cors_options: Optional[CorsOptions] = None, async_: bool = False, additional_search_client_options: Optional[Dict[str, Any]] = None, + azure_credential: Optional[TokenCredential] = None, + azure_async_credential: Optional[AsyncTokenCredential] = None, ) -> Union[SearchClient, AsyncSearchClient]: from azure.core.credentials import AccessToken, AzureKeyCredential, TokenCredential from azure.core.exceptions import ResourceNotFoundError from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential + from azure.identity.aio import DefaultAzureCredential as AsyncDefaultAzureCredential from azure.search.documents import SearchClient from azure.search.documents.aio import SearchClient as AsyncSearchClient from azure.search.documents.indexes import SearchIndexClient @@ -143,12 +148,17 @@ def get_token( if key.upper() == "INTERACTIVE": credential = InteractiveBrowserCredential() credential.get_token("https://search.azure.com/.default") + async_credential = credential else: credential = AzureKeyCredential(key) + async_credential = credential elif azure_ad_access_token is not None: credential = AzureBearerTokenCredential(azure_ad_access_token) + 
async_credential = credential else: - credential = DefaultAzureCredential() + credential = azure_credential or DefaultAzureCredential() + async_credential = azure_async_credential or AsyncDefaultAzureCredential() + index_client: SearchIndexClient = SearchIndexClient( endpoint=endpoint, credential=credential, @@ -266,7 +276,7 @@ def fmt_err(x: str) -> str: return AsyncSearchClient( endpoint=endpoint, index_name=index_name, - credential=credential, + credential=async_credential, user_agent=user_agent, **additional_search_client_options, ) @@ -278,7 +288,7 @@ class AzureSearch(VectorStore): def __init__( self, azure_search_endpoint: str, - azure_search_key: str, + azure_search_key: Optional[str], index_name: str, embedding_function: Union[Callable, Embeddings], search_type: str = "hybrid", @@ -295,6 +305,8 @@ def __init__( vector_search_dimensions: Optional[int] = None, additional_search_client_options: Optional[Dict[str, Any]] = None, azure_ad_access_token: Optional[str] = None, + azure_credential: Optional[TokenCredential] = None, + azure_async_credential: Optional[AsyncTokenCredential] = None, **kwargs: Any, ): try: @@ -361,6 +373,7 @@ def __init__( user_agent=user_agent, cors_options=cors_options, additional_search_client_options=additional_search_client_options, + azure_credential=azure_credential, ) self.async_client = _get_search_client( azure_search_endpoint, @@ -377,6 +390,8 @@ def __init__( user_agent=user_agent, cors_options=cors_options, async_=True, + azure_credential=azure_credential, + azure_async_credential=azure_async_credential, ) self.search_type = search_type self.semantic_configuration_name = semantic_configuration_name From e5c9da3eb622a3fd32fe073fa5246d8293895215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Mon, 23 Dec 2024 23:31:23 +0800 Subject: [PATCH 5/6] core[patch]: remove redundant imports (#28861) `Graph` has been imported at Line: 62 --- libs/core/langchain_core/runnables/base.py | 2 -- 
libs/core/langchain_core/utils/pydantic.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/core/langchain_core/runnables/base.py b/libs/core/langchain_core/runnables/base.py index 3e43fb10a8e6b..893f393d8b174 100644 --- a/libs/core/langchain_core/runnables/base.py +++ b/libs/core/langchain_core/runnables/base.py @@ -534,8 +534,6 @@ def get_config_jsonschema( def get_graph(self, config: Optional[RunnableConfig] = None) -> Graph: """Return a graph representation of this Runnable.""" - from langchain_core.runnables.graph import Graph - graph = Graph() try: input_node = graph.add_node(self.get_input_schema(config)) diff --git a/libs/core/langchain_core/utils/pydantic.py b/libs/core/langchain_core/utils/pydantic.py index ae403574bb761..65f12232f9fc6 100644 --- a/libs/core/langchain_core/utils/pydantic.py +++ b/libs/core/langchain_core/utils/pydantic.py @@ -279,7 +279,7 @@ def _create_subset_model_v2( fn_description: Optional[str] = None, ) -> type[pydantic.BaseModel]: """Create a pydantic model with a subset of the model fields.""" - from pydantic import ConfigDict, create_model + from pydantic import create_model from pydantic.fields import FieldInfo descriptions_ = descriptions or {} From 6352edf77fe2ef5f412201562426fbbdf5c6bfeb Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Mon, 23 Dec 2024 16:55:44 +0100 Subject: [PATCH 6/6] docs: CrateDB: Register package `langchain-cratedb`, and add minimal "provider" documentation (#28877) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hi Erick. Coming back from a previous attempt, we now made a separate package for the CrateDB adapter, called `langchain-cratedb`, as advised. Other than registering the package within `libs/packages.yml`, this patch includes a minimal amount of documentation to accompany the advent of this new package. Let us know about any mistakes we made, or changes you would like to see. Thanks, Andreas. 
## About - **Description:** Register a new database adapter package, `langchain-cratedb`, providing traditional vector store, document loader, and chat message history features for a start. - **Addressed to:** @efriis, @eyurtsev - **References:** GH-27710 - **Preview:** [Providers » More » CrateDB](https://langchain-git-fork-crate-workbench-register-la-4bf945-langchain.vercel.app/docs/integrations/providers/cratedb/) ## Status - **PyPI:** https://pypi.org/project/langchain-cratedb/ - **GitHub:** https://github.com/crate/langchain-cratedb - **Documentation (CrateDB):** https://cratedb.com/docs/guide/integrate/langchain/ - **Documentation (LangChain):** _This PR._ ## Backlog? Is this applicable for this kind of patch? > - [ ] **Add tests and docs**: If you're adding a new integration, please include > 1. a test for the integration, preferably unit tests that do not rely on network access, > 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. ## Q&A 1. Notebooks that use the LangChain CrateDB adapter are currently at [CrateDB LangChain Examples](https://github.com/crate/cratedb-examples/tree/main/topic/machine-learning/llm-langchain), and the documentation refers to them. Because they are derived from very old blueprints coming from LangChain 0.0.x times, we guess they need a refresh before adding them to `docs/docs/integrations`. Is it applicable to merge this minimal package registration + documentation patch, which already includes valid code snippets in `cratedb.mdx`, and add corresponding notebooks on behalf of a subsequent patch later? 2. How would it work getting into the tabular list of _Integration Packages_ enumerated on the [documentation entrypoint page about Providers](https://python.langchain.com/docs/integrations/providers/)? /cc Please also review, @ckurze, @wierdvanderhaar, @kneth, @simonprickett, if you can find the time. Thanks! 
--- docs/docs/integrations/providers/cratedb.mdx | 132 +++++++++++++++++++ libs/packages.yml | 3 + 2 files changed, 135 insertions(+) create mode 100644 docs/docs/integrations/providers/cratedb.mdx diff --git a/docs/docs/integrations/providers/cratedb.mdx b/docs/docs/integrations/providers/cratedb.mdx new file mode 100644 index 0000000000000..24e47930407c0 --- /dev/null +++ b/docs/docs/integrations/providers/cratedb.mdx @@ -0,0 +1,132 @@ +# CrateDB + +> [CrateDB] is a distributed and scalable SQL database for storing and +> analyzing massive amounts of data in near real-time, even with complex +> queries. It is PostgreSQL-compatible, based on Lucene, and inheriting +> from Elasticsearch. + + +## Installation and Setup + +### Setup CrateDB +There are two ways to get started with CrateDB quickly. Alternatively, +choose other [CrateDB installation options]. + +#### Start CrateDB on your local machine +Example: Run a single-node CrateDB instance with security disabled, +using Docker or Podman. This is not recommended for production use. + +```bash +docker run --name=cratedb --rm \ + --publish=4200:4200 --publish=5432:5432 --env=CRATE_HEAP_SIZE=2g \ + crate:latest -Cdiscovery.type=single-node +``` + +#### Deploy cluster on CrateDB Cloud +[CrateDB Cloud] is a managed CrateDB service. Sign up for a +[free trial][CrateDB Cloud Console]. + +### Install Client +Install the most recent version of the `langchain-cratedb` package +and a few others that are needed for this tutorial. +```bash +pip install --upgrade langchain-cratedb langchain-openai unstructured +``` + + +## Documentation +For a more detailed walkthrough of the CrateDB wrapper, see +[using LangChain with CrateDB]. See also [all features of CrateDB] +to learn about other functionality provided by CrateDB. + + +## Features +The CrateDB adapter for LangChain provides APIs to use CrateDB as vector store, +document loader, and storage for chat messages. 
+ +### Vector Store +Use the CrateDB vector store functionality around `FLOAT_VECTOR` and `KNN_MATCH` +for similarity search and other purposes. See also [CrateDBVectorStore Tutorial]. + +Make sure you've configured a valid OpenAI API key. +```bash +export OPENAI_API_KEY=sk-XJZ... +``` +```python +from langchain_community.document_loaders import UnstructuredURLLoader +from langchain_cratedb import CrateDBVectorStore +from langchain_openai import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter + +loader = UnstructuredURLLoader(urls=["https://github.com/langchain-ai/langchain/raw/refs/tags/langchain-core==0.3.28/docs/docs/how_to/state_of_the_union.txt"]) +documents = loader.load() +text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) +docs = text_splitter.split_documents(documents) + +embeddings = OpenAIEmbeddings() + +# Connect to a self-managed CrateDB instance on localhost. +CONNECTION_STRING = "crate://?schema=testdrive" + +store = CrateDBVectorStore.from_documents( + documents=docs, + embedding=embeddings, + collection_name="state_of_the_union", + connection=CONNECTION_STRING, +) + +query = "What did the president say about Ketanji Brown Jackson" +docs_with_score = store.similarity_search_with_score(query) +``` + +### Document Loader +Load documents from a CrateDB database table, using the document loader +`CrateDBLoader`, which is based on SQLAlchemy. See also [CrateDBLoader Tutorial]. + +To use the document loader in your applications: +```python +import sqlalchemy as sa +from langchain_community.utilities import SQLDatabase +from langchain_cratedb import CrateDBLoader + +# Connect to a self-managed CrateDB instance on localhost. 
+CONNECTION_STRING = "crate://?schema=testdrive" + +db = SQLDatabase(engine=sa.create_engine(CONNECTION_STRING)) + +loader = CrateDBLoader( + 'SELECT * FROM sys.summits LIMIT 42', + db=db, +) +documents = loader.load() +``` + +### Chat Message History +Use CrateDB as the storage for your chat messages. +See also [CrateDBChatMessageHistory Tutorial]. + +To use the chat message history in your applications: +```python +from langchain_cratedb import CrateDBChatMessageHistory + +# Connect to a self-managed CrateDB instance on localhost. +CONNECTION_STRING = "crate://?schema=testdrive" + +message_history = CrateDBChatMessageHistory( + session_id="test-session", + connection=CONNECTION_STRING, +) + +message_history.add_user_message("hi!") +``` + + +[all features of CrateDB]: https://cratedb.com/docs/guide/feature/ +[CrateDB]: https://cratedb.com/database +[CrateDB Cloud]: https://cratedb.com/database/cloud +[CrateDB Cloud Console]: https://console.cratedb.cloud/?utm_source=langchain&utm_content=documentation +[CrateDB installation options]: https://cratedb.com/docs/guide/install/ +[CrateDBChatMessageHistory Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/conversational_memory.ipynb +[CrateDBLoader Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/document_loader.ipynb +[CrateDBVectorStore Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/vector_search.ipynb +[using LangChain with CrateDB]: https://cratedb.com/docs/guide/integrate/langchain/ diff --git a/libs/packages.yml b/libs/packages.yml index da26ed6f0cfb8..e9f64be5a5eaa 100644 --- a/libs/packages.yml +++ b/libs/packages.yml @@ -143,6 +143,9 @@ packages: - name: langchain-couchbase repo: langchain-ai/langchain path: libs/partners/couchbase + - name: langchain-cratedb + repo: crate/langchain-cratedb + path: . 
- name: langchain-ollama repo: langchain-ai/langchain path: libs/partners/ollama