From 7bdfc43766e72e4b67512bd85119b1c797035b86 Mon Sep 17 00:00:00 2001 From: Erick Friis Date: Wed, 6 Dec 2023 10:05:43 -0800 Subject: [PATCH] core[patch], langchain[patch]: ByteStore (#14312) --- docs/.local_build.sh | 2 +- docs/docs/integrations/providers/redis.mdx | 18 + .../integrations/stores/file_system.ipynb | 100 ++++ docs/docs/integrations/stores/in_memory.ipynb | 73 +++ docs/docs/integrations/stores/index.mdx | 29 + docs/docs/integrations/stores/redis.ipynb | 83 +++ .../integrations/stores/upstash_redis.ipynb | 90 +++ .../text_embedding/caching_embeddings.ipynb | 517 ++---------------- docs/sidebars.js | 1 + libs/core/langchain_core/stores.py | 3 + libs/langchain/langchain/embeddings/cache.py | 4 +- .../langchain/retrievers/multi_vector.py | 4 +- libs/langchain/langchain/storage/__init__.py | 6 +- libs/langchain/langchain/storage/_lc_store.py | 6 +- .../langchain/storage/file_system.py | 4 +- libs/langchain/langchain/storage/in_memory.py | 29 +- libs/langchain/langchain/storage/redis.py | 4 +- .../langchain/storage/upstash_redis.py | 4 +- .../tests/unit_tests/storage/test_imports.py | 2 + 19 files changed, 497 insertions(+), 482 deletions(-) create mode 100644 docs/docs/integrations/stores/file_system.ipynb create mode 100644 docs/docs/integrations/stores/in_memory.ipynb create mode 100644 docs/docs/integrations/stores/index.mdx create mode 100644 docs/docs/integrations/stores/redis.ipynb create mode 100644 docs/docs/integrations/stores/upstash_redis.ipynb diff --git a/docs/.local_build.sh b/docs/.local_build.sh index 21d3fb3d49bc6..9a3f9a79f82d6 100755 --- a/docs/.local_build.sh +++ b/docs/.local_build.sh @@ -9,7 +9,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")"; pwd)" cd "${SCRIPT_DIR}" mkdir -p ../_dist -rsync -ruv --exclude node_modules . ../_dist +rsync -ruv --exclude node_modules --exclude api_reference --exclude .venv --exclude .docusaurus . ../_dist cd ../_dist poetry run python scripts/model_feat_table.py cp ../cookbook/README.md src/pages/cookbook.mdx diff --git a/docs/docs/integrations/providers/redis.mdx b/docs/docs/integrations/providers/redis.mdx index b9e4c67532112..bc1277d6d22ca 100644 --- a/docs/docs/integrations/providers/redis.mdx +++ b/docs/docs/integrations/providers/redis.mdx @@ -17,6 +17,24 @@ Install the Python SDK: pip install redis ``` +To run Redis locally, you can use Docker: + +```bash +docker run --name langchain-redis -d -p 6379:6379 redis redis-server --save 60 1 --loglevel warning +``` + +To stop the container: + +```bash +docker stop langchain-redis +``` + +And to start it again: + +```bash +docker start langchain-redis +``` + ## Wrappers All wrappers need a redis url connection string to connect to the database support either a stand alone Redis server diff --git a/docs/docs/integrations/stores/file_system.ipynb b/docs/docs/integrations/stores/file_system.ipynb new file mode 100644 index 0000000000000..b16e5f4051526 --- /dev/null +++ b/docs/docs/integrations/stores/file_system.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: Local Filesystem\n", + "sidebar_position: 3\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LocalFileStore\n", + "\n", + "The `LocalFileStore` is a persistent implementation of `ByteStore` that stores everything in a folder of your choosing." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[b'v1', b'v2']\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "from langchain.storage import LocalFileStore\n", + "\n", + "root_path = Path.cwd() / \"data\" # can also be a path set by a string\n", + "store = LocalFileStore(root_path)\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see which files exist in our `data` folder:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "k1 k2\n" + ] + } + ], + "source": [ + "!ls {root_path}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/stores/in_memory.ipynb b/docs/docs/integrations/stores/in_memory.ipynb new file mode 100644 index 0000000000000..03e2f2c5b638e --- /dev/null +++ b/docs/docs/integrations/stores/in_memory.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: In Memory\n", + "sidebar_position: 2\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# InMemoryByteStore\n", + "\n", + "The `InMemoryByteStore` is a non-persistent implementation of `ByteStore` that stores everything in a Python dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[b'v1', b'v2']\n" + ] + } + ], + "source": [ + "from langchain.storage import InMemoryByteStore\n", + "\n", + "store = InMemoryByteStore()\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/stores/index.mdx b/docs/docs/integrations/stores/index.mdx new file mode 100644 index 0000000000000..5aa9abf1f3a6a --- /dev/null +++ b/docs/docs/integrations/stores/index.mdx @@ -0,0 +1,29 @@ +--- +sidebar_position: 1 +sidebar_class_name: hidden +--- + +# Stores + +In many different applications, having some sort of key-value storage is helpful. +In this section, we will look at a few different ways to store key-value pairs +using implementations of the `ByteStore` interface. + +## Features (natively supported) + +All `ByteStore`s support the following functions, which are used for modifying +**m**ultiple key-value pairs at once: + +- `mget(key: Sequence[str]) -> List[Optional[bytes]]`: get the contents of multiple keys, returning `None` if the key does not exist +- `mset(key_value_pairs: Sequence[Tuple[str, bytes]]) -> None`: set the contents of multiple keys +- `mdelete(key: Sequence[str]) -> None`: delete multiple keys +- `yield_keys(prefix: Optional[str] = None) -> Iterator[str]`: yield all keys in the store, optionally filtering by a prefix + +## How to pick one + +`ByteStore`s are designed to be interchangeable. By default, most dependent integrations +use the `InMemoryByteStore`, which is a simple in-memory key-value store. + +However, if you start having other requirements, like massive scalability or persistence, +you can swap out the `ByteStore` implementation with one of the other ones documented +in this section. diff --git a/docs/docs/integrations/stores/redis.ipynb b/docs/docs/integrations/stores/redis.ipynb new file mode 100644 index 0000000000000..251454b5e2bb3 --- /dev/null +++ b/docs/docs/integrations/stores/redis.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: Redis\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RedisStore\n", + "\n", + "The `RedisStore` is an implementation of `ByteStore` that stores everything in your Redis instance.\n", + "\n", + "To configure Redis, follow our [Redis guide](/docs/integrations/providers/redis)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install redis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[b'v1', b'v2']\n" + ] + } + ], + "source": [ + "from langchain.storage import RedisStore\n", + "\n", + "store = RedisStore(redis_url=\"redis://localhost:6379\")\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/stores/upstash_redis.ipynb b/docs/docs/integrations/stores/upstash_redis.ipynb new file mode 100644 index 0000000000000..b070728907f4b --- /dev/null +++ b/docs/docs/integrations/stores/upstash_redis.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: Upstash Redis\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UpstashRedisByteStore\n", + "\n", + "The `UpstashRedisStore` is an implementation of `ByteStore` that stores everything in your Upstash-hosted Redis instance.\n", + "\n", + "To use the base `RedisStore` instead, see [this guide](./redis)\n", + "\n", + "To configure Upstash Redis, follow our [Upstash guide](/docs/integrations/providers/upstash)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install upstash-redis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[b'v1', b'v2']\n" + ] + } + ], + "source": [ + "from langchain.storage import UpstashRedisByteStore\n", + "from upstash_redis import Redis\n", + "\n", + "URL = \"\"\n", + "TOKEN = \"\"\n", + "\n", + "redis_client = Redis(url=URL, token=TOKEN)\n", + "store = UpstashRedisByteStore(client=redis_client, ttl=None, namespace=\"test-ns\")\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/modules/data_connection/text_embedding/caching_embeddings.ipynb b/docs/docs/modules/data_connection/text_embedding/caching_embeddings.ipynb index 3b2327165013d..b3c4fd6340e3b 100644 --- a/docs/docs/modules/data_connection/text_embedding/caching_embeddings.ipynb +++ b/docs/docs/modules/data_connection/text_embedding/caching_embeddings.ipynb @@ -1,11 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "8baf0f21", + "metadata": {}, + "source": [ + "--\n", + "sidebar_label: Caching\n", + "--" + ] + }, { "cell_type": "markdown", "id": "bf4061ce", "metadata": {}, "source": [ - "# Caching\n", + "# CacheBackedEmbeddings\n", "\n", "Embeddings can be stored or temporarily cached to avoid needing to recompute them.\n", "\n", @@ -15,7 +25,7 @@ "The main supported way to initialized a `CacheBackedEmbeddings` is `from_bytes_store`. This takes in the following parameters:\n", "\n", "- underlying_embedder: The embedder to use for embedding.\n", - "- document_embedding_cache: The cache to use for storing document embeddings.\n", + "- document_embedding_cache: Any [`ByteStore`](/docs/integrations/stores/) for caching document embeddings.\n", "- namespace: (optional, defaults to `\"\"`) The namespace to use for document cache. This namespace is used to avoid collisions with other caches. For example, set it to the name of the embedding model used.\n", "\n", "**Attention**: Be sure to set the `namespace` parameter to avoid collisions of the same text embedded using different embeddings models." @@ -23,20 +33,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "a463c3c2-749b-40d1-a433-84f68a1cd1c7", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from langchain.embeddings import CacheBackedEmbeddings, OpenAIEmbeddings\n", - "from langchain.storage import (\n", - " InMemoryStore,\n", - " LocalFileStore,\n", - " RedisStore,\n", - " UpstashRedisStore,\n", - ")" + "from langchain.embeddings import CacheBackedEmbeddings" ] }, { @@ -44,7 +48,7 @@ "id": "9ddf07dd-3e72-41de-99d4-78e9521e272f", "metadata": {}, "source": [ - "## Using with a vector store\n", + "## Using with a Vector Store\n", "\n", "First, let's see an example that uses the local file system for storing embeddings and uses FAISS vector store for retrieval." ] @@ -52,36 +56,32 @@ { "cell_type": "code", "execution_count": null, - "id": "9e4314d8-88ef-4f52-81ae-0be771168bb6", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.text_splitter import CharacterTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e751f26-9b5b-4c10-843a-d784b5ea8538", + "id": "50183825", "metadata": {}, "outputs": [], "source": [ - "underlying_embeddings = OpenAIEmbeddings()" + "!pip install openai faiss-cpu" ] }, { "cell_type": "code", - "execution_count": null, - "id": "30743664-38f5-425d-8216-772b64e7f348", + "execution_count": 3, + "id": "9e4314d8-88ef-4f52-81ae-0be771168bb6", "metadata": {}, "outputs": [], "source": [ - "fs = LocalFileStore(\"./cache/\")\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.storage import LocalFileStore\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import FAISS\n", + "\n", + "underlying_embeddings = OpenAIEmbeddings()\n", + "\n", + "store = LocalFileStore(\"./cache/\")\n", "\n", "cached_embedder = CacheBackedEmbeddings.from_bytes_store(\n", - " underlying_embeddings, fs, namespace=underlying_embeddings.model\n", + " underlying_embeddings, store, namespace=underlying_embeddings.model\n", ")" ] }, @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "f9ad627f-ced2-4277-b336-2434f22f2c8a", "metadata": {}, "outputs": [ @@ -105,13 +105,13 @@ "[]" ] }, - "execution_count": 9, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(fs.yield_keys())" + "list(store.yield_keys())" ] }, { @@ -124,12 +124,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "cf958ac2-e60e-4668-b32c-8bb2d78b3c61", "metadata": {}, "outputs": [], "source": [ - "raw_documents = TextLoader(\"../state_of_the_union.txt\").load()\n", + "raw_documents = TextLoader(\"../../state_of_the_union.txt\").load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "documents = text_splitter.split_documents(raw_documents)" ] @@ -144,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "3a1d7bb8-3b72-4bb5-9013-cf7729caca61", "metadata": {}, "outputs": [ @@ -152,8 +152,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 608 ms, sys: 58.9 ms, total: 667 ms\n", - "Wall time: 1.3 s\n" + "CPU times: user 218 ms, sys: 29.7 ms, total: 248 ms\n", + "Wall time: 1.02 s\n" ] } ], @@ -172,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "714cb2e2-77ba-41a8-bb83-84e75342af2d", "metadata": {}, "outputs": [ @@ -180,8 +180,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 33.6 ms, sys: 3.96 ms, total: 37.6 ms\n", - "Wall time: 36.8 ms\n" + "CPU times: user 15.7 ms, sys: 2.22 ms, total: 18 ms\n", + "Wall time: 17.2 ms\n" ] } ], @@ -200,458 +200,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "f2ca32dd-3712-4093-942b-4122f3dc8a8e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['text-embedding-ada-002614d7cf6-46f1-52fa-9d3a-740c39e7a20e',\n", - " 'text-embedding-ada-0020fc1ede2-407a-5e14-8f8f-5642214263f5',\n", + "['text-embedding-ada-00217a6727d-8916-54eb-b196-ec9c9d6ca472',\n", + " 'text-embedding-ada-0025fc0d904-bd80-52da-95c9-441015bfb438',\n", " 'text-embedding-ada-002e4ad20ef-dfaa-5916-9459-f90c6d8e8159',\n", - " 'text-embedding-ada-002a5ef11e4-0474-5725-8d80-81c91943b37f',\n", - " 'text-embedding-ada-00281426526-23fe-58be-9e84-6c7c72c8ca9a']" + " 'text-embedding-ada-002ed199159-c1cd-5597-9757-f80498e8f17b',\n", + " 'text-embedding-ada-0021297d37a-2bc1-5e19-bf13-6c950f075062']" ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(fs.yield_keys())[:5]" + "list(store.yield_keys())[:5]" ] }, { "cell_type": "markdown", - "id": "564c9801-29f0-4452-aeac-527382e2c0e8", + "id": "c1a7fafd", "metadata": {}, "source": [ - "## In Memory\n", + "# Swapping the `ByteStore`\n", "\n", - "This section shows how to set up an in memory cache for embeddings. This type of cache is primarily \n", - "useful for unit tests or prototyping. Do **not** use this cache if you need to actually store the embeddings." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13bd1c5b-b7ba-4394-957c-7d5b5a841972", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "store = InMemoryStore()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d99885f-99e1-498c-904d-6db539ac9466", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "underlying_embeddings = OpenAIEmbeddings()\n", - "embedder = CacheBackedEmbeddings.from_bytes_store(\n", - " underlying_embeddings, store, namespace=underlying_embeddings.model\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "682eb5d4-0b7a-4dac-b8fb-3de4ca6e421c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10.9 ms, sys: 916 µs, total: 11.8 ms\n", - "Wall time: 159 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings = embedder.embed_documents([\"hello\", \"goodbye\"])" - ] - }, - { - "cell_type": "markdown", - "id": "95233026-147f-49d1-bd87-e1e8b88ebdbc", - "metadata": {}, - "source": [ - "The second time we try to embed the embedding time is only 2 ms because the embeddings are looked up in the cache." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f819c3ff-a212-4d06-a5f7-5eb1435c1feb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.67 ms, sys: 342 µs, total: 2.01 ms\n", - "Wall time: 2.01 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings_from_cache = embedder.embed_documents([\"hello\", \"goodbye\"])" + "In order to use a different `ByteStore`, just use it when creating your `CacheBackedEmbeddings`. Below, we create an equivalent cached embeddings object, except using the non-persistent `InMemoryByteStore` instead:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ec38fb72-90a9-4687-a483-c62c87d1f4dd", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings == embeddings_from_cache" - ] - }, - { - "cell_type": "markdown", - "id": "f6cbe100-8587-4830-b207-fb8b524a9854", + "execution_count": 9, + "id": "336a0538", "metadata": {}, - "source": [ - "## File system\n", - "\n", - "This section covers how to use a file system store." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0070271-0809-4528-97e0-2a88216846f3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fs = LocalFileStore(\"./test_cache/\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b20e9fe-f57f-4d7c-9f81-105c5f8726f4", - "metadata": { - "tags": [] - }, "outputs": [], "source": [ - "embedder2 = CacheBackedEmbeddings.from_bytes_store(\n", - " underlying_embeddings, fs, namespace=underlying_embeddings.model\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "630515fd-bf5c-4d9c-a404-9705308f3a2c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 6.89 ms, sys: 4.89 ms, total: 11.8 ms\n", - "Wall time: 184 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings = embedder2.embed_documents([\"hello\", \"goodbye\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30e6bb87-42c9-4d08-88ac-0d22c9c449a1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 0 ns, sys: 3.24 ms, total: 3.24 ms\n", - "Wall time: 2.84 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings = embedder2.embed_documents([\"hello\", \"goodbye\"])" - ] - }, - { - "cell_type": "markdown", - "id": "12ed5a45-8352-4e0f-8583-5537397f53c0", - "metadata": {}, - "source": [ - "Here are the embeddings that have been persisted to the directory `./test_cache`. \n", - "\n", - "Notice that the embedder takes a namespace parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "658e2914-05e9-44a3-a8fe-3fe17ca84039", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['text-embedding-ada-002e885db5b-c0bd-5fbc-88b1-4d1da6020aa5',\n", - " 'text-embedding-ada-0026ba52e44-59c9-5cc9-a084-284061b13c80']" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(fs.yield_keys())" - ] - }, - { - "cell_type": "markdown", - "id": "904c1d47", - "metadata": {}, - "source": [ - "## Upstash Redis Store" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0f9f212", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.storage.upstash_redis import UpstashRedisStore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45bf62e4", - "metadata": {}, - "outputs": [], - "source": [ - "from upstash_redis import Redis\n", + "from langchain.embeddings import CacheBackedEmbeddings\n", + "from langchain.storage import InMemoryByteStore\n", "\n", - "URL = \"\"\n", - "TOKEN = \"\"\n", + "store = InMemoryByteStore()\n", "\n", - "redis_client = Redis(url=URL, token=TOKEN)\n", - "store = UpstashRedisStore(client=redis_client, ttl=None, namespace=\"test-ns\")\n", - "\n", - "underlying_embeddings = OpenAIEmbeddings()\n", - "embedder = CacheBackedEmbeddings.from_bytes_store(\n", - " underlying_embeddings, store, namespace=underlying_embeddings.model\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3eac3504", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "embeddings = embedder.embed_documents([\"welcome\", \"goodbye\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "085dcd30", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "embeddings = embedder.embed_documents([\"welcome\", \"goodbye\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3570e83f", - "metadata": {}, - "outputs": [], - "source": [ - "list(store.yield_keys())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7dc8e51", - "metadata": {}, - "outputs": [], - "source": [ - "list(store.client.scan(0))" - ] - }, - { - "cell_type": "markdown", - "id": "cd5f5a96-6ffa-429d-aa82-00b3f6532871", - "metadata": {}, - "source": [ - "## Redis Store\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4879c134-141f-48a0-acfe-7d6f30253af0", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.storage import RedisStore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b2bb9a0-6549-4487-8532-29ab4ab7336f", - "metadata": {}, - "outputs": [], - "source": [ - "# For cache isolation can use a separate DB\n", - "# Or additional namepace\n", - "store = RedisStore(\n", - " redis_url=\"redis://localhost:6379\",\n", - " client_kwargs={\"db\": 2},\n", - " namespace=\"embedding_caches\",\n", - ")\n", - "\n", - "underlying_embeddings = OpenAIEmbeddings()\n", - "embedder = CacheBackedEmbeddings.from_bytes_store(\n", + "cached_embedder = CacheBackedEmbeddings.from_bytes_store(\n", " underlying_embeddings, store, namespace=underlying_embeddings.model\n", ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eca3cb99-2bb3-49d5-81f9-1dee03da4b8c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3.99 ms, sys: 0 ns, total: 3.99 ms\n", - "Wall time: 3.5 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings = embedder.embed_documents([\"hello\", \"goodbye\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "317ba5d8-89f9-462c-b807-ad4ef26e518b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 2.47 ms, sys: 767 µs, total: 3.24 ms\n", - "Wall time: 2.75 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "embeddings = embedder.embed_documents([\"hello\", \"goodbye\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a540317-5142-4491-9062-a097932b56e3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['text-embedding-ada-002e885db5b-c0bd-5fbc-88b1-4d1da6020aa5',\n", - " 'text-embedding-ada-0026ba52e44-59c9-5cc9-a084-284061b13c80']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(store.yield_keys())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd9b0d4a-f816-4dce-9dde-cde1ad9a65fb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[b'embedding_caches/text-embedding-ada-002e885db5b-c0bd-5fbc-88b1-4d1da6020aa5',\n", - " b'embedding_caches/text-embedding-ada-0026ba52e44-59c9-5cc9-a084-284061b13c80']" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(store.client.scan_iter())" - ] } ], "metadata": { @@ -670,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/sidebars.js b/docs/sidebars.js index f5062b100f5c6..8468b3216c6ac 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -111,6 +111,7 @@ module.exports = { { type: "category", label: "Callbacks", collapsed: true, items: [{type: "autogenerated", dirName: "integrations/callbacks" }], link: {type: "generated-index", slug: "integrations/callbacks" }}, { type: "category", label: "Chat loaders", collapsed: true, items: [{type: "autogenerated", dirName: "integrations/chat_loaders" }], link: {type: "generated-index", slug: "integrations/chat_loaders" }}, { type: "category", label: "Adapters", collapsed: true, items: [{type: "autogenerated", dirName: "integrations/adapters" }], link: {type: "generated-index", slug: "integrations/adapters" }}, + { type: "category", label: "Stores", collapsed: true, items: [{type: "autogenerated", dirName: "integrations/stores" }], link: {type: "doc", id: "integrations/stores/index" }}, ], link: { type: 'generated-index', diff --git a/libs/core/langchain_core/stores.py b/libs/core/langchain_core/stores.py index bae5adc2b8ef4..8363fca3891f9 100644 --- a/libs/core/langchain_core/stores.py +++ b/libs/core/langchain_core/stores.py @@ -51,3 +51,6 @@ def yield_keys( This method is allowed to return an iterator over either K or str depending on what makes more sense for the given store. """ + + +ByteStore = BaseStore[str, bytes] diff --git a/libs/langchain/langchain/embeddings/cache.py b/libs/langchain/langchain/embeddings/cache.py index 75f1992e867d9..e578129618117 100644 --- a/libs/langchain/langchain/embeddings/cache.py +++ b/libs/langchain/langchain/embeddings/cache.py @@ -15,7 +15,7 @@ from typing import Callable, List, Sequence, Union, cast from langchain_core.embeddings import Embeddings -from langchain_core.stores import BaseStore +from langchain_core.stores import BaseStore, ByteStore from langchain.storage.encoder_backed import EncoderBackedStore @@ -151,7 +151,7 @@ def embed_query(self, text: str) -> List[float]: def from_bytes_store( cls, underlying_embeddings: Embeddings, - document_embedding_cache: BaseStore[str, bytes], + document_embedding_cache: ByteStore, *, namespace: str = "", ) -> CacheBackedEmbeddings: diff --git a/libs/langchain/langchain/retrievers/multi_vector.py b/libs/langchain/langchain/retrievers/multi_vector.py index dcc81b554c363..267095f821c50 100644 --- a/libs/langchain/langchain/retrievers/multi_vector.py +++ b/libs/langchain/langchain/retrievers/multi_vector.py @@ -3,7 +3,7 @@ from langchain_core.documents import Document from langchain_core.retrievers import BaseRetriever -from langchain_core.stores import BaseStore +from langchain_core.stores import BaseStore, ByteStore from langchain_core.vectorstores import VectorStore from langchain.callbacks.manager import CallbackManagerForRetrieverRun @@ -38,7 +38,7 @@ def __init__( *, vectorstore: VectorStore, docstore: Optional[BaseStore[str, Document]] = None, - base_store: Optional[BaseStore[str, bytes]] = None, + base_store: Optional[ByteStore] = None, id_key: str = "doc_id", search_kwargs: Optional[dict] = None, search_type: SearchType = SearchType.similarity, diff --git a/libs/langchain/langchain/storage/__init__.py b/libs/langchain/langchain/storage/__init__.py index bf95c8b9d3989..5722213f93db9 100644 --- a/libs/langchain/langchain/storage/__init__.py +++ b/libs/langchain/langchain/storage/__init__.py @@ -9,16 +9,18 @@ from langchain.storage._lc_store import create_kv_docstore, create_lc_store from langchain.storage.encoder_backed import EncoderBackedStore from langchain.storage.file_system import LocalFileStore -from langchain.storage.in_memory import InMemoryStore +from langchain.storage.in_memory import InMemoryByteStore, InMemoryStore from langchain.storage.redis import RedisStore -from langchain.storage.upstash_redis import UpstashRedisStore +from langchain.storage.upstash_redis import UpstashRedisByteStore, UpstashRedisStore __all__ = [ "EncoderBackedStore", "InMemoryStore", + "InMemoryByteStore", "LocalFileStore", "RedisStore", "create_lc_store", "create_kv_docstore", + "UpstashRedisByteStore", "UpstashRedisStore", ] diff --git a/libs/langchain/langchain/storage/_lc_store.py b/libs/langchain/langchain/storage/_lc_store.py index c38f66cb72e6b..3574749e7590b 100644 --- a/libs/langchain/langchain/storage/_lc_store.py +++ b/libs/langchain/langchain/storage/_lc_store.py @@ -3,7 +3,7 @@ from langchain_core.documents import Document from langchain_core.load import Serializable, dumps, loads -from langchain_core.stores import BaseStore +from langchain_core.stores import BaseStore, ByteStore from langchain.storage.encoder_backed import EncoderBackedStore @@ -42,7 +42,7 @@ def _identity(x: str) -> str: def create_lc_store( - store: BaseStore[str, bytes], + store: ByteStore, *, key_encoder: Optional[Callable[[str], str]] = None, ) -> BaseStore[str, Serializable]: @@ -64,7 +64,7 @@ def create_lc_store( def create_kv_docstore( - store: BaseStore[str, bytes], + store: ByteStore, *, key_encoder: Optional[Callable[[str], str]] = None, ) -> BaseStore[str, Document]: diff --git a/libs/langchain/langchain/storage/file_system.py b/libs/langchain/langchain/storage/file_system.py index dadd3720bda0b..720acf085a133 100644 --- a/libs/langchain/langchain/storage/file_system.py +++ b/libs/langchain/langchain/storage/file_system.py @@ -2,12 +2,12 @@ from pathlib import Path from typing import Iterator, List, Optional, Sequence, Tuple, Union -from langchain_core.stores import BaseStore +from langchain_core.stores import ByteStore from langchain.storage.exceptions import InvalidKeyException -class LocalFileStore(BaseStore[str, bytes]): +class LocalFileStore(ByteStore): """BaseStore interface that works on the local file system. Examples: diff --git a/libs/langchain/langchain/storage/in_memory.py b/libs/langchain/langchain/storage/in_memory.py index 60d8ad5516c3e..03679a34909d9 100644 --- a/libs/langchain/langchain/storage/in_memory.py +++ b/libs/langchain/langchain/storage/in_memory.py @@ -3,12 +3,24 @@ This is a simple implementation of the BaseStore using a dictionary that is useful primarily for unit testing purposes. """ -from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple +from typing import ( + Any, + Dict, + Generic, + Iterator, + List, + Optional, + Sequence, + Tuple, + TypeVar, +) from langchain_core.stores import BaseStore +V = TypeVar("V") -class InMemoryStore(BaseStore[str, Any]): + +class InMemoryBaseStore(BaseStore[str, V], Generic[V]): """In-memory implementation of the BaseStore using a dictionary. Attributes: @@ -34,9 +46,9 @@ class InMemoryStore(BaseStore[str, Any]): def __init__(self) -> None: """Initialize an empty store.""" - self.store: Dict[str, Any] = {} + self.store: Dict[str, V] = {} - def mget(self, keys: Sequence[str]) -> List[Optional[Any]]: + def mget(self, keys: Sequence[str]) -> List[Optional[V]]: """Get the values associated with the given keys. Args: @@ -48,7 +60,7 @@ def mget(self, keys: Sequence[str]) -> List[Optional[Any]]: """ return [self.store.get(key) for key in keys] - def mset(self, key_value_pairs: Sequence[Tuple[str, Any]]) -> None: + def mset(self, key_value_pairs: Sequence[Tuple[str, V]]) -> None: """Set the values for the given keys. Args: @@ -67,7 +79,8 @@ def mdelete(self, keys: Sequence[str]) -> None: keys (Sequence[str]): A sequence of keys to delete. """ for key in keys: - self.store.pop(key, None) + if key in self.store: + del self.store[key] def yield_keys(self, prefix: Optional[str] = None) -> Iterator[str]: """Get an iterator over keys that match the given prefix. @@ -84,3 +97,7 @@ def yield_keys(self, prefix: Optional[str] = None) -> Iterator[str]: for key in self.store.keys(): if key.startswith(prefix): yield key + + +InMemoryStore = InMemoryBaseStore[Any] +InMemoryByteStore = InMemoryBaseStore[bytes] diff --git a/libs/langchain/langchain/storage/redis.py b/libs/langchain/langchain/storage/redis.py index d213f8cc051cb..3196ca8dfd7f8 100644 --- a/libs/langchain/langchain/storage/redis.py +++ b/libs/langchain/langchain/storage/redis.py @@ -1,11 +1,11 @@ from typing import Any, Iterator, List, Optional, Sequence, Tuple, cast -from langchain_core.stores import BaseStore +from langchain_core.stores import ByteStore from langchain.utilities.redis import get_client -class RedisStore(BaseStore[str, bytes]): +class RedisStore(ByteStore): """BaseStore implementation using Redis as the underlying store. Examples: diff --git a/libs/langchain/langchain/storage/upstash_redis.py b/libs/langchain/langchain/storage/upstash_redis.py index 7dc436ce33e2c..7fc49b49c768c 100644 --- a/libs/langchain/langchain/storage/upstash_redis.py +++ b/libs/langchain/langchain/storage/upstash_redis.py @@ -1,7 +1,7 @@ from typing import Any, Iterator, List, Optional, Sequence, Tuple, cast from langchain_core._api.deprecation import deprecated -from langchain_core.stores import BaseStore +from langchain_core.stores import BaseStore, ByteStore class _UpstashRedisStore(BaseStore[str, str]): @@ -130,7 +130,7 @@ class UpstashRedisStore(_UpstashRedisStore): """ -class UpstashRedisByteStore(BaseStore[str, bytes]): +class UpstashRedisByteStore(ByteStore): """ BaseStore implementation using Upstash Redis as the underlying store to store raw bytes. diff --git a/libs/langchain/tests/unit_tests/storage/test_imports.py b/libs/langchain/tests/unit_tests/storage/test_imports.py index 35554231b74ac..8c603c11dc9fb 100644 --- a/libs/langchain/tests/unit_tests/storage/test_imports.py +++ b/libs/langchain/tests/unit_tests/storage/test_imports.py @@ -3,10 +3,12 @@ EXPECTED_ALL = [ "EncoderBackedStore", "InMemoryStore", + "InMemoryByteStore", "LocalFileStore", "RedisStore", "create_lc_store", "create_kv_docstore", + "UpstashRedisByteStore", "UpstashRedisStore", ]