From a00258ec12a39b279f33722647e3e934216063b5 Mon Sep 17 00:00:00 2001 From: Vinit Kudva Date: Tue, 17 Dec 2024 10:03:02 -0500 Subject: [PATCH] chroma: fix persistence if client_settings is passed in (#25199) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ent path given. Thank you for contributing to LangChain! - [ ] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [ ] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--------- Co-authored-by: Erick Friis Co-authored-by: Chester Curme --- libs/partners/chroma/.gitignore | 1 + .../chroma/langchain_chroma/vectorstores.py | 3 + .../integration_tests/test_vectorstores.py | 114 ++++++++++++++---- 3 files changed, 92 insertions(+), 26 deletions(-) diff --git a/libs/partners/chroma/.gitignore b/libs/partners/chroma/.gitignore index da0d250a6a8fd..aa64ea142f06f 100644 --- a/libs/partners/chroma/.gitignore +++ b/libs/partners/chroma/.gitignore @@ -1,2 +1,3 @@ __pycache__ */persist_dir +chroma/ diff --git a/libs/partners/chroma/langchain_chroma/vectorstores.py b/libs/partners/chroma/langchain_chroma/vectorstores.py index bc87f9e088f43..648ff439a3edb 100644 --- a/libs/partners/chroma/langchain_chroma/vectorstores.py +++ b/libs/partners/chroma/langchain_chroma/vectorstores.py @@ -318,6 +318,9 @@ def __init__( client_settings.persist_directory = ( persist_directory or client_settings.persist_directory ) + client_settings.is_persistent = ( + client_settings.persist_directory is not None + ) _client_settings = client_settings elif persist_directory: diff --git a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py index f7bed4cfa5588..7420a99ec5ef9 100644 --- a/libs/partners/chroma/tests/integration_tests/test_vectorstores.py +++ b/libs/partners/chroma/tests/integration_tests/test_vectorstores.py @@ -1,5 +1,7 @@ """Test Chroma functionality.""" +import os.path +import tempfile import uuid from typing import ( Generator, @@ -10,6 +12,7 @@ import pytest # type: ignore[import-not-found] import requests from chromadb.api.client import SharedSystemClient +from chromadb.api.segment import SegmentAPI from chromadb.api.types import Embeddable from langchain_core.documents import Document from langchain_core.embeddings.fake import FakeEmbeddings as Fak @@ -268,37 +271,96 @@ def test_chroma_search_filter_with_scores() -> None: def test_chroma_with_persistence() -> None: 
"""Test end to end construction and search, with persistence.""" - chroma_persist_dir = "./tests/persist_dir" - collection_name = "test_collection" - texts = ["foo", "bar", "baz"] - ids = [f"id_{i}" for i in range(len(texts))] + with tempfile.TemporaryDirectory() as chroma_persist_dir: + collection_name = "test_collection" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + + docsearch = Chroma.from_texts( + collection_name=collection_name, + texts=texts, + embedding=FakeEmbeddings(), + persist_directory=chroma_persist_dir, + ids=ids, + ) - docsearch = Chroma.from_texts( - collection_name=collection_name, - texts=texts, - embedding=FakeEmbeddings(), - persist_directory=chroma_persist_dir, - ids=ids, - ) + try: + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", id="id_0")] - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo", id="id_0")] + assert os.path.exists(chroma_persist_dir) - # Get a new VectorStore from the persisted directory - docsearch = Chroma( - collection_name=collection_name, - embedding_function=FakeEmbeddings(), - persist_directory=chroma_persist_dir, - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo", id="id_0")] + # Get a new VectorStore from the persisted directory + docsearch = Chroma( + collection_name=collection_name, + embedding_function=FakeEmbeddings(), + persist_directory=chroma_persist_dir, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", id="id_0")] - # Clean up - docsearch.delete_collection() + # Clean up + docsearch.delete_collection() + + # Persist doesn't need to be called again + # Data will be automatically persisted on object deletion + # Or on program exit + + finally: + # Need to stop the chroma system database and segment manager + # to be able to delete the files after testing + client = 
docsearch._client + assert isinstance(client, chromadb.ClientCreator) + assert isinstance(client._server, SegmentAPI) + client._server._sysdb.stop() + client._server._manager.stop() + + +def test_chroma_with_persistence_with_client_settings() -> None: + """Test end to end construction and search, with persistence.""" + with tempfile.TemporaryDirectory() as chroma_persist_dir: + client_settings = chromadb.config.Settings() + collection_name = "test_collection" + texts = ["foo", "bar", "baz"] + ids = [f"id_{i}" for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name=collection_name, + texts=texts, + embedding=FakeEmbeddings(), + persist_directory=chroma_persist_dir, + client_settings=client_settings, + ids=ids, + ) - # Persist doesn't need to be called again - # Data will be automatically persisted on object deletion - # Or on program exit + try: + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", id="id_0")] + + assert os.path.exists(chroma_persist_dir) + + # Get a new VectorStore from the persisted directory + docsearch = Chroma( + collection_name=collection_name, + embedding_function=FakeEmbeddings(), + persist_directory=chroma_persist_dir, + ) + output = docsearch.similarity_search("foo", k=1) + + # Clean up + docsearch.delete_collection() + + # Persist doesn't need to be called again + # Data will be automatically persisted on object deletion + # Or on program exit + + finally: + # Need to stop the chroma system database and segment manager + # to be able to delete the files after testing + client = docsearch._client + assert isinstance(client, chromadb.ClientCreator) + assert isinstance(client._server, SegmentAPI) + client._server._sysdb.stop() + client._server._manager.stop() def test_chroma_mmr() -> None: