From f51de64ec485b5771fb3b23b5054411b0f98586d Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Wed, 6 Nov 2024 10:28:00 +0100
Subject: [PATCH] fix: adapt our implementation to breaking changes in Chroma 0.5.17 (#1165)

* fix chroma breaking changes
* improve warning
* better warning
---
 integrations/chroma/pyproject.toml           |  2 +-
 .../document_stores/chroma/document_store.py |  7 +-
 .../document_stores/chroma/filters.py        |  8 +--
 .../chroma/tests/test_document_store.py      | 65 ++++++++++++++++++-
 4 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/integrations/chroma/pyproject.toml b/integrations/chroma/pyproject.toml
index 7f0943a30..cfe7a606e 100644
--- a/integrations/chroma/pyproject.toml
+++ b/integrations/chroma/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai", "chromadb>=0.5.0", "typing_extensions>=4.8.0"]
+dependencies = ["haystack-ai", "chromadb>=0.5.17", "typing_extensions>=4.8.0"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/chroma#readme"
diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
index 6a83937a4..439e4b144 100644
--- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
+++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
@@ -248,9 +248,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
 
             if doc.content is None:
                 logger.warning(
-                    "ChromaDocumentStore can only store the text field of Documents: "
-                    "'array', 'dataframe' and 'blob' will be dropped."
+                    "ChromaDocumentStore cannot store documents with `content=None`. "
+                    "`array`, `dataframe` and `blob` are not supported. "
+                    "Document with id %s will be skipped.",
+                    doc.id,
                 )
+                continue
             data = {"ids": [doc.id], "documents": [doc.content]}
 
             if doc.meta:
diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py
index 60046b6ad..df49da673 100644
--- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py
+++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from chromadb.api.types import validate_where, validate_where_document
 
@@ -34,8 +34,8 @@ class ChromaFilter:
     """
 
     ids: List[str]
-    where: Dict[str, Any]
-    where_document: Dict[str, Any]
+    where: Optional[Dict[str, Any]]
+    where_document: Optional[Dict[str, Any]]
 
 
 def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
@@ -80,7 +80,7 @@ def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter:
         msg = f"Invalid '{test_clause}' : {e}"
         raise ChromaDocumentStoreFilterError(msg) from e
 
-    return ChromaFilter(ids=ids, where=where, where_document=where_document)
+    return ChromaFilter(ids=ids, where=where or None, where_document=where_document or None)
 
 
 def _convert_filter_clause(filters: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py
index 987f6d8b7..ed815251e 100644
--- a/integrations/chroma/tests/test_document_store.py
+++ b/integrations/chroma/tests/test_document_store.py
@@ -13,9 +13,12 @@
 from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
 from haystack import Document
 from haystack.testing.document_store import (
+    TEST_EMBEDDING_1,
+    TEST_EMBEDDING_2,
     CountDocumentsTest,
     DeleteDocumentsTest,
     FilterDocumentsTest,
+    _random_embeddings,
 )
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
 
@@ -51,6 +54,67 @@ def document_store(self) -> ChromaDocumentStore:
             get_func.return_value = _TestEmbeddingFunction()
             return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1()))
 
+    @pytest.fixture
+    def filterable_docs(self) -> List[Document]:
+        """
+        This fixture has been copied from haystack/testing/document_store.py and modified to
+        remove the documents that don't have textual content, as Chroma does not support writing them.
+        """
+        documents = []
+        for i in range(3):
+            documents.append(
+                Document(
+                    content=f"A Foo Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "100",
+                        "chapter": "intro",
+                        "number": 2,
+                        "date": "1969-07-21T20:17:40",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"A Bar Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "123",
+                        "chapter": "abstract",
+                        "number": -2,
+                        "date": "1972-12-11T19:54:58",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"A Foobar Document {i}",
+                    meta={
+                        "name": f"name_{i}",
+                        "page": "90",
+                        "chapter": "conclusion",
+                        "number": -10,
+                        "date": "1989-11-09T17:53:00",
+                    },
+                    embedding=_random_embeddings(768),
+                )
+            )
+            documents.append(
+                Document(
+                    content=f"Document {i} without embedding",
+                    meta={"name": f"name_{i}", "no_embedding": True, "chapter": "conclusion"},
+                )
+            )
+            documents.append(
+                Document(content=f"Doc {i} with zeros emb", meta={"name": "zeros_doc"}, embedding=TEST_EMBEDDING_1)
+            )
+            documents.append(
+                Document(content=f"Doc {i} with ones emb", meta={"name": "ones_doc"}, embedding=TEST_EMBEDDING_2)
+            )
+        return documents
+
     def assert_documents_are_equal(self, received: List[Document], expected: List[Document]):
         """
         Assert that two lists of Documents are equal.
@@ -283,7 +347,6 @@ def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: Li
         )
 
     def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]):
-        filterable_docs = [doc for doc in filterable_docs if doc.content]  # remove documents without content
         document_store.write_documents(filterable_docs)
         filters = {
             "operator": "OR",