# Patch summary (free-text preamble placed before the first `diff --git` header).
#
# File 1: chroma/document_store.py
#   * Adds the module-level tuple SUPPORTED_TYPES_FOR_METADATA_VALUES
#     (str, int, float, bool).
#   * In the write_documents hunk, `doc.meta` is no longer passed through
#     unchanged. Each (key, value) pair is checked with isinstance() against
#     the tuple above: matching pairs are copied into `valid_meta`, the keys
#     of all other pairs are collected in `discarded_keys`.
#   * When at least one key was discarded, a single logger.warning is emitted
#     for that document, naming the document id, the discarded keys and the
#     supported type names.
#   * `data["metadatas"]` is only set when `valid_meta` is non-empty, so a
#     document whose meta items were all discarded is written without a
#     "metadatas" entry.
#   * NOTE(review): `", ".join(discarded_keys)` requires every discarded key
#     to be a str -- confirm that `meta` keys are always strings.
#
# File 2: chroma/tests/test_document_store.py
#   * Adds test_write_documents_unsupported_meta_values: writes three
#     documents (a dict value, a list value, an int value), reads them back
#     with filter_documents(), sorts them by content, and asserts that all
#     three ids are present, that the first two come back with empty meta and
#     that the third keeps {"ok": 123}.
#
# NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
# Blank context lines were placed so that every hunk matches the line counts
# in its `@@` header; the indentation depth of context and added lines is
# inferred from Python syntax -- confirm against the target files before
# applying.
diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
index 937d841f8..3ea84780f 100644
--- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
+++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py
@@ -19,6 +19,7 @@
 
 
 VALID_DISTANCE_FUNCTIONS = "l2", "cosine", "ip"
+SUPPORTED_TYPES_FOR_METADATA_VALUES = str, int, float, bool
 
 
 class ChromaDocumentStore:
@@ -226,7 +227,26 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
 
             data = {"ids": [doc.id], "documents": [doc.content]}
             if doc.meta:
-                data["metadatas"] = [doc.meta]
+                valid_meta = {}
+                discarded_keys = []
+
+                for k, v in doc.meta.items():
+                    if isinstance(v, SUPPORTED_TYPES_FOR_METADATA_VALUES):
+                        valid_meta[k] = v
+                    else:
+                        discarded_keys.append(k)
+
+                if discarded_keys:
+                    logger.warning(
+                        "Document %s contains `meta` values of unsupported types for the keys: %s. "
+                        "These items will be discarded. Supported types are: %s.",
+                        doc.id,
+                        ", ".join(discarded_keys),
+                        ", ".join([t.__name__ for t in SUPPORTED_TYPES_FOR_METADATA_VALUES]),
+                    )
+
+                if valid_meta:
+                    data["metadatas"] = [valid_meta]
 
             if doc.embedding is not None:
                 data["embeddings"] = [doc.embedding]
diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py
index 223bfd704..b05c9ccfc 100644
--- a/integrations/chroma/tests/test_document_store.py
+++ b/integrations/chroma/tests/test_document_store.py
@@ -107,6 +107,28 @@ def test_search(self):
         assert len(result) == 1
         assert result[0][0].content == "Third document"
 
+    def test_write_documents_unsupported_meta_values(self, document_store: ChromaDocumentStore):
+        """
+        Unsupported meta values should be removed from the documents before writing them to the database
+        """
+
+        docs = [
+            Document(content="test doc 1", meta={"invalid": {"dict": "value"}}),
+            Document(content="test doc 2", meta={"invalid": ["list", "value"]}),
+            Document(content="test doc 3", meta={"ok": 123}),
+        ]
+
+        document_store.write_documents(docs)
+
+        written_docs = document_store.filter_documents()
+        written_docs.sort(key=lambda x: x.content)
+
+        assert len(written_docs) == 3
+        assert [doc.id for doc in written_docs] == [doc.id for doc in docs]
+        assert written_docs[0].meta == {}
+        assert written_docs[1].meta == {}
+        assert written_docs[2].meta == {"ok": 123}
+
     @pytest.mark.integration
     def test_to_json(self, request):
         ds = ChromaDocumentStore(