Skip to content

Commit

Permalink
fix: ChromaDocumentStore - discard meta items when the type of their value is not supported in Chroma (#907)
Browse files Browse the repository at this point in the history

* discard invalid meta values

* reduce warnings
  • Loading branch information
anakin87 authored Jul 17, 2024
1 parent db2b5f7 commit 9893b56
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


# Names of the distance functions treated as valid.
VALID_DISTANCE_FUNCTIONS = ("l2", "cosine", "ip")
# Python types allowed for `meta` values; items of any other type are
# discarded (with a warning) when documents are written.
SUPPORTED_TYPES_FOR_METADATA_VALUES = (str, int, float, bool)


class ChromaDocumentStore:
Expand Down Expand Up @@ -226,7 +227,26 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
data = {"ids": [doc.id], "documents": [doc.content]}

if doc.meta:
data["metadatas"] = [doc.meta]
valid_meta = {}
discarded_keys = []

for k, v in doc.meta.items():
if isinstance(v, SUPPORTED_TYPES_FOR_METADATA_VALUES):
valid_meta[k] = v
else:
discarded_keys.append(k)

if discarded_keys:
logger.warning(
"Document %s contains `meta` values of unsupported types for the keys: %s. "
"These items will be discarded. Supported types are: %s.",
doc.id,
", ".join(discarded_keys),
", ".join([t.__name__ for t in SUPPORTED_TYPES_FOR_METADATA_VALUES]),
)

if valid_meta:
data["metadatas"] = [valid_meta]

if doc.embedding is not None:
data["embeddings"] = [doc.embedding]
Expand Down
22 changes: 22 additions & 0 deletions integrations/chroma/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ def test_search(self):
assert len(result) == 1
assert result[0][0].content == "Third document"

def test_write_documents_unsupported_meta_values(self, document_store: ChromaDocumentStore):
    """
    Meta items with unsupported value types are dropped on write, while the documents themselves are still stored
    """
    # One document per case: a dict value, a list value, and a supported int value.
    dict_meta_doc = Document(content="test doc 1", meta={"invalid": {"dict": "value"}})
    list_meta_doc = Document(content="test doc 2", meta={"invalid": ["list", "value"]})
    valid_meta_doc = Document(content="test doc 3", meta={"ok": 123})
    docs = [dict_meta_doc, list_meta_doc, valid_meta_doc]

    document_store.write_documents(docs)

    # Order the stored documents by content so they line up with `docs`.
    stored = sorted(document_store.filter_documents(), key=lambda d: d.content)

    assert len(stored) == 3

    expected_ids = [d.id for d in docs]
    assert [d.id for d in stored] == expected_ids

    # Only the supported int value is kept; the dict and list items are gone.
    assert [d.meta for d in stored] == [{}, {}, {"ok": 123}]

@pytest.mark.integration
def test_to_json(self, request):
ds = ChromaDocumentStore(
Expand Down

0 comments on commit 9893b56

Please sign in to comment.