diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index 3353ed5aa..49729c0d4 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -2,16 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 import logging -from collections import defaultdict -from typing import Any, Dict, List, Literal, Optional, Tuple +from typing import Any, Dict, List, Literal, Optional import chromadb -from chromadb.api.types import GetResult, QueryResult, validate_where, validate_where_document +from chromadb.api.types import GetResult, QueryResult from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document from haystack.document_stores.types import DuplicatePolicy -from .errors import ChromaDocumentStoreFilterError +from .filters import _convert_filters from .utils import get_embedding_function logger = logging.getLogger(__name__) @@ -114,83 +113,74 @@ def count_documents(self) -> int: def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ - Returns the documents that match the filters provided. - - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. 
If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. - - Example: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } + Returns the documents that match the filters provided. + + Filters can be provided as a dictionary supporting filtering by ids, metadata, and document content. + Metadata filters should use the `"meta.<metadata_key>"` syntax, while content-based filters + use the `"content"` field directly. + Content filters support the `contains` and `not contains` operators, + while id filters only support the `==` operator. + + Due to Chroma's distinction between metadata filters and document filters, filters with `"field": "content"` + (i.e., document content filters) and metadata fields must be supplied separately. For details on chroma filters, + see the [Chroma documentation](https://docs.trychroma.com/guides). + + Example: + + ```python + filter_1 = { + "operator": "AND", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.number", "operator": "not in", "value": [2, 9]}, + ], } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" + filter_2 = { + "operator": "AND", + "conditions": [ + {"field": "content", "operator": "contains", "value": "FOO"}, + {"field": "content", "operator": "not contains", "value": "BAR"}, + ], } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. 
- - Example: - - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` - - :param filters: the filters to apply to the document list. - :returns: a list of Documents that match the given filters. + ``` + + If you need to apply the same logical operator (e.g., "AND", "OR") to multiple conditions at the same level, + you can provide a list of dictionaries as the value for the operator, like in the example below: + + ```python + filters = { + "operator": "OR", + "conditions": [ + {"field": "meta.author", "operator": "==", "value": "author_1"}, + { + "operator": "AND", + "conditions": [ + {"field": "meta.tag", "operator": "==", "value": "tag_1"}, + {"field": "meta.page", "operator": ">", "value": 100}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.tag", "operator": "==", "value": "tag_2"}, + {"field": "meta.page", "operator": ">", "value": 200}, + ], + }, + ], + } + ``` + + :param filters: the filters to apply to the document list. + :returns: a list of Documents that match the given filters. 
""" if filters: - ids, where, where_document = self._normalize_filters(filters) - kwargs: Dict[str, Any] = {"where": where} + chroma_filter = _convert_filters(filters) + kwargs: Dict[str, Any] = {"where": chroma_filter.where} - if ids: - kwargs["ids"] = ids - if where_document: - kwargs["where_document"] = where_document + if chroma_filter.ids: + kwargs["ids"] = chroma_filter.ids + if chroma_filter.where_document: + kwargs["where_document"] = chroma_filter.where_document result = self._collection.get(**kwargs) else: @@ -285,12 +275,12 @@ def search(self, queries: List[str], top_k: int, filters: Optional[Dict[str, Any include=["embeddings", "documents", "metadatas", "distances"], ) else: - chroma_filters = self._normalize_filters(filters=filters) + chroma_filters = _convert_filters(filters=filters) results = self._collection.query( query_texts=queries, n_results=top_k, - where=chroma_filters[1], - where_document=chroma_filters[2], + where=chroma_filters.where, + where_document=chroma_filters.where_document, include=["embeddings", "documents", "metadatas", "distances"], ) @@ -316,12 +306,12 @@ def search_embeddings( include=["embeddings", "documents", "metadatas", "distances"], ) else: - chroma_filters = self._normalize_filters(filters=filters) + chroma_filters = _convert_filters(filters=filters) results = self._collection.query( query_embeddings=query_embeddings, n_results=top_k, - where=chroma_filters[1], - where_document=chroma_filters[2], + where=chroma_filters.where, + where_document=chroma_filters.where_document, include=["embeddings", "documents", "metadatas", "distances"], ) @@ -355,62 +345,6 @@ def to_dict(self) -> Dict[str, Any]: **self._embedding_function_params, ) - @staticmethod - def _normalize_filters(filters: Dict[str, Any]) -> Tuple[List[str], Dict[str, Any], Dict[str, Any]]: - """ - Translate Haystack filters to Chroma filters. It returns three dictionaries, to be - passed to `ids`, `where` and `where_document` respectively. 
- """ - if not isinstance(filters, dict): - msg = "'filters' parameter must be a dictionary" - raise ChromaDocumentStoreFilterError(msg) - - ids = [] - where = defaultdict(list) - where_document = defaultdict(list) - keys_to_remove = [] - - for field, value in filters.items(): - if field == "content": - # Schedule for removal the original key, we're going to change it - keys_to_remove.append(field) - where_document["$contains"] = value - elif field == "id": - # Schedule for removal the original key, we're going to change it - keys_to_remove.append(field) - ids.append(value) - elif isinstance(value, (list, tuple)): - # Schedule for removal the original key, we're going to change it - keys_to_remove.append(field) - - # if the list is empty the filter is invalid, let's just remove it - if len(value) == 0: - continue - - # if the list has a single item, just make it a regular key:value filter pair - if len(value) == 1: - where[field] = value[0] - continue - - # if the list contains multiple items, we need an $or chain - for v in value: - where["$or"].append({field: v}) - - for k in keys_to_remove: - del filters[k] - - final_where = dict(filters) - final_where.update(dict(where)) - try: - if final_where: - validate_where(final_where) - if where_document: - validate_where_document(where_document) - except ValueError as e: - raise ChromaDocumentStoreFilterError(e) from e - - return ids, final_where, where_document - @staticmethod def _get_result_to_documents(result: GetResult) -> List[Document]: """ diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py new file mode 100644 index 000000000..ef5c920a7 --- /dev/null +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/filters.py @@ -0,0 +1,163 @@ +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List + +from chromadb.api.types 
import validate_where, validate_where_document + +from .errors import ChromaDocumentStoreFilterError + +OPERATORS = { + "==": "$eq", + "!=": "$ne", + ">": "$gt", + ">=": "$gte", + "<": "$lt", + "<=": "$lte", + "in": "$in", + "not in": "$nin", + "AND": "$and", + "OR": "$or", + "contains": "$contains", + "not contains": "$not_contains", +} + + +@dataclass +class ChromaFilter: + """ + Dataclass to store the converted filter structure used in Chroma queries. + + The following filter criteria are supported: + - `ids`: A list of document IDs to filter by in Chroma collection. + - `where`: A dictionary of metadata filters applied to the documents. + - `where_document`: A dictionary of content-based filters applied to the documents' content. + """ + + ids: List[str] + where: Dict[str, Any] + where_document: Dict[str, Any] + + +def _convert_filters(filters: Dict[str, Any]) -> ChromaFilter: + """ + Converts Haystack filters into a format compatible with Chroma, separating them into ids, metadata filters, + and content filters to be passed to chroma as ids, where, and where_document clauses respectively. + """ + + ids = [] + where: Dict[str, Any] = defaultdict(list) + where_document: Dict[str, Any] = defaultdict(list) + + converted_filters = _convert_filter_clause(filters) + for field, value in converted_filters.items(): + if value is None: + continue + + # Chroma differentiates between metadata filters and content filters, + # with each filter applying to only one type. + # If 'where_document' is populated, it's a content filter. + # In this case, we skip further processing of metadata filters for this field. 
+ where_document.update(_create_where_document_filter(field, value)) + if where_document: + continue + # if field is "id", it'll be passed to Chroma's ids filter + elif field == "id": + if not value["$eq"]: + msg = f"id filter only supports '==' operator, got {value}" + raise ChromaDocumentStoreFilterError(msg) + ids.append(value["$eq"]) + else: + where[field] = value + + try: + if where_document: + test_clause = "document content filter" + validate_where_document(where_document) + elif where: + test_clause = "metadata filter" + validate_where(where) + except ValueError as e: + msg = f"Invalid '{test_clause}' : {e}" + raise ChromaDocumentStoreFilterError(msg) from e + + return ChromaFilter(ids=ids, where=where, where_document=where_document) + + +def _convert_filter_clause(filters: Dict[str, Any]) -> Dict[str, Any]: + """ + Converts Haystack filters to Chroma compatible filters. + """ + converted_clauses = {} + + if "field" in filters: + converted_clauses.update(_parse_comparison_condition(filters)) + else: + converted_clauses.update(_parse_logical_condition(filters)) + + return converted_clauses + + +def _create_where_document_filter(field: str, value: Dict[Any, Any]) -> Dict[str, Any]: + """ + Method to check if given haystack filter is a document filter + and converts it to Chroma-compatible where_document filter. 
+ + """ + where_document: Dict[str, List[Any]] = defaultdict(list) + + # Create a single document filter for the content field + if field == "content": + return value + # In case of a logical operator, check if the given filters contain "content" + # Then combine the filters into a single where_document filter to pass to Chroma + if field in ["$and", "$or"] and value[0].get("content"): + # Use list comprehension to populate the field without modifying the original structure + document_filters = [ + _create_where_document_filter(k, v) for v in value if isinstance(v, dict) for k, v in v.items() + ] + where_document[field] = document_filters + + return where_document + + +def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise ChromaDocumentStoreFilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise ChromaDocumentStoreFilterError(msg) + + operator = condition["operator"] + conditions = [_convert_filter_clause(c) for c in condition["conditions"]] + + if operator not in OPERATORS: + msg = f"Unknown operator {operator}" + raise ChromaDocumentStoreFilterError(msg) + return {OPERATORS[operator]: conditions} + + +def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "field" not in condition: + msg = f"'field' key missing in {condition}" + raise ChromaDocumentStoreFilterError(msg) + field: str = "" + # remove the "meta." 
prefix from the field name + if condition["field"].startswith("meta."): + field = condition["field"].split(".")[-1] + else: + field = condition["field"] + + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise ChromaDocumentStoreFilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise ChromaDocumentStoreFilterError(msg) + operator: str = condition["operator"] + value: Any = condition["value"] + + if operator not in OPERATORS: + msg = f"Unknown operator {operator}. Valid operators are: {list(OPERATORS.keys())}" + raise ChromaDocumentStoreFilterError(msg) + return {field: {OPERATORS[operator]: value}} diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 5a7e12b3d..cd20bd398 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -14,7 +14,7 @@ from haystack.testing.document_store import ( CountDocumentsTest, DeleteDocumentsTest, - LegacyFilterDocumentsTest, + FilterDocumentsTest, ) from haystack_integrations.document_stores.chroma import ChromaDocumentStore @@ -32,7 +32,7 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec return [np.random.default_rng().uniform(-1, 1, 768).tolist()] -class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. 
@@ -66,17 +66,6 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do assert doc_received.content == doc_expected.content assert doc_received.meta == doc_expected.meta - def test_ne_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - """ - We customize this test because Chroma consider "not equal" true when - a field is missing - """ - document_store.write_documents(filterable_docs) - result = document_store.filter_documents(filters={"page": {"$ne": "100"}}) - self.assert_documents_are_equal( - result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"] - ) - def test_delete_empty(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma @@ -90,8 +79,8 @@ def test_delete_not_empty_nonexisting(self, document_store: ChromaDocumentStore) doc = Document(content="test doc") document_store.write_documents([doc]) document_store.delete_documents(["non_existing"]) - - assert document_store.filter_documents(filters={"id": doc.id}) == [doc] + filters = {"operator": "==", "field": "id", "value": doc.id} + assert document_store.filter_documents(filters=filters) == [doc] def test_search(self): document_store = ChromaDocumentStore() @@ -234,88 +223,146 @@ def test_metadata_initialization(self, caplog): assert store._collection.metadata["hnsw:space"] == "ip" assert new_store._collection.metadata["hnsw:space"] == "ip" - @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") - def test_filter_document_dataframe(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Filter on table contents is not supported.") - def test_eq_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + def test_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + document_store.write_documents(filterable_docs) + 
filters = {"field": "content", "operator": "contains", "value": "FOO"} + result = document_store.filter_documents(filters=filters) + self.assert_documents_are_equal( + result, + [doc for doc in filterable_docs if doc.content and "FOO" in doc.content], + ) - @pytest.mark.skip(reason="Filter on embedding value is not supported.") - def test_eq_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + def test_multiple_contains(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + document_store.write_documents(filterable_docs) + filters = { + "operator": "OR", + "conditions": [ + {"field": "content", "operator": "contains", "value": "FOO"}, + {"field": "content", "operator": "not contains", "value": "BAR"}, + ], + } + result = document_store.filter_documents(filters=filters) + self.assert_documents_are_equal( + result, + [doc for doc in filterable_docs if doc.content and ("FOO" in doc.content or "BAR" not in doc.content)], + ) - @pytest.mark.skip(reason="$in operator is not supported. 
Filter on table contents is not supported.") - def test_in_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + def test_nested_logical_filters(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + document_store.write_documents(filterable_docs) + filters = { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + { + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "!=", "value": 0}, + {"field": "meta.page", "operator": "==", "value": "123"}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.chapter", "operator": "==", "value": "conclusion"}, + {"field": "meta.date", "operator": "==", "value": "1989-11-09T17:53:00"}, + ], + }, + ], + } + result = document_store.filter_documents(filters=filters) + self.assert_documents_are_equal( + result, + [ + doc + for doc in filterable_docs + if ( + # Ensure all required fields are present in doc.meta + ("name" in doc.meta and doc.meta.get("name") == "name_0") + or ( + all(key in doc.meta for key in ["number", "page"]) + and doc.meta.get("number") != 0 + and doc.meta.get("page") == "123" + ) + or ( + all(key in doc.meta for key in ["date", "chapter"]) + and doc.meta.get("chapter") == "conclusion" + and doc.meta.get("date") == "1989-11-09T17:53:00" + ) + ) + ], + ) - @pytest.mark.skip(reason="$in operator is not supported.") - def test_in_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + # Override inequality tests from FilterDocumentsTest + # because chroma doesn't return documents with absent meta fields - @pytest.mark.skip(reason="Filter on table contents is not supported.") - def test_ne_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + def test_comparison_not_equal(self, document_store, filterable_docs): + """Test filter_documents() with != comparator""" + 
document_store.write_documents(filterable_docs) + result = document_store.filter_documents({"field": "meta.number", "operator": "!=", "value": 100}) + self.assert_documents_are_equal( + result, [d for d in filterable_docs if "number" in d.meta and d.meta.get("number") != 100] + ) - @pytest.mark.skip(reason="Filter on embedding value is not supported.") - def test_ne_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): - pass + def test_comparison_not_in(self, document_store, filterable_docs): + """Test filter_documents() with 'not in' comparator""" + document_store.write_documents(filterable_docs) + result = document_store.filter_documents({"field": "meta.number", "operator": "not in", "value": [2, 9]}) + self.assert_documents_are_equal( + result, [d for d in filterable_docs if "number" in d.meta and d.meta.get("number") not in [2, 9]] + ) - @pytest.mark.skip(reason="$nin operator is not supported. Filter on table contents is not supported.") - def test_nin_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") + def test_comparison_equal_with_dataframe( + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] + ): pass - @pytest.mark.skip(reason="$nin operator is not supported. 
Filter on embedding value is not supported.") - def test_nin_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") + def test_comparison_not_equal_with_dataframe( + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] + ): pass - @pytest.mark.skip(reason="$nin operator is not supported.") - def test_nin_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_equal_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_simple_implicit_and_with_multi_key_dict( - self, document_store: ChromaDocumentStore, filterable_docs: List[Document] - ): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_not_equal_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_simple_explicit_and_with_list( - self, document_store: ChromaDocumentStore, filterable_docs: List[Document] - ): + @pytest.mark.skip(reason="Chroma does not support comparison with dates") + def test_comparison_greater_than_with_iso_date(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_simple_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_greater_than_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_nested_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support 
comparison with dates") + def test_comparison_greater_than_equal_with_iso_date(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_simple_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_greater_than_equal_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_nested_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with dates") + def test_comparison_less_than_with_iso_date(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter on table contents is not supported.") - def test_filter_nested_and_or_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_less_than_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_nested_and_or_implicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with dates") + def test_comparison_less_than_equal_with_iso_date(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_nested_or_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): + @pytest.mark.skip(reason="Chroma does not support comparison with null values") + def test_comparison_less_than_equal_with_none(self, document_store, filterable_docs): pass - @pytest.mark.skip(reason="Filter syntax not supported.") - def test_filter_nested_multiple_identical_operators_same_level( - self, document_store: 
ChromaDocumentStore, filterable_docs: List[Document] - ): + @pytest.mark.skip(reason="Chroma does not support not operator") + def test_not_operator(self, document_store, filterable_docs): pass