From a995b4c0ef2628fa489afed5f8dfe0e1dc62a622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Gotelli=20Ferenaz?= Date: Thu, 16 May 2024 20:17:22 -0300 Subject: [PATCH] vectorstore[minor]: Add support to filter by IS NULL and IS NOT NULL criteria (#40) Description: Add support to filter by IS NULL criteria in the metadata Issue: N/A Dependencies: N/A Twitter handle: @martinferenaz This PR fixes the problem that arises when you want to check whether a tag has a value (exists) in the metadata. Checking that requires an "IS NULL" comparison, whereas the "$eq" operator only matches when the tag exists and has a null value assigned. In addition, a test for the $nin condition was added, which exposed a bug; the bug was resolved by negating the in_ condition. --- examples/vectorstore.ipynb | 5 ++-- langchain_postgres/vectorstores.py | 14 +++++++++- .../fixtures/filtering_test_cases.py | 26 +++++++++++++++++++ tests/unit_tests/test_vectorstore.py | 15 +++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/examples/vectorstore.ipynb b/examples/vectorstore.ipynb index 257f403..1ae43b2 100644 --- a/examples/vectorstore.ipynb +++ b/examples/vectorstore.ipynb @@ -235,8 +235,8 @@ "\n", "The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n", "\n", - "| Operator | Meaning/Category |\n", - "|----------|-------------------------|\n", + "| Operator | Meaning/Category |\n", + "|-----------|-------------------------|\n", "| \\$eq | Equality (==) |\n", "| \\$ne | Inequality (!=) |\n", "| \\$lt | Less than (<) |\n", @@ -246,6 +246,7 @@ "| \\$in | Special Cased (in) |\n", "| \\$nin | Special Cased (not in) |\n", "| \\$between | Special Cased (between) |\n", + "| \\$exists | Special Cased (is null) |\n", "| \\$like | Text (like) |\n", "| \\$ilike | Text (case-insensitive like) |\n", "| \\$and | Logical (and) |\n", diff --git a/langchain_postgres/vectorstores.py b/langchain_postgres/vectorstores.py index
5bdb65e..2061a92 100644 --- a/langchain_postgres/vectorstores.py +++ b/langchain_postgres/vectorstores.py @@ -66,6 +66,7 @@ class DistanceStrategy(str, enum.Enum): "$in", "$nin", "$between", + "$exists", } TEXT_OPERATORS = { @@ -702,13 +703,24 @@ def _handle_field_filter( if operator in {"$in"}: return queried_field.in_([str(val) for val in filter_value]) elif operator in {"$nin"}: - return queried_field.nin_([str(val) for val in filter_value]) + return ~queried_field.in_([str(val) for val in filter_value]) elif operator in {"$like"}: return queried_field.like(filter_value) elif operator in {"$ilike"}: return queried_field.ilike(filter_value) else: raise NotImplementedError() + elif operator == "$exists": + if not isinstance(filter_value, bool): + raise ValueError( + "Expected a boolean value for $exists " + f"operator, but got: {filter_value}" + ) + condition = func.jsonb_exists( + self.EmbeddingStore.cmetadata, + field, + ) + return ~condition if filter_value else condition else: raise NotImplementedError() diff --git a/tests/unit_tests/fixtures/filtering_test_cases.py b/tests/unit_tests/fixtures/filtering_test_cases.py index 9dcca44..0fb7e3f 100644 --- a/tests/unit_tests/fixtures/filtering_test_cases.py +++ b/tests/unit_tests/fixtures/filtering_test_cases.py @@ -198,10 +198,16 @@ {"id": {"$between": (1, 1)}}, [1], ), + # Test in ( {"name": {"$in": ["adam", "bob"]}}, [1, 2], ), + # Test nin + ( + {"name": {"$nin": ["adam", "bob"]}}, + [3], + ), ] TYPE_5_FILTERING_TEST_CASES = [ @@ -216,3 +222,23 @@ [1, 3], ), ] + +TYPE_6_FILTERING_TEST_CASES = [ + # These involve the special operator $exists + ( + {"happiness": {"$exists": True}}, + [], + ), + ( + {"happiness": {"$exists": False}}, + [1, 2, 3], + ), + ( + {"sadness": {"$exists": True}}, + [3], + ), + ( + {"sadness": {"$exists": False}}, + [1, 2], + ), +] diff --git a/tests/unit_tests/test_vectorstore.py b/tests/unit_tests/test_vectorstore.py index 2a9aca5..7516968 100644 --- a/tests/unit_tests/test_vectorstore.py 
+++ b/tests/unit_tests/test_vectorstore.py @@ -17,6 +17,7 @@ TYPE_3_FILTERING_TEST_CASES, TYPE_4_FILTERING_TEST_CASES, TYPE_5_FILTERING_TEST_CASES, + TYPE_6_FILTERING_TEST_CASES, ) from tests.utils import VECTORSTORE_CONNECTION_STRING as CONNECTION_STRING @@ -484,6 +485,17 @@ def test_pgvector_with_with_metadata_filters_5( assert [doc.metadata["id"] for doc in docs] == expected_ids, test_filter +@pytest.mark.parametrize("test_filter, expected_ids", TYPE_6_FILTERING_TEST_CASES) +def test_pgvector_with_with_metadata_filters_6( + pgvector: PGVector, + test_filter: Dict[str, Any], + expected_ids: List[int], +) -> None: + """Test end to end construction and search.""" + docs = pgvector.similarity_search("meow", k=5, filter=test_filter) + assert [doc.metadata["id"] for doc in docs] == expected_ids, test_filter + + @pytest.mark.parametrize( "invalid_filter", [ @@ -496,6 +508,8 @@ def test_pgvector_with_with_metadata_filters_5( {"$and": {}}, {"$between": {}}, {"$eq": {}}, + {"$exists": {}}, + {"$exists": 1}, ], ) def test_invalid_filters(pgvector: PGVector, invalid_filter: Any) -> None: @@ -510,6 +524,7 @@ def test_validate_operators() -> None: "$and", "$between", "$eq", + "$exists", "$gt", "$gte", "$ilike",