From 2de430d35d010fd0997be696e413d0714bfc4571 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:50:30 -0400 Subject: [PATCH] Update search query builder to support int and date range queries (navapbc/simpler-grants-gov#195) Fixes #164 Adds support to our search query builder layer to handle building queries for filtering on ints and dates between specific date ranges. Added ints and dates in the same ticket as the queries are essentially the same, just ints vs dates. While it throws an exception if both start and end values are None, I think the API request schema should also do that so that the error is more relevant/accurate to the API, but I can add that later (likely a lot more niche edge cases to handle for requests). Nothing too exciting with these queries, they work as expected and are just simple ranges. The test dataset is roughly accurate (turns out books didn't always have exact release dates until the last ~20 years). I also have tested these queries manually with the opportunity endpoints fields, the following two queries would work (can be tested locally at http://localhost:5601/app/dev_tools#/console ): ``` GET opportunity-index-alias/_search { "size": 5, "query": { "bool": { "filter": [ { "range": { "summary.post_date": { "gte": "2020-01-01", "lte": "2025-01-01" } } } ] } } } GET opportunity-index-alias/_search { "size": 12, "query": { "bool": { "filter": [ { "range": { "summary.award_floor": { "gte": 1234, "lte": 100000 } } } ] } } } ``` --- .../search/opensearch_query_builder.py | 57 +++++++ .../search/test_opensearch_query_builder.py | 142 +++++++++++++++++- 2 files changed, 198 insertions(+), 1 deletion(-) diff --git a/api/src/adapters/search/opensearch_query_builder.py b/api/src/adapters/search/opensearch_query_builder.py index 4aa4e07e55..a7e900c4d1 100644 --- a/api/src/adapters/search/opensearch_query_builder.py +++ b/api/src/adapters/search/opensearch_query_builder.py @@ -1,3 +1,4 @@ +import datetime import typing from src.pagination.pagination_models import SortDirection @@ -17,6 +18,7 @@ class SearchQueryBuilder: * Sorted by relevancy score descending * Scored on titles containing "king" * Where the author is one of Brandon Sanderson or J R.R. Tolkien + * With a page count between 300 and 1000 * Returning aggregate counts of books by those authors in the full results This query could either be built manually and look like: @@ -52,6 +54,12 @@ class SearchQueryBuilder: "Brandon Sanderson", "J R.R. Tolkien" ] + }, + "range": { + "publication_date": { + "gte": 300, + "lte": 1000 + } } } ] @@ -75,6 +83,7 @@ class SearchQueryBuilder: .sort_by([("relevancy", SortDirection.DESCENDING)]) .simple_query("king", fields=["title.keyword"]) .filter_terms("author.keyword", terms=["Brandon Sanderson", "J R.R. Tolkien"]) + .filter_int_range("page_count", 300, 1000) .aggregation_terms(aggregation_name="author", field_name="author.keyword", minimum_count=0) .build() """ @@ -150,6 +159,54 @@ def filter_terms(self, field: str, terms: list) -> typing.Self: self.filters.append({"terms": {field: terms}}) return self + def filter_int_range( + self, field: str, min_value: int | None, max_value: int | None + ) -> typing.Self: + """ + For a given field, filter results to a range of integer values. + + If min or max is not provided, the range is unbounded and only + affects the minimum or maximum possible value. At least one min or max value must be specified. + + These filters do not affect the relevancy score, they are purely + a binary filter on the overall results. + """ + if min_value is None and max_value is None: + raise ValueError("Cannot use int range filter if both min and max are None") + + range_filter = {} + if min_value is not None: + range_filter["gte"] = min_value + if max_value is not None: + range_filter["lte"] = max_value + + self.filters.append({"range": {field: range_filter}}) + return self + + def filter_date_range( + self, field: str, start_date: datetime.date | None, end_date: datetime.date | None + ) -> typing.Self: + """ + For a given field, filter results to a range of dates. + + If start or end is not provided, the range is unbounded and only + affects the start or end date. At least one start or end date must be specified. + + These filters do not affect the relevancy score, they are purely + a binary filter on the overall results. + """ + if start_date is None and end_date is None: + raise ValueError("Cannot use date range filter if both start and end are None") + + range_filter = {} + if start_date is not None: + range_filter["gte"] = start_date.isoformat() + if end_date is not None: + range_filter["lte"] = end_date.isoformat() + + self.filters.append({"range": {field: range_filter}}) + return self + def aggregation_terms( self, aggregation_name: str, field_name: str, size: int = 25, minimum_count: int = 1 ) -> typing.Self: diff --git a/api/tests/src/adapters/search/test_opensearch_query_builder.py b/api/tests/src/adapters/search/test_opensearch_query_builder.py index 731c0e6caa..33f9b2b199 100644 --- a/api/tests/src/adapters/search/test_opensearch_query_builder.py +++ b/api/tests/src/adapters/search/test_opensearch_query_builder.py @@ -1,4 +1,5 @@ import uuid +from datetime import date import pytest @@ -12,6 +13,7 @@ "author": "Brandon Sanderson", "in_stock": True, "page_count": 1007, + "publication_date": "2010-08-31", } WORDS_OF_RADIANCE = { "id": 2, @@ -19,6 +21,7 @@ "author": "Brandon Sanderson", "in_stock": False, "page_count": 1087, + "publication_date": "2014-03-04", } OATHBRINGER = { "id": 3, @@ -26,6 +29,7 @@ "author": "Brandon Sanderson", "in_stock": True, "page_count": 1248, + "publication_date": "2017-11-14", } RHYTHM_OF_WAR = { "id": 4, @@ -33,6 +37,7 @@ "author": "Brandon Sanderson", "in_stock": False, "page_count": 1232, + "publication_date": "2020-11-17", } GAME_OF_THRONES = { "id": 5, @@ -40,6 +45,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 694, + "publication_date": "1996-08-01", } CLASH_OF_KINGS = { "id": 6, @@ -47,6 +53,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 768, + "publication_date": "1998-11-16", } STORM_OF_SWORDS = { "id": 7, @@ -54,6 +61,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 973, + "publication_date": "2000-08-08", } FEAST_FOR_CROWS = { "id": 8, @@ -61,6 +69,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 753, + "publication_date": "2005-10-17", } DANCE_WITH_DRAGONS = { "id": 9, @@ -68,6 +77,7 @@ "author": "George R.R. Martin", "in_stock": False, "page_count": 1056, + "publication_date": "2011-07-12", } FELLOWSHIP_OF_THE_RING = { "id": 10, @@ -75,6 +85,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 423, + "publication_date": "1954-07-29", } TWO_TOWERS = { "id": 11, @@ -82,6 +93,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 352, + "publication_date": "1954-11-11", } RETURN_OF_THE_KING = { "id": 12, @@ -89,6 +101,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 416, + "publication_date": "1955-10-20", } FULL_DATA = [ @@ -120,7 +133,9 @@ def validate_valid_request( f"Request generated was invalid and caused an error in search client: {json_value}" ) - assert resp.records == expected_results + assert ( + resp.records == expected_results + ), f"{[record['title'] for record in resp.records]} != {[expected['title'] for expected in expected_results]}" if expected_aggregations is not None: assert resp.aggregations == expected_aggregations @@ -364,6 +379,131 @@ def test_query_builder_filter_terms( validate_valid_request(search_client, search_index, builder, expected_results) + @pytest.mark.parametrize( + "start_date,end_date,expected_results", + [ + # Date range that will include all results + (date(1900, 1, 1), date(2050, 1, 1), FULL_DATA), + # Start only date range that will get all results + (date(1950, 1, 1), None, FULL_DATA), + # End only date range that will get all results + (None, date(2025, 1, 1), FULL_DATA), + # Range that filters to just oldest + ( + date(1950, 1, 1), + date(1960, 1, 1), + [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING], + ), + # Unbounded range for oldest few + (None, date(1990, 1, 1), [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded range for newest few + (date(2011, 8, 1), None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR]), + # Selecting a few in the middle + ( + date(2005, 1, 1), + date(2014, 1, 1), + [WAY_OF_KINGS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS], + ), + # Exact date + (date(1954, 7, 29), date(1954, 7, 29), [FELLOWSHIP_OF_THE_RING]), + # None fetched in range + (date(1981, 1, 1), date(1989, 1, 1), []), + ], + ) + def test_query_builder_filter_date_range( + self, search_client, search_index, start_date, end_date, expected_results + ): + builder = ( + SearchQueryBuilder() + .sort_by([]) + .filter_date_range("publication_date", start_date, end_date) + ) + + expected_ranges = {} + if start_date is not None: + expected_ranges["gte"] = start_date.isoformat() + if end_date is not None: + expected_ranges["lte"] = end_date.isoformat() + + expected_query = { + "size": 25, + "from": 0, + "track_scores": True, + "query": {"bool": {"filter": [{"range": {"publication_date": expected_ranges}}]}}, + } + + assert builder.build() == expected_query + + validate_valid_request(search_client, search_index, builder, expected_results) + + @pytest.mark.parametrize( + "min_value,max_value,expected_results", + [ + # All fetched + (1, 2000, FULL_DATA), + # None fetched + (2000, 3000, []), + # "Short" books + (300, 700, [GAME_OF_THRONES, FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded short + (None, 416, [TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded long + (1050, None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR, DANCE_WITH_DRAGONS]), + # Middle length + ( + 500, + 1010, + [WAY_OF_KINGS, GAME_OF_THRONES, CLASH_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS], + ), + ], + ) + def test_query_builder_filter_int_range( + self, search_client, search_index, min_value, max_value, expected_results + ): + builder = ( + SearchQueryBuilder().sort_by([]).filter_int_range("page_count", min_value, max_value) + ) + + expected_ranges = {} + if min_value is not None: + expected_ranges["gte"] = min_value + if max_value is not None: + expected_ranges["lte"] = max_value + + expected_query = { + "size": 25, + "from": 0, + "track_scores": True, + "query": {"bool": {"filter": [{"range": {"page_count": expected_ranges}}]}}, + } + + assert builder.build() == expected_query + + validate_valid_request(search_client, search_index, builder, expected_results) + + def test_multiple_ranges(self, search_client, search_index): + # Sanity test that we can specify multiple ranges (in this case, a date + int range) + # in the same query + builder = ( + SearchQueryBuilder() + .sort_by([]) + .filter_int_range("page_count", 600, 1100) + .filter_date_range("publication_date", date(2000, 1, 1), date(2013, 1, 1)) + ) + + expected_results = [WAY_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS] + validate_valid_request( + search_client, search_index, builder, expected_results=expected_results + ) + + def test_filter_int_range_both_none(self): + with pytest.raises(ValueError, match="Cannot use int range filter"): + SearchQueryBuilder().filter_int_range("test_field", None, None) + + def test_filter_date_range_both_none(self): + with pytest.raises(ValueError, match="Cannot use date range filter"): + SearchQueryBuilder().filter_date_range("test_field", None, None) + @pytest.mark.parametrize( "query,fields,expected_results,expected_aggregations", [