Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Update search query builder to support int and date range queries (#195)
Browse files Browse the repository at this point in the history
Fixes #164

Adds support to our search query builder layer to handle building
queries for filtering on ints and dates between specific date ranges.

Added ints and dates in the same ticket as the queries are essentially
the same, just ints vs dates. While it throws an exception if both start
and end values are None, I think the API request schema should also do
that so that the error is more relevant/accurate to the API, but I can
add that later (likely a lot more niche edge cases to handle for
requests).

Nothing too exciting with these queries, they work as expected and are
just simple ranges.

The test dataset is roughly accurate (turns out books didn't always have
exact release dates until the last ~20 years).

I also have tested these queries manually with the opportunity endpoints
fields, the following two queries would work (can be tested locally at
http://localhost:5601/app/dev_tools#/console ):
```
GET opportunity-index-alias/_search
{
  "size": 5,
  "query": {
    "bool": {
      "filter": [
          {
            "range": {
              "summary.post_date": {
                "gte": "2020-01-01",
                "lte": "2025-01-01"
              }
            }
          }
        ]
    }
  }
}

GET opportunity-index-alias/_search
{
  "size": 12,
  "query": {
    "bool": {
      "filter": [
          {
            "range": {
              "summary.award_floor": {
                "gte": 1234,
                "lte": 100000
              }
            }
          }
        ]
    }
  }
}
```
  • Loading branch information
chouinar authored and acouch committed Sep 18, 2024
1 parent cef4fef commit 2de430d
Show file tree
Hide file tree
Showing 2 changed files with 198 additions and 1 deletion.
57 changes: 57 additions & 0 deletions api/src/adapters/search/opensearch_query_builder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import typing

from src.pagination.pagination_models import SortDirection
Expand All @@ -17,6 +18,7 @@ class SearchQueryBuilder:
* Sorted by relevancy score descending
* Scored on titles containing "king"
* Where the author is one of Brandon Sanderson or J R.R. Tolkien
* With a page count between 300 and 1000
* Returning aggregate counts of books by those authors in the full results
This query could either be built manually and look like:
Expand Down Expand Up @@ -52,6 +54,12 @@ class SearchQueryBuilder:
"Brandon Sanderson",
"J R.R. Tolkien"
]
},
"range": {
"publication_date": {
"gte": 300,
"lte": 1000
}
}
}
]
Expand All @@ -75,6 +83,7 @@ class SearchQueryBuilder:
.sort_by([("relevancy", SortDirection.DESCENDING)])
.simple_query("king", fields=["title.keyword"])
.filter_terms("author.keyword", terms=["Brandon Sanderson", "J R.R. Tolkien"])
.filter_int_range("page_count", 300, 1000)
.aggregation_terms(aggregation_name="author", field_name="author.keyword", minimum_count=0)
.build()
"""
Expand Down Expand Up @@ -150,6 +159,54 @@ def filter_terms(self, field: str, terms: list) -> typing.Self:
self.filters.append({"terms": {field: terms}})
return self

def filter_int_range(
self, field: str, min_value: int | None, max_value: int | None
) -> typing.Self:
"""
For a given field, filter results to a range of integer values.
If min or max is not provided, the range is unbounded and only
affects the minimum or maximum possible value. At least one min or max value must be specified.
These filters do not affect the relevancy score, they are purely
a binary filter on the overall results.
"""
if min_value is None and max_value is None:
raise ValueError("Cannot use int range filter if both min and max are None")

range_filter = {}
if min_value is not None:
range_filter["gte"] = min_value
if max_value is not None:
range_filter["lte"] = max_value

self.filters.append({"range": {field: range_filter}})
return self

def filter_date_range(
self, field: str, start_date: datetime.date | None, end_date: datetime.date | None
) -> typing.Self:
"""
For a given field, filter results to a range of dates.
If start or end is not provided, the range is unbounded and only
affects the start or end date. At least one start or end date must be specified.
These filters do not affect the relevancy score, they are purely
a binary filter on the overall results.
"""
if start_date is None and end_date is None:
raise ValueError("Cannot use date range filter if both start and end are None")

range_filter = {}
if start_date is not None:
range_filter["gte"] = start_date.isoformat()
if end_date is not None:
range_filter["lte"] = end_date.isoformat()

self.filters.append({"range": {field: range_filter}})
return self

def aggregation_terms(
self, aggregation_name: str, field_name: str, size: int = 25, minimum_count: int = 1
) -> typing.Self:
Expand Down
142 changes: 141 additions & 1 deletion api/tests/src/adapters/search/test_opensearch_query_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import uuid
from datetime import date

import pytest

Expand All @@ -12,83 +13,95 @@
"author": "Brandon Sanderson",
"in_stock": True,
"page_count": 1007,
"publication_date": "2010-08-31",
}
WORDS_OF_RADIANCE = {
"id": 2,
"title": "Words of Radiance",
"author": "Brandon Sanderson",
"in_stock": False,
"page_count": 1087,
"publication_date": "2014-03-04",
}
OATHBRINGER = {
"id": 3,
"title": "Oathbringer",
"author": "Brandon Sanderson",
"in_stock": True,
"page_count": 1248,
"publication_date": "2017-11-14",
}
RHYTHM_OF_WAR = {
"id": 4,
"title": "Rhythm of War",
"author": "Brandon Sanderson",
"in_stock": False,
"page_count": 1232,
"publication_date": "2020-11-17",
}
GAME_OF_THRONES = {
"id": 5,
"title": "A Game of Thrones",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 694,
"publication_date": "1996-08-01",
}
CLASH_OF_KINGS = {
"id": 6,
"title": "A Clash of Kings",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 768,
"publication_date": "1998-11-16",
}
STORM_OF_SWORDS = {
"id": 7,
"title": "A Storm of Swords",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 973,
"publication_date": "2000-08-08",
}
FEAST_FOR_CROWS = {
"id": 8,
"title": "A Feast for Crows",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 753,
"publication_date": "2005-10-17",
}
DANCE_WITH_DRAGONS = {
"id": 9,
"title": "A Dance with Dragons",
"author": "George R.R. Martin",
"in_stock": False,
"page_count": 1056,
"publication_date": "2011-07-12",
}
FELLOWSHIP_OF_THE_RING = {
"id": 10,
"title": "The Fellowship of the Ring",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 423,
"publication_date": "1954-07-29",
}
TWO_TOWERS = {
"id": 11,
"title": "The Two Towers",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 352,
"publication_date": "1954-11-11",
}
RETURN_OF_THE_KING = {
"id": 12,
"title": "The Return of the King",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 416,
"publication_date": "1955-10-20",
}

FULL_DATA = [
Expand Down Expand Up @@ -120,7 +133,9 @@ def validate_valid_request(
f"Request generated was invalid and caused an error in search client: {json_value}"
)

assert resp.records == expected_results
assert (
resp.records == expected_results
), f"{[record['title'] for record in resp.records]} != {[expected['title'] for expected in expected_results]}"

if expected_aggregations is not None:
assert resp.aggregations == expected_aggregations
Expand Down Expand Up @@ -364,6 +379,131 @@ def test_query_builder_filter_terms(

validate_valid_request(search_client, search_index, builder, expected_results)

@pytest.mark.parametrize(
"start_date,end_date,expected_results",
[
# Date range that will include all results
(date(1900, 1, 1), date(2050, 1, 1), FULL_DATA),
# Start only date range that will get all results
(date(1950, 1, 1), None, FULL_DATA),
# End only date range that will get all results
(None, date(2025, 1, 1), FULL_DATA),
# Range that filters to just oldest
(
date(1950, 1, 1),
date(1960, 1, 1),
[FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING],
),
# Unbounded range for oldest few
(None, date(1990, 1, 1), [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]),
# Unbounded range for newest few
(date(2011, 8, 1), None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR]),
# Selecting a few in the middle
(
date(2005, 1, 1),
date(2014, 1, 1),
[WAY_OF_KINGS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS],
),
# Exact date
(date(1954, 7, 29), date(1954, 7, 29), [FELLOWSHIP_OF_THE_RING]),
# None fetched in range
(date(1981, 1, 1), date(1989, 1, 1), []),
],
)
def test_query_builder_filter_date_range(
self, search_client, search_index, start_date, end_date, expected_results
):
builder = (
SearchQueryBuilder()
.sort_by([])
.filter_date_range("publication_date", start_date, end_date)
)

expected_ranges = {}
if start_date is not None:
expected_ranges["gte"] = start_date.isoformat()
if end_date is not None:
expected_ranges["lte"] = end_date.isoformat()

expected_query = {
"size": 25,
"from": 0,
"track_scores": True,
"query": {"bool": {"filter": [{"range": {"publication_date": expected_ranges}}]}},
}

assert builder.build() == expected_query

validate_valid_request(search_client, search_index, builder, expected_results)

@pytest.mark.parametrize(
"min_value,max_value,expected_results",
[
# All fetched
(1, 2000, FULL_DATA),
# None fetched
(2000, 3000, []),
# "Short" books
(300, 700, [GAME_OF_THRONES, FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]),
# Unbounded short
(None, 416, [TWO_TOWERS, RETURN_OF_THE_KING]),
# Unbounded long
(1050, None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR, DANCE_WITH_DRAGONS]),
# Middle length
(
500,
1010,
[WAY_OF_KINGS, GAME_OF_THRONES, CLASH_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS],
),
],
)
def test_query_builder_filter_int_range(
self, search_client, search_index, min_value, max_value, expected_results
):
builder = (
SearchQueryBuilder().sort_by([]).filter_int_range("page_count", min_value, max_value)
)

expected_ranges = {}
if min_value is not None:
expected_ranges["gte"] = min_value
if max_value is not None:
expected_ranges["lte"] = max_value

expected_query = {
"size": 25,
"from": 0,
"track_scores": True,
"query": {"bool": {"filter": [{"range": {"page_count": expected_ranges}}]}},
}

assert builder.build() == expected_query

validate_valid_request(search_client, search_index, builder, expected_results)

def test_multiple_ranges(self, search_client, search_index):
# Sanity test that we can specify multiple ranges (in this case, a date + int range)
# in the same query
builder = (
SearchQueryBuilder()
.sort_by([])
.filter_int_range("page_count", 600, 1100)
.filter_date_range("publication_date", date(2000, 1, 1), date(2013, 1, 1))
)

expected_results = [WAY_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS]
validate_valid_request(
search_client, search_index, builder, expected_results=expected_results
)

def test_filter_int_range_both_none(self):
with pytest.raises(ValueError, match="Cannot use int range filter"):
SearchQueryBuilder().filter_int_range("test_field", None, None)

def test_filter_date_range_both_none(self):
with pytest.raises(ValueError, match="Cannot use date range filter"):
SearchQueryBuilder().filter_date_range("test_field", None, None)

@pytest.mark.parametrize(
"query,fields,expected_results,expected_aggregations",
[
Expand Down

0 comments on commit 2de430d

Please sign in to comment.