From e8b3f44d8ea406c2f493f3c233ebf1c99964fad8 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:15:24 -0400 Subject: [PATCH] [Issue #2091] Setup utils for creating requests and parsing responses from search (navapbc/simpler-grants-gov#54) Fixes #2091 Created utilities for building requests to OpenSearch and parsing the responses into more manageable objects. Also added logic for configuring indexes to use a different tokenizer + stemmer, since the defaults aren't great. The search queries we need to create aren't that complex, but they are large, deeply nested objects. To help with this, I built a few generic utilities that construct the request via a builder you pass each component of the search request into. That way, when the API gets built out next, the search logic is really just taking our requests and passing the details to the builder, which is pretty trivial. Responses are a bit less complex, just very nested, so I added a simple wrapper that lets users of the search client do a lot less indexing into dictionaries (e.g. to access the response objects I was previously doing `values = [record["_source"] for record in response["hits"]["hits"]]`, which is fun). That logic safely parses the responses in a very generic manner. Note that in both cases there is plenty we don't handle yet (a ton of other request params, for example); we can add those as needed. For now I focused on the ones we need. --- One unrelated change made it in here: adjusting how analysis is done on an index. In short, the default tokenization/stemming of words was clunky for our use case. For example, the default stemmer treats `king` and `kings` as separate words. I adjusted the stemmer to use the [snowball stemmer](https://snowballstem.org/), which seems to work a bit better, although we should definitely investigate this further. I also changed the tokenization to split on whitespace, as the default splits on dashes, and a lot of terms in our system have dashes (opportunity number and agency most prominently). Since this logic is potentially going to be shared across many components (if we ever build out more search indexes), I tried to document it pretty thoroughly with links to a lot of documentation.
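To make the intended usage concrete, here is a rough sketch of how the pieces fit together (the import paths follow this PR's file layout, but the index name, field names, and query below are made up for illustration):

```python
# Illustrative only - the index and field names here are hypothetical, not part of this change.
from src.adapters.search.opensearch_client import SearchClient  # SearchClient path assumed from this PR's layout
from src.adapters.search.opensearch_query_builder import SearchQueryBuilder
from src.pagination.pagination_models import SortDirection

client = SearchClient()

# Build the large nested request object without writing the JSON by hand
search_query = (
    SearchQueryBuilder()
    .pagination(page_size=25, page_number=1)
    .sort_by([("relevancy", SortDirection.DESCENDING)])
    .simple_query("grant", fields=["opportunity_title^4", "opportunity_number"])
    .filter_terms("agency.keyword", ["USAID"])
    .aggregation_terms(aggregation_name="agency", field_name="agency.keyword", minimum_count=0)
    .build()
)

# The wrapped response exposes the un-nested pieces directly
response = client.search("opportunity-index-alias", search_query)
print(response.total_records)
for record in response.records:
    print(record["opportunity_id"], record.get("relevancy_score"))
print(response.aggregations)  # e.g. {"agency": {"USAID": 3}}

# search_raw() still returns the raw dict if you need it
raw = client.search_raw("opportunity-index-alias", search_query)
```

If an index needs different analysis settings than the new default, `create_index` also accepts an `analysis` dict to override them.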
--- api/src/adapters/search/opensearch_client.py | 56 +- .../search/opensearch_query_builder.py | 217 ++++++++ .../adapters/search/opensearch_response.py | 125 +++++ api/src/pagination/pagination_models.py | 5 + .../adapters/search/test_opensearch_client.py | 10 +- .../search/test_opensearch_query_builder.py | 518 ++++++++++++++++++ .../test_load_opportunities_to_index.py | 18 +- 7 files changed, 924 insertions(+), 25 deletions(-) create mode 100644 api/src/adapters/search/opensearch_query_builder.py create mode 100644 api/src/adapters/search/opensearch_response.py create mode 100644 api/tests/src/adapters/search/test_opensearch_query_builder.py diff --git a/api/src/adapters/search/opensearch_client.py b/api/src/adapters/search/opensearch_client.py index b93d33917f..b2e5b2bea2 100644 --- a/api/src/adapters/search/opensearch_client.py +++ b/api/src/adapters/search/opensearch_client.py @@ -4,9 +4,34 @@ import opensearchpy from src.adapters.search.opensearch_config import OpensearchConfig, get_opensearch_config +from src.adapters.search.opensearch_response import SearchResponse logger = logging.getLogger(__name__) +# By default, we'll override the default analyzer+tokenization +# for a search index. You can provide your own when calling create_index +DEFAULT_INDEX_ANALYSIS = { + "analyzer": { + "default": { + "type": "custom", + "filter": ["lowercase", "custom_stemmer"], + # Change tokenization to whitespace as the default is very clunky + # with a lot of our IDs that have dashes in them. + # see: https://opensearch.org/docs/latest/analyzers/tokenizers/index/ + "tokenizer": "whitespace", + } + }, + # Change the default stemming to use snowball which handles plural + # queries better than the default + # TODO - there are a lot of stemmers, we should take some time to figure out + # which one works best with our particular dataset. Snowball is really + # basic and naive (literally just adjusting suffixes on words in common patterns) + # which might be fine generally, but we work with a lot of acronyms + # and should verify that doesn't cause any issues. + # see: https://opensearch.org/docs/latest/analyzers/token-filters/index/ + "filter": {"custom_stemmer": {"type": "snowball", "name": "english"}}, +} + class SearchClient: def __init__(self, opensearch_config: OpensearchConfig | None = None) -> None: @@ -17,15 +42,27 @@ def __init__(self, opensearch_config: OpensearchConfig | None = None) -> None: self._client = opensearchpy.OpenSearch(**_get_connection_parameters(opensearch_config)) def create_index( - self, index_name: str, *, shard_count: int = 1, replica_count: int = 1 + self, + index_name: str, + *, + shard_count: int = 1, + replica_count: int = 1, + analysis: dict | None = None ) -> None: """ Create an empty search index """ + + # Allow the user to adjust how the index analyzer + tokenization works + # but also have a general default. 
+ if analysis is None: + analysis = DEFAULT_INDEX_ANALYSIS + body = { "settings": { - "index": {"number_of_shards": shard_count, "number_of_replicas": replica_count} - } + "index": {"number_of_shards": shard_count, "number_of_replicas": replica_count}, + "analysis": analysis, + }, } logger.info("Creating search index %s", index_name, extra={"index_name": index_name}) @@ -104,12 +141,17 @@ def swap_alias_index( for index in existing_indexes: self.delete_index(index) - def search(self, index_name: str, search_query: dict) -> dict: - # TODO - add more when we build out the request/response parsing logic - # we use something like Pydantic to help reorganize the response - # object into something easier to parse. + def search_raw(self, index_name: str, search_query: dict) -> dict: + # Simple wrapper around search if you don't want the request or response + # object handled in any special way. return self._client.search(index=index_name, body=search_query) + def search( + self, index_name: str, search_query: dict, include_scores: bool = True + ) -> SearchResponse: + response = self._client.search(index=index_name, body=search_query) + return SearchResponse.from_opensearch_response(response, include_scores) + def _get_connection_parameters(opensearch_config: OpensearchConfig) -> dict[str, Any]: # TODO - we'll want to add the AWS connection params here when we set that up diff --git a/api/src/adapters/search/opensearch_query_builder.py b/api/src/adapters/search/opensearch_query_builder.py new file mode 100644 index 0000000000..4aa4e07e55 --- /dev/null +++ b/api/src/adapters/search/opensearch_query_builder.py @@ -0,0 +1,217 @@ +import typing + +from src.pagination.pagination_models import SortDirection + + +class SearchQueryBuilder: + """ + Utility to help build queries to OpenSearch + + This helps with making sure everything we want in a search query goes + to the right spot in the large JSON object we're building. Note that + it still requires some understanding of OpenSearch (eg. when to add ".keyword" to a field name) + + For example, if you wanted to build a query against a search index containing + books with the following: + * Page size of 5, page number 1 + * Sorted by relevancy score descending + * Scored on titles containing "king" + * Where the author is one of Brandon Sanderson or J R.R. Tolkien + * Returning aggregate counts of books by those authors in the full results + + This query could either be built manually and look like: + + { + "size": 5, + "from": 0, + "track_scores": true, + "sort": [ + { + "_score": { + "order": "desc" + } + } + ], + "query": { + "bool": { + "must": [ + { + "simple_query_string": { + "query": "king", + "fields": [ + "title.keyword" + ], + "default_operator": "AND" + } + } + ], + "filter": [ + { + "terms": { + "author.keyword": [ + "Brandon Sanderson", + "J R.R. Tolkien" + ] + } + } + ] + } + }, + "aggs": { + "author": { + "terms": { + "field": "author.keyword", + "size": 25, + "min_doc_count": 0 + } + } + } + } + + Or you could use the builder and produce the same result: + + search_query = SearchQueryBuilder() + .pagination(page_size=5, page_number=1) + .sort_by([("relevancy", SortDirection.DESCENDING)]) + .simple_query("king", fields=["title.keyword"]) + .filter_terms("author.keyword", terms=["Brandon Sanderson", "J R.R. 
Tolkien"]) + .aggregation_terms(aggregation_name="author", field_name="author.keyword", minimum_count=0) + .build() + """ + + def __init__(self) -> None: + self.page_size = 25 + self.page_number = 1 + + self.sort_values: list[dict[str, dict[str, str]]] = [] + + self.must: list[dict] = [] + self.filters: list[dict] = [] + + self.aggregations: dict[str, dict] = {} + + def pagination(self, page_size: int, page_number: int) -> typing.Self: + """ + Set the pagination for the search request. + + Note that page number should be the human-readable page number + and start counting from 1. + """ + self.page_size = page_size + self.page_number = page_number + return self + + def sort_by(self, sort_values: list[typing.Tuple[str, SortDirection]]) -> typing.Self: + """ + List of tuples of field name + sort direction to sort by. If you wish to sort by the relevancy + score provide a field name of "relevancy". + + The order of the tuples matters, and the earlier values will take precedence - or put another way + the first tuple is the "primary sort", the second is the "secondary sort", and so on. If + all of the primary sort values are unique, then the secondary sorts won't be relevant. + + If this method is not called, no sort info will be added to the request, and OpenSearch + will internally default to sorting by relevancy score. If there is no scores calculated, + then the order is likely the IDs of the documents in the index. + + Note that multiple calls to this method will erase any info provided in a prior call. + """ + for field, sort_direction in sort_values: + if field == "relevancy": + field = "_score" + + self.sort_values.append({field: {"order": sort_direction.short_form()}}) + + return self + + def simple_query(self, query: str, fields: list[str]) -> typing.Self: + """ + Adds a simple_query_string which queries against the provided fields. + + The fields must include the full path to the object, and can include optional suffixes + to adjust the weighting. For example "opportunity_title^4" would increase any scores + derived from that field by 4x. + + See: https://opensearch.org/docs/latest/query-dsl/full-text/simple-query-string/ + """ + self.must.append( + {"simple_query_string": {"query": query, "fields": fields, "default_operator": "AND"}} + ) + + return self + + def filter_terms(self, field: str, terms: list) -> typing.Self: + """ + For a given field, filter to a set of values. + + These filters do not affect the relevancy score, they are purely + a binary filter on the overall results. + """ + self.filters.append({"terms": {field: terms}}) + return self + + def aggregation_terms( + self, aggregation_name: str, field_name: str, size: int = 25, minimum_count: int = 1 + ) -> typing.Self: + """ + Add a term aggregation to the request. Aggregations are the counts of particular fields in the + full response and are often displayed next to filters in a search UI. + + Size determines how many different values can be returned. + Minimum count determines how many occurrences need to occur to include in the response. + If you pass in 0 for this, then values that don't occur at all in the full result set will be returned. 
+ + see: https://opensearch.org/docs/latest/aggregations/bucket/terms/ + """ + self.aggregations[aggregation_name] = { + "terms": {"field": field_name, "size": size, "min_doc_count": minimum_count} + } + return self + + def build(self) -> dict: + """ + Build the search request + """ + + # Base request + page_offset = self.page_size * (self.page_number - 1) + request: dict[str, typing.Any] = { + "size": self.page_size, + "from": page_offset, + # Always include the scores in the response objects + # even if we're sorting by non-relevancy + "track_scores": True, + } + + # Add sorting if any was provided + if len(self.sort_values) > 0: + request["sort"] = self.sort_values + + # Add a bool query + # + # The "must" block contains anything relevant to scoring + # The "filter" block contains filters that don't affect scoring and act + # as just binary filters + # + # See: https://opensearch.org/docs/latest/query-dsl/compound/bool/ + bool_query = {} + if len(self.must) > 0: + bool_query["must"] = self.must + + if len(self.filters) > 0: + bool_query["filter"] = self.filters + + # Add the query object which wraps the bool query + query_obj = {} + if len(bool_query) > 0: + query_obj["bool"] = bool_query + + if len(query_obj) > 0: + request["query"] = query_obj + + # Add any aggregations + # see: https://opensearch.org/docs/latest/aggregations/ + if len(self.aggregations) > 0: + request["aggs"] = self.aggregations + + return request diff --git a/api/src/adapters/search/opensearch_response.py b/api/src/adapters/search/opensearch_response.py new file mode 100644 index 0000000000..c8bb16cb6d --- /dev/null +++ b/api/src/adapters/search/opensearch_response.py @@ -0,0 +1,125 @@ +import dataclasses +import typing + + +@dataclasses.dataclass +class SearchResponse: + total_records: int + + records: list[dict[str, typing.Any]] + + aggregations: dict[str, dict[str, int]] + + @classmethod + def from_opensearch_response( + cls, raw_json: dict[str, typing.Any], include_scores: bool = True + ) -> typing.Self: + """ + Convert a raw search response into something a bit more manageable + by un-nesting and restructuring a few of the key fields.
+ """ + + """ + The hits object looks like: + { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 22.180708, + "hits": [ + { + "_index": "opportunity-index-2024-05-21_15-49-24", + "_id": "4", + "_score": 22.180708, + "_source": { + "opportunity_id": 4, + "opportunity_number": "ABC123-XYZ", + } + } + ] + } + """ + hits = raw_json.get("hits", {}) + hits_total = hits.get("total", {}) + total_records = hits_total.get("value", 0) + + raw_records: list[dict[str, typing.Any]] = hits.get("hits", []) + + records = [] + for raw_record in raw_records: + record = raw_record.get("_source", {}) + + if include_scores: + score: int | None = raw_record.get("_score", None) + record["relevancy_score"] = score + + records.append(record) + + raw_aggs: dict[str, dict[str, typing.Any]] = raw_json.get("aggregations", {}) + aggregations = _parse_aggregations(raw_aggs) + + return cls(total_records, records, aggregations) + + +def _parse_aggregations( + raw_aggs: dict[str, dict[str, typing.Any]] | None +) -> dict[str, dict[str, int]]: + # Note that this is assuming the response from a terms aggregation + # https://opensearch.org/docs/latest/aggregations/bucket/terms/ + + if raw_aggs is None: + return {} + + """ + Terms aggregations look like: + + "aggregations": { + "applicant_types": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "for_profit_organizations_other_than_small_businesses", + "doc_count": 1 + }, + { + "key": "other", + "doc_count": 1 + }, + { + "key": "state_governments", + "doc_count": 1 + } + ] + }, + "agencies": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "USAID", + "doc_count": 3 + } + ] + } + } + """ + + aggregations: dict[str, dict[str, int]] = {} + for field, raw_agg_value in raw_aggs.items(): + buckets: list[dict[str, typing.Any]] = raw_agg_value.get("buckets", []) + + field_aggregation: dict[str, int] = {} + for bucket in buckets: + key = bucket.get("key") + count = bucket.get("doc_count", 0) + + if key is None: + raise ValueError("Unable to parse aggregation, null key for %s" % field) + + field_aggregation[key] = count + + aggregations[field] = field_aggregation + + return aggregations diff --git a/api/src/pagination/pagination_models.py b/api/src/pagination/pagination_models.py index f3197e7618..e1a93e018a 100644 --- a/api/src/pagination/pagination_models.py +++ b/api/src/pagination/pagination_models.py @@ -11,6 +11,11 @@ class SortDirection(StrEnum): ASCENDING = "ascending" DESCENDING = "descending" + def short_form(self) -> str: + if self == SortDirection.DESCENDING: + return "desc" + return "asc" + class SortingParamsV0(BaseModel): order_by: str diff --git a/api/tests/src/adapters/search/test_opensearch_client.py b/api/tests/src/adapters/search/test_opensearch_client.py index d9ba221943..916c6effd1 100644 --- a/api/tests/src/adapters/search/test_opensearch_client.py +++ b/api/tests/src/adapters/search/test_opensearch_client.py @@ -90,16 +90,14 @@ def test_swap_alias_index(search_client, generic_index): search_client.swap_alias_index(tmp_index, alias_name, delete_prior_indexes=True) # Can search by this alias and get records from the tmp index - resp = search_client.search(alias_name, {}) - resp_records = [record["_source"] for record in resp["hits"]["hits"]] - assert resp_records == tmp_index_records + resp = search_client.search(alias_name, {}, include_scores=False) + assert resp.records == tmp_index_records # Swap the index to the generic one + delete the tmp one 
search_client.swap_alias_index(generic_index, alias_name, delete_prior_indexes=True) - resp = search_client.search(alias_name, {}) - resp_records = [record["_source"] for record in resp["hits"]["hits"]] - assert resp_records == records + resp = search_client.search(alias_name, {}, include_scores=False) + assert resp.records == records # Verify the tmp one was deleted assert search_client._client.indices.exists(tmp_index) is False diff --git a/api/tests/src/adapters/search/test_opensearch_query_builder.py b/api/tests/src/adapters/search/test_opensearch_query_builder.py new file mode 100644 index 0000000000..731c0e6caa --- /dev/null +++ b/api/tests/src/adapters/search/test_opensearch_query_builder.py @@ -0,0 +1,518 @@ +import uuid + +import pytest + +from src.adapters.search.opensearch_query_builder import SearchQueryBuilder +from src.pagination.pagination_models import SortDirection +from tests.conftest import BaseTestClass + +WAY_OF_KINGS = { + "id": 1, + "title": "The Way of Kings", + "author": "Brandon Sanderson", + "in_stock": True, + "page_count": 1007, +} +WORDS_OF_RADIANCE = { + "id": 2, + "title": "Words of Radiance", + "author": "Brandon Sanderson", + "in_stock": False, + "page_count": 1087, +} +OATHBRINGER = { + "id": 3, + "title": "Oathbringer", + "author": "Brandon Sanderson", + "in_stock": True, + "page_count": 1248, +} +RHYTHM_OF_WAR = { + "id": 4, + "title": "Rhythm of War", + "author": "Brandon Sanderson", + "in_stock": False, + "page_count": 1232, +} +GAME_OF_THRONES = { + "id": 5, + "title": "A Game of Thrones", + "author": "George R.R. Martin", + "in_stock": True, + "page_count": 694, +} +CLASH_OF_KINGS = { + "id": 6, + "title": "A Clash of Kings", + "author": "George R.R. Martin", + "in_stock": True, + "page_count": 768, +} +STORM_OF_SWORDS = { + "id": 7, + "title": "A Storm of Swords", + "author": "George R.R. Martin", + "in_stock": True, + "page_count": 973, +} +FEAST_FOR_CROWS = { + "id": 8, + "title": "A Feast for Crows", + "author": "George R.R. Martin", + "in_stock": True, + "page_count": 753, +} +DANCE_WITH_DRAGONS = { + "id": 9, + "title": "A Dance with Dragons", + "author": "George R.R. Martin", + "in_stock": False, + "page_count": 1056, +} +FELLOWSHIP_OF_THE_RING = { + "id": 10, + "title": "The Fellowship of the Ring", + "author": "J R.R. Tolkien", + "in_stock": True, + "page_count": 423, +} +TWO_TOWERS = { + "id": 11, + "title": "The Two Towers", + "author": "J R.R. Tolkien", + "in_stock": True, + "page_count": 352, +} +RETURN_OF_THE_KING = { + "id": 12, + "title": "The Return of the King", + "author": "J R.R. 
Tolkien", + "in_stock": True, + "page_count": 416, +} + +FULL_DATA = [ + WAY_OF_KINGS, + WORDS_OF_RADIANCE, + OATHBRINGER, + RHYTHM_OF_WAR, + GAME_OF_THRONES, + CLASH_OF_KINGS, + STORM_OF_SWORDS, + FEAST_FOR_CROWS, + DANCE_WITH_DRAGONS, + FELLOWSHIP_OF_THE_RING, + TWO_TOWERS, + RETURN_OF_THE_KING, +] + + +def validate_valid_request( + search_client, index, request, expected_results, expected_aggregations=None +): + json_value = request.build() + try: + resp = search_client.search(index, json_value, include_scores=False) + + except Exception: + # If it errors while making the query, catch the exception just to give a message that makes it a bit clearer + pytest.fail( + f"Request generated was invalid and caused an error in search client: {json_value}" + ) + + assert resp.records == expected_results + + if expected_aggregations is not None: + assert resp.aggregations == expected_aggregations + + +class TestOpenSearchQueryBuilder(BaseTestClass): + @pytest.fixture(scope="class") + def search_index(self, search_client): + index_name = f"test-search-index-{uuid.uuid4().int}" + + search_client.create_index(index_name) + + try: + yield index_name + finally: + # Try to clean up the index at the end + search_client.delete_index(index_name) + + @pytest.fixture(scope="class", autouse=True) + def seed_data(self, search_client, search_index): + search_client.bulk_upsert(search_index, FULL_DATA, primary_key_field="id") + + def test_query_builder_empty(self, search_client, search_index): + builder = SearchQueryBuilder() + + assert builder.build() == {"size": 25, "from": 0, "track_scores": True} + + validate_valid_request(search_client, search_index, builder, FULL_DATA) + + @pytest.mark.parametrize( + "page_size,page_number,sort_values,expected_sort,expected_results", + [ + ### ID Sorting + (25, 1, [("id", SortDirection.ASCENDING)], [{"id": {"order": "asc"}}], FULL_DATA), + (3, 1, [("id", SortDirection.ASCENDING)], [{"id": {"order": "asc"}}], FULL_DATA[:3]), + ( + 15, + 1, + [("id", SortDirection.DESCENDING)], + [{"id": {"order": "desc"}}], + FULL_DATA[::-1], + ), + ( + 5, + 2, + [("id", SortDirection.DESCENDING)], + [{"id": {"order": "desc"}}], + FULL_DATA[-6:-11:-1], + ), + (10, 100, [("id", SortDirection.DESCENDING)], [{"id": {"order": "desc"}}], []), + ### Title sorting + ( + 2, + 1, + [("title.keyword", SortDirection.ASCENDING)], + [{"title.keyword": {"order": "asc"}}], + [CLASH_OF_KINGS, DANCE_WITH_DRAGONS], + ), + ( + 3, + 4, + [("title.keyword", SortDirection.DESCENDING)], + [{"title.keyword": {"order": "desc"}}], + [FEAST_FOR_CROWS, DANCE_WITH_DRAGONS, CLASH_OF_KINGS], + ), + ( + 10, + 2, + [("title.keyword", SortDirection.ASCENDING)], + [{"title.keyword": {"order": "asc"}}], + [WAY_OF_KINGS, WORDS_OF_RADIANCE], + ), + ### Page Count + ( + 3, + 1, + [("page_count", SortDirection.ASCENDING)], + [{"page_count": {"order": "asc"}}], + [TWO_TOWERS, RETURN_OF_THE_KING, FELLOWSHIP_OF_THE_RING], + ), + ( + 4, + 2, + [("page_count", SortDirection.DESCENDING)], + [{"page_count": {"order": "desc"}}], + [WAY_OF_KINGS, STORM_OF_SWORDS, CLASH_OF_KINGS, FEAST_FOR_CROWS], + ), + ### Multi-sorts + # Author ascending (Primary) + Page count descending (Secondary) + ( + 5, + 1, + [ + ("author.keyword", SortDirection.ASCENDING), + ("page_count", SortDirection.DESCENDING), + ], + [{"author.keyword": {"order": "asc"}}, {"page_count": {"order": "desc"}}], + [OATHBRINGER, RHYTHM_OF_WAR, WORDS_OF_RADIANCE, WAY_OF_KINGS, DANCE_WITH_DRAGONS], + ), + # Author descending (Primary) + ID descending (Secondary) + ( + 4, + 1, + 
[("author.keyword", SortDirection.DESCENDING), ("id", SortDirection.DESCENDING)], + [{"author.keyword": {"order": "desc"}}, {"id": {"order": "desc"}}], + [RETURN_OF_THE_KING, TWO_TOWERS, FELLOWSHIP_OF_THE_RING, DANCE_WITH_DRAGONS], + ), + ], + ) + def test_query_builder_pagination_and_sorting( + self, + search_client, + search_index, + page_size, + page_number, + sort_values, + expected_sort, + expected_results, + ): + builder = ( + SearchQueryBuilder() + .pagination(page_size=page_size, page_number=page_number) + .sort_by(sort_values) + ) + + assert builder.build() == { + "size": page_size, + "from": page_size * (page_number - 1), + "track_scores": True, + "sort": expected_sort, + } + + validate_valid_request(search_client, search_index, builder, expected_results) + + # Note that by having parametrize twice, it will run every one of the specific tests with the different + # sort by parameter to show that they behave the same + @pytest.mark.parametrize("sort_by", [[], [("relevancy", SortDirection.DESCENDING)]]) + @pytest.mark.parametrize( + "filters,expected_results", + [ + ### Author + ( + [("author.keyword", ["Brandon Sanderson"])], + [WAY_OF_KINGS, WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR], + ), + ( + [("author.keyword", ["George R.R. Martin", "Mark Twain"])], + [ + GAME_OF_THRONES, + CLASH_OF_KINGS, + STORM_OF_SWORDS, + FEAST_FOR_CROWS, + DANCE_WITH_DRAGONS, + ], + ), + ( + [("author.keyword", ["J R.R. Tolkien"])], + [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING], + ), + ( + [("author.keyword", ["Brandon Sanderson", "J R.R. Tolkien"])], + [ + WAY_OF_KINGS, + WORDS_OF_RADIANCE, + OATHBRINGER, + RHYTHM_OF_WAR, + FELLOWSHIP_OF_THE_RING, + TWO_TOWERS, + RETURN_OF_THE_KING, + ], + ), + ( + [("author.keyword", ["Brandon Sanderson", "George R.R. Martin", "J R.R. Tolkien"])], + FULL_DATA, + ), + ([("author.keyword", ["Mark Twain"])], []), + ### in stock + ([("in_stock", [False])], [WORDS_OF_RADIANCE, RHYTHM_OF_WAR, DANCE_WITH_DRAGONS]), + ([("in_stock", [True, False])], FULL_DATA), + ### page count + ([("page_count", [1007, 694, 352])], [WAY_OF_KINGS, GAME_OF_THRONES, TWO_TOWERS]), + ([("page_count", [1, 2, 3])], []), + ### Multi-filter + # Author + In Stock + ( + [("author.keyword", ["Brandon Sanderson"]), ("in_stock", [True])], + [WAY_OF_KINGS, OATHBRINGER], + ), + ( + [ + ("author.keyword", ["George R.R. Martin", "J R.R. Tolkien"]), + ("in_stock", [False]), + ], + [DANCE_WITH_DRAGONS], + ), + # Author + Title + ( + [ + ("author.keyword", ["Brandon Sanderson", "J R.R. Tolkien", "Mark Twain"]), + ( + "title.keyword", + ["A Game of Thrones", "The Way of Kings", "The Fellowship of the Ring"], + ), + ], + [WAY_OF_KINGS, FELLOWSHIP_OF_THE_RING], + ), + ( + [ + ("author.keyword", ["George R.R. Martin", "J R.R. 
Tolkien"]), + ( + "title.keyword", + ["A Game of Thrones", "The Way of Kings", "The Fellowship of the Ring"], + ), + ], + [GAME_OF_THRONES, FELLOWSHIP_OF_THE_RING], + ), + ], + ) + def test_query_builder_filter_terms( + self, search_client, search_index, filters, expected_results, sort_by + ): + builder = SearchQueryBuilder().sort_by(sort_by) + + expected_terms = [] + for filter in filters: + builder.filter_terms(filter[0], filter[1]) + + expected_terms.append({"terms": {filter[0]: filter[1]}}) + + expected_query = { + "size": 25, + "from": 0, + "track_scores": True, + "query": {"bool": {"filter": expected_terms}}, + } + + if len(sort_by) > 0: + expected_query["sort"] = [{"_score": {"order": "desc"}}] + + assert builder.build() == expected_query + + validate_valid_request(search_client, search_index, builder, expected_results) + + @pytest.mark.parametrize( + "query,fields,expected_results,expected_aggregations", + [ + ( + "king", + ["title"], + [WAY_OF_KINGS, CLASH_OF_KINGS, RETURN_OF_THE_KING], + { + "author": { + "Brandon Sanderson": 1, + "George R.R. Martin": 1, + "J R.R. Tolkien": 1, + }, + "in_stock": {0: 0, 1: 3}, + }, + ), + ( + "R.R.", + ["author"], + [ + GAME_OF_THRONES, + CLASH_OF_KINGS, + STORM_OF_SWORDS, + FEAST_FOR_CROWS, + DANCE_WITH_DRAGONS, + FELLOWSHIP_OF_THE_RING, + TWO_TOWERS, + RETURN_OF_THE_KING, + ], + { + "author": { + "Brandon Sanderson": 0, + "George R.R. Martin": 5, + "J R.R. Tolkien": 3, + }, + "in_stock": {0: 1, 1: 7}, + }, + ), + ( + "Martin (Crows | Storm)", + ["title", "author"], + [STORM_OF_SWORDS, FEAST_FOR_CROWS], + { + "author": { + "Brandon Sanderson": 0, + "George R.R. Martin": 2, + "J R.R. Tolkien": 0, + }, + "in_stock": {0: 0, 1: 2}, + }, + ), + ( + "(Sanderson + (Words | King)) | Tolkien | Crow", + ["title", "author"], + [ + WAY_OF_KINGS, + WORDS_OF_RADIANCE, + FEAST_FOR_CROWS, + FELLOWSHIP_OF_THE_RING, + TWO_TOWERS, + RETURN_OF_THE_KING, + ], + { + "author": { + "Brandon Sanderson": 2, + "George R.R. Martin": 1, + "J R.R. Tolkien": 3, + }, + "in_stock": {0: 1, 1: 5}, + }, + ), + ( + "-R.R. -Oathbringer", + ["title", "author"], + [WAY_OF_KINGS, WORDS_OF_RADIANCE, RHYTHM_OF_WAR], + { + "author": { + "Brandon Sanderson": 3, + "George R.R. Martin": 0, + "J R.R. Tolkien": 0, + }, + "in_stock": {0: 2, 1: 1}, + }, + ), + ( + "Brandon | George | J", + ["title", "author"], + FULL_DATA, + { + "author": { + "Brandon Sanderson": 4, + "George R.R. Martin": 5, + "J R.R. Tolkien": 3, + }, + "in_stock": {0: 3, 1: 9}, + }, + ), + ( + "how to make a pizza", + ["title", "author"], + [], + { + "author": { + "Brandon Sanderson": 0, + "George R.R. Martin": 0, + "J R.R. 
Tolkien": 0, + }, + "in_stock": {0: 0, 1: 0}, + }, + ), + ], + ) + def test_query_builder_simple_query_and_aggregations( + self, search_client, search_index, query, fields, expected_results, expected_aggregations + ): + # Add a sort by ID ascending to make it so any relevancy from this is ignored, just testing that values returned + builder = SearchQueryBuilder().sort_by([("id", SortDirection.ASCENDING)]) + + builder.simple_query(query, fields) + + # Statically add the same aggregated fields every time + builder.aggregation_terms("author", "author.keyword", minimum_count=0).aggregation_terms( + "in_stock", "in_stock", minimum_count=0 + ) + + assert builder.build() == { + "size": 25, + "from": 0, + "track_scores": True, + "query": { + "bool": { + "must": [ + { + "simple_query_string": { + "query": query, + "fields": fields, + "default_operator": "AND", + } + } + ] + } + }, + "sort": [{"id": {"order": "asc"}}], + "aggs": { + "author": {"terms": {"field": "author.keyword", "size": 25, "min_doc_count": 0}}, + "in_stock": {"terms": {"field": "in_stock", "size": 25, "min_doc_count": 0}}, + }, + } + + validate_valid_request( + search_client, search_index, builder, expected_results, expected_aggregations + ) diff --git a/api/tests/src/search/backend/test_load_opportunities_to_index.py b/api/tests/src/search/backend/test_load_opportunities_to_index.py index a079b83c8a..e939f569fd 100644 --- a/api/tests/src/search/backend/test_load_opportunities_to_index.py +++ b/api/tests/src/search/backend/test_load_opportunities_to_index.py @@ -52,12 +52,10 @@ def test_load_opportunities_to_index( # Just do some rough validation that the data is present resp = search_client.search(opportunity_index_alias, {"size": 100}) - total_records = resp["hits"]["total"]["value"] - assert total_records == len(opportunities) + assert resp.total_records == len(opportunities) - records = [record["_source"] for record in resp["hits"]["hits"]] assert set([opp.opportunity_id for opp in opportunities]) == set( - [record["opportunity_id"] for record in records] + [record["opportunity_id"] for record in resp.records] ) # Rerunning without changing anything about the data in the DB doesn't meaningfully change anything @@ -65,12 +63,10 @@ def test_load_opportunities_to_index( load_opportunities_to_index.run() resp = search_client.search(opportunity_index_alias, {"size": 100}) - total_records = resp["hits"]["total"]["value"] - assert total_records == len(opportunities) + assert resp.total_records == len(opportunities) - records = [record["_source"] for record in resp["hits"]["hits"]] assert set([opp.opportunity_id for opp in opportunities]) == set( - [record["opportunity_id"] for record in records] + [record["opportunity_id"] for record in resp.records] ) # Rerunning but first add a few more opportunities to show up @@ -82,10 +78,8 @@ def test_load_opportunities_to_index( resp = search_client.search(opportunity_index_alias, {"size": 100}) - total_records = resp["hits"]["total"]["value"] - assert total_records == len(opportunities) + assert resp.total_records == len(opportunities) - records = [record["_source"] for record in resp["hits"]["hits"]] assert set([opp.opportunity_id for opp in opportunities]) == set( - [record["opportunity_id"] for record in records] + [record["opportunity_id"] for record in resp.records] )