From cc2e45328cec3d510d718da9d21d7f1b9c6d5884 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 18 Nov 2024 16:01:19 -0500 Subject: [PATCH 1/2] fix: drop trailing slashes for value-search update denorm mappings: single iri for each value, better synonyms update tests with fixes while leaving old _flats strategy as-is --- .../index_strategy/trovesearch_denorm.py | 27 ++--- .../_common_trovesearch_tests.py | 104 ++++++++++++------ .../test_trove_indexcard_flats.py | 9 ++ 3 files changed, 93 insertions(+), 47 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 4b98e8ee7..7b8c38c61 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -55,7 +55,7 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): CURRENT_STRATEGY_CHECKSUM = ChecksumIri( checksumalgorithm_name='sha-256', salt='TrovesearchDenormIndexStrategy', - hexdigest='e538bbc5966a6a289da9e5ba51ecde5ff29528bf07e940716ef8a888d6601916', + hexdigest='05fde844940b927edaab6d7096d2d845273fde10d3cb5ff1e2862c1161318bbe', ) # abstract method from IndexStrategy @@ -127,7 +127,6 @@ def _card_mappings(self): def _iri_value_mappings(self): return { - 'value_iri': ts.KEYWORD_MAPPING, 'value_name': ts.KEYWORD_MAPPING, 'value_title': ts.KEYWORD_MAPPING, 'value_label': ts.KEYWORD_MAPPING, @@ -137,7 +136,8 @@ def _iri_value_mappings(self): def _paths_and_values_mappings(self): return { - 'focus_iri': ts.IRI_KEYWORD_MAPPING, + 'single_focus_iri': ts.KEYWORD_MAPPING, + 'focus_iri_synonyms': ts.KEYWORD_MAPPING, 'propertypaths_present': ts.KEYWORD_MAPPING, # flattened properties (dynamic sub-properties with keyword values) 'iri_by_propertypath': ts.FLATTENED_MAPPING, @@ -311,8 +311,7 @@ def _card_subdoc(self) -> dict: def _iri_value_subdoc(self, iri: str) -> dict: _shortwalk = self._fullwalk.shortwalk_from(iri) return { - 'value_iri': iri, - 'value_iris': self._exact_and_suffuniq_iris(iri), + **self._paths_and_values(_shortwalk), 'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)), 'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)), 'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)), @@ -320,12 +319,15 @@ def _iri_value_subdoc(self, iri: str) -> dict: ts.propertypath_as_keyword(_path) for _path in self._fullwalk.paths_by_iri[iri] ], - **self._paths_and_values(_shortwalk), } def _paths_and_values(self, walk: ts.GraphWalk): return { - 'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri), + 'single_focus_iri': walk.focus_iri.rstrip('/'), # remove trailing slash, but keep the scheme + 'focus_iri_synonyms': ts.suffuniq_iris(ts.iri_synonyms( + walk.focus_iri, + self.rdfdoc, + )), 'propertypaths_present': self._propertypaths_present(walk), 'iri_by_propertypath': self._iris_by_propertypath(walk), 'iri_by_depth': self._iris_by_depth(walk), @@ -391,13 +393,6 @@ def _ints_by_propertypath(self, walk: ts.GraphWalk): for _path, _value_set in walk.integer_values.items() } - def _exact_and_suffuniq_iris(self, iri: str): - _synonyms = ts.iri_synonyms(iri, self.rdfdoc) - return { - 'exact': list(_synonyms), - 'suffuniq': ts.suffuniq_iris(_synonyms), - } - ### # normalizing search responses @@ -598,7 +593,7 @@ def _iri_filter(self, search_filter) -> dict: def _path_iri_query(self, path, suffuniq_iris): if path == (OWL.sameAs,): - _field = f'{self.base_field}.focus_iri.suffuniq' + _field = f'{self.base_field}.focus_iri_synonyms' elif is_globpath(path): _field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}' else: @@ -801,7 +796,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d 'aggs': { 'agg_valuesearch_iris': { 'terms': { - 'field': 'iri_value.value_iri', + 'field': 'iri_value.single_focus_iri', # WARNING: terribly hacky pagination (part one) 'size': cursor.start_offset + cursor.page_size + 1, }, diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index db265322b..53de73720 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -1,7 +1,6 @@ from typing import Iterable, Iterator import dataclasses from datetime import date, timedelta -import itertools import math from urllib.parse import urlencode @@ -56,22 +55,14 @@ def test_for_smoke_with_daemon(self): def test_cardsearch(self): self._fill_test_data_for_querying() - _cardsearch_cases = itertools.chain( - self.cardsearch_cases(), - self.cardsearch_integer_cases(), - ) - for _queryparams, _expected_focus_iris in _cardsearch_cases: + for _queryparams, _expected_focus_iris in self.cardsearch_cases(): self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) def test_cardsearch_after_deletion(self): _cards = self._fill_test_data_for_querying() _deleted_focus_iris = {BLARG.b} self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) - _cardsearch_cases = itertools.chain( - self.cardsearch_cases(), - self.cardsearch_integer_cases(), - ) - for _queryparams, _expected_focus_iris in _cardsearch_cases: + for _queryparams, _expected_focus_iris in self.cardsearch_cases(): if isinstance(_expected_focus_iris, set): _expected_focus_iris -= _deleted_focus_iris else: @@ -173,17 +164,17 @@ def test_valuesearch(self): def test_valuesearch_after_deletion(self): _cards = self._fill_test_data_for_querying() - _deleted_focus_iris = {BLARG.b} + _deleted_focus_iris = {BLARG.c} self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) _cases = [ ( {'valueSearchPropertyPath': 'subject'}, - {BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c}, # BLARG.subj_b no longer present + {BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_b}, # BLARG.subj_c no longer present ), ( {'valueSearchPropertyPath': 'dateCreated'}, - {'1999', '2024'}, # 2012 no longer present + {'1999', '2012'}, # 2024 no longer present ), ( - {'valueSearchPropertyPath': 'subject', 'cardSearchText': 'bbbb'}, + {'valueSearchPropertyPath': 'subject', 'cardSearchText': 'cccc'}, set(), # none ) ] @@ -296,7 +287,11 @@ def _fill_test_data_for_querying(self): DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, DCTERMS.creator: {BLARG.someone_else}, DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c}, + DCTERMS.subject: { + BLARG['subj_ac/'], # this one has an extra trailing slash + BLARG.subj_bc, + BLARG.subj_c, + }, DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, }, BLARG.someone_else: { @@ -401,14 +396,6 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str {'cardSearchFilter[dcterms:replaces][is-absent]': ''}, set(), ) - yield ( - {'cardSearchFilter[subject]': BLARG.subj_ac}, - {BLARG.c, BLARG.a}, - ) - yield ( - {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, - {BLARG.b}, - ) yield ( { 'cardSearchFilter[subject]': BLARG.subj_bc, @@ -510,6 +497,8 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str {'cardSearchText': '"what is here"'}, {BLARG.b}, ) + yield from self.cardsearch_trailingslash_cases() + yield from self.cardsearch_integer_cases() def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: # cases that depend on integer values getting indexed @@ -522,6 +511,17 @@ def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | [BLARG.c, BLARG.a, BLARG.b], # ordered list ) + def cardsearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + # cases that depend on trailing-slash ignorance + yield ( + {'cardSearchFilter[subject]': BLARG.subj_ac}, + {BLARG.c, BLARG.a}, + ) + yield ( + {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, + {BLARG.b}, + ) + def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: yield ( {'valueSearchPropertyPath': 'references'}, @@ -531,6 +531,26 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: {'valueSearchPropertyPath': 'dateCreated'}, {'1999', '2012', '2024'}, ) + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchFilter[resourceType]': BLARG.Thing, + }, + {BLARG.b, BLARG.c}, + ) + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchText': 'bbbb', + }, + {BLARG.b}, + ) + yield from self.valuesearch_trailingslash_cases() + yield from self.valuesearch_sameas_cases() + # TODO: more + + def valuesearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + # cases that depend on trailing-slash ignorance yield ( {'valueSearchPropertyPath': 'subject'}, {BLARG.subj_a, BLARG.subj_ac, BLARG.subj_b, BLARG.subj_bc, BLARG.subj_c}, @@ -547,21 +567,43 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: {'valueSearchPropertyPath': 'subject', 'cardSearchText': 'aaaa'}, {BLARG.subj_ac, BLARG.subj_a}, ) + + def valuesearch_sameas_cases(self): yield ( { - 'valueSearchPropertyPath': 'references', - 'valueSearchFilter[resourceType]': BLARG.Thing, + 'valueSearchPropertyPath': 'subject', + 'cardSearchFilter[sameAs]': BLARG.a, }, - {BLARG.b, BLARG.c}, + {BLARG.subj_ac, BLARG.subj_a}, ) yield ( { - 'valueSearchPropertyPath': 'references', - 'valueSearchText': 'bbbb', + 'valueSearchPropertyPath': 'subject', + 'cardSearchFilter[sameAs]': BLARG.a_same, }, - {BLARG.b}, + {BLARG.subj_ac, BLARG.subj_a}, + ) + yield ( + { + 'valueSearchPropertyPath': 'subject', + 'cardSearchFilter[sameAs]': BLARG.b_same, + }, + {BLARG.subj_b, BLARG.subj_bc}, + ) + yield ( + { + 'valueSearchPropertyPath': 'subject', + 'valueSearchFilter[sameAs]': BLARG.subj_a, + }, + {BLARG.subj_a}, + ) + yield ( + { + 'valueSearchPropertyPath': 'subject', + 'cardSearchFilter[subject]': BLARG.subj_ac, + }, + {BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc}, ) - # TODO: more def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): _messages_chunk = messages.MessagesChunk( diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py index 2cccf2d41..0718ad346 100644 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ b/tests/share/search/index_strategy/test_trove_indexcard_flats.py @@ -10,3 +10,12 @@ def get_index_strategy(self): def cardsearch_integer_cases(self): yield from () # integers not indexed by this strategy + + def cardsearch_trailingslash_cases(self): + yield from () # trailing-slash handling improved in trovesearch_denorm + + def valuesearch_sameas_cases(self): + yield from () # sameas handling improved in trovesearch_denorm + + def valuesearch_trailingslash_cases(self): + yield from () # trailing-slash handling improved in trovesearch_denorm From 63740cac8f3ea5f33530caf0b6ac78ec7901a3df Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 21 Nov 2024 13:23:44 -0500 Subject: [PATCH 2/2] [ENG-6586] shard trovesearch_denorm --- share/search/index_strategy/trovesearch_denorm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 7b8c38c61..8bbbbc7c5 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -55,7 +55,7 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): CURRENT_STRATEGY_CHECKSUM = ChecksumIri( checksumalgorithm_name='sha-256', salt='TrovesearchDenormIndexStrategy', - hexdigest='05fde844940b927edaab6d7096d2d845273fde10d3cb5ff1e2862c1161318bbe', + hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', ) # abstract method from IndexStrategy @@ -73,7 +73,10 @@ def backfill_message_type(self): # abstract method from Elastic8IndexStrategy def index_settings(self): - return {} + return { + 'number_of_shards': 5, + 'number_of_replicas': 2, + } # abstract method from Elastic8IndexStrategy def index_mappings(self):