Skip to content

Commit

Permalink
Merge pull request #832 from aaxelb/fix/valuesearch-trailing-slashes
Browse files Browse the repository at this point in the history
[ENG-6576] fix: duplicate value-search values
  • Loading branch information
aaxelb authored Nov 21, 2024
2 parents 1db8e75 + 63740ca commit 39df273
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 48 deletions.
32 changes: 15 additions & 17 deletions share/search/index_strategy/trovesearch_denorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy):
CURRENT_STRATEGY_CHECKSUM = ChecksumIri(
checksumalgorithm_name='sha-256',
salt='TrovesearchDenormIndexStrategy',
hexdigest='e538bbc5966a6a289da9e5ba51ecde5ff29528bf07e940716ef8a888d6601916',
hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8',
)

# abstract method from IndexStrategy
Expand All @@ -73,7 +73,10 @@ def backfill_message_type(self):

# abstract method from Elastic8IndexStrategy
def index_settings(self):
return {}
return {
'number_of_shards': 5,
'number_of_replicas': 2,
}

# abstract method from Elastic8IndexStrategy
def index_mappings(self):
Expand Down Expand Up @@ -127,7 +130,6 @@ def _card_mappings(self):

def _iri_value_mappings(self):
return {
'value_iri': ts.KEYWORD_MAPPING,
'value_name': ts.KEYWORD_MAPPING,
'value_title': ts.KEYWORD_MAPPING,
'value_label': ts.KEYWORD_MAPPING,
Expand All @@ -137,7 +139,8 @@ def _iri_value_mappings(self):

def _paths_and_values_mappings(self):
return {
'focus_iri': ts.IRI_KEYWORD_MAPPING,
'single_focus_iri': ts.KEYWORD_MAPPING,
'focus_iri_synonyms': ts.KEYWORD_MAPPING,
'propertypaths_present': ts.KEYWORD_MAPPING,
# flattened properties (dynamic sub-properties with keyword values)
'iri_by_propertypath': ts.FLATTENED_MAPPING,
Expand Down Expand Up @@ -311,21 +314,23 @@ def _card_subdoc(self) -> dict:
def _iri_value_subdoc(self, iri: str) -> dict:
_shortwalk = self._fullwalk.shortwalk_from(iri)
return {
'value_iri': iri,
'value_iris': self._exact_and_suffuniq_iris(iri),
**self._paths_and_values(_shortwalk),
'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)),
'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)),
'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)),
'at_card_propertypaths': [
ts.propertypath_as_keyword(_path)
for _path in self._fullwalk.paths_by_iri[iri]
],
**self._paths_and_values(_shortwalk),
}

def _paths_and_values(self, walk: ts.GraphWalk):
return {
'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri),
'single_focus_iri': walk.focus_iri.rstrip('/'), # remove trailing slash, but keep the scheme
'focus_iri_synonyms': ts.suffuniq_iris(ts.iri_synonyms(
walk.focus_iri,
self.rdfdoc,
)),
'propertypaths_present': self._propertypaths_present(walk),
'iri_by_propertypath': self._iris_by_propertypath(walk),
'iri_by_depth': self._iris_by_depth(walk),
Expand Down Expand Up @@ -391,13 +396,6 @@ def _ints_by_propertypath(self, walk: ts.GraphWalk):
for _path, _value_set in walk.integer_values.items()
}

def _exact_and_suffuniq_iris(self, iri: str):
_synonyms = ts.iri_synonyms(iri, self.rdfdoc)
return {
'exact': list(_synonyms),
'suffuniq': ts.suffuniq_iris(_synonyms),
}

###
# normalizing search responses

Expand Down Expand Up @@ -598,7 +596,7 @@ def _iri_filter(self, search_filter) -> dict:

def _path_iri_query(self, path, suffuniq_iris):
if path == (OWL.sameAs,):
_field = f'{self.base_field}.focus_iri.suffuniq'
_field = f'{self.base_field}.focus_iri_synonyms'
elif is_globpath(path):
_field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}'
else:
Expand Down Expand Up @@ -801,7 +799,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d
'aggs': {
'agg_valuesearch_iris': {
'terms': {
'field': 'iri_value.value_iri',
'field': 'iri_value.single_focus_iri',
# WARNING: terribly hacky pagination (part one)
'size': cursor.start_offset + cursor.page_size + 1,
},
Expand Down
104 changes: 73 additions & 31 deletions tests/share/search/index_strategy/_common_trovesearch_tests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Iterable, Iterator
import dataclasses
from datetime import date, timedelta
import itertools
import math
from urllib.parse import urlencode

Expand Down Expand Up @@ -56,22 +55,14 @@ def test_for_smoke_with_daemon(self):

def test_cardsearch(self):
self._fill_test_data_for_querying()
_cardsearch_cases = itertools.chain(
self.cardsearch_cases(),
self.cardsearch_integer_cases(),
)
for _queryparams, _expected_focus_iris in _cardsearch_cases:
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
self._assert_cardsearch_iris(_queryparams, _expected_focus_iris)

def test_cardsearch_after_deletion(self):
_cards = self._fill_test_data_for_querying()
_deleted_focus_iris = {BLARG.b}
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
_cardsearch_cases = itertools.chain(
self.cardsearch_cases(),
self.cardsearch_integer_cases(),
)
for _queryparams, _expected_focus_iris in _cardsearch_cases:
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
if isinstance(_expected_focus_iris, set):
_expected_focus_iris -= _deleted_focus_iris
else:
Expand Down Expand Up @@ -173,17 +164,17 @@ def test_valuesearch(self):

def test_valuesearch_after_deletion(self):
_cards = self._fill_test_data_for_querying()
_deleted_focus_iris = {BLARG.b}
_deleted_focus_iris = {BLARG.c}
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
_cases = [
(
{'valueSearchPropertyPath': 'subject'},
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c}, # BLARG.subj_b no longer present
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_b}, # BLARG.subj_c no longer present
), (
{'valueSearchPropertyPath': 'dateCreated'},
{'1999', '2024'}, # 2012 no longer present
{'1999', '2012'}, # 2024 no longer present
), (
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'bbbb'},
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'cccc'},
set(), # none
)
]
Expand Down Expand Up @@ -296,7 +287,11 @@ def _fill_test_data_for_querying(self):
DCTERMS.created: {rdf.literal(date(2024, 12, 31))},
DCTERMS.creator: {BLARG.someone_else},
DCTERMS.title: {rdf.literal('cccc')},
DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c},
DCTERMS.subject: {
BLARG['subj_ac/'], # this one has an extra trailing slash
BLARG.subj_bc,
BLARG.subj_c,
},
DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')},
},
BLARG.someone_else: {
Expand Down Expand Up @@ -401,14 +396,6 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
{'cardSearchFilter[dcterms:replaces][is-absent]': ''},
set(),
)
yield (
{'cardSearchFilter[subject]': BLARG.subj_ac},
{BLARG.c, BLARG.a},
)
yield (
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
{BLARG.b},
)
yield (
{
'cardSearchFilter[subject]': BLARG.subj_bc,
Expand Down Expand Up @@ -510,6 +497,8 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
{'cardSearchText': '"what is here"'},
{BLARG.b},
)
yield from self.cardsearch_trailingslash_cases()
yield from self.cardsearch_integer_cases()

def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
# cases that depend on integer values getting indexed
Expand All @@ -522,6 +511,17 @@ def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] |
[BLARG.c, BLARG.a, BLARG.b], # ordered list
)

def cardsearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
# cases that depend on trailing-slash ignorance
yield (
{'cardSearchFilter[subject]': BLARG.subj_ac},
{BLARG.c, BLARG.a},
)
yield (
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
{BLARG.b},
)

def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
yield (
{'valueSearchPropertyPath': 'references'},
Expand All @@ -531,6 +531,26 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
{'valueSearchPropertyPath': 'dateCreated'},
{'1999', '2012', '2024'},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchFilter[resourceType]': BLARG.Thing,
},
{BLARG.b, BLARG.c},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchText': 'bbbb',
},
{BLARG.b},
)
yield from self.valuesearch_trailingslash_cases()
yield from self.valuesearch_sameas_cases()
# TODO: more

def valuesearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
# cases that depend on trailing-slash ignorance
yield (
{'valueSearchPropertyPath': 'subject'},
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_b, BLARG.subj_bc, BLARG.subj_c},
Expand All @@ -547,21 +567,43 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'aaaa'},
{BLARG.subj_ac, BLARG.subj_a},
)

def valuesearch_sameas_cases(self):
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchFilter[resourceType]': BLARG.Thing,
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.a,
},
{BLARG.b, BLARG.c},
{BLARG.subj_ac, BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchText': 'bbbb',
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.a_same,
},
{BLARG.b},
{BLARG.subj_ac, BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.b_same,
},
{BLARG.subj_b, BLARG.subj_bc},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'valueSearchFilter[sameAs]': BLARG.subj_a,
},
{BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[subject]': BLARG.subj_ac,
},
{BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc},
)
# TODO: more

def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]):
_messages_chunk = messages.MessagesChunk(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,12 @@ def get_index_strategy(self):

def cardsearch_integer_cases(self):
yield from () # integers not indexed by this strategy

def cardsearch_trailingslash_cases(self):
yield from () # trailing-slash handling improved in trovesearch_denorm

def valuesearch_sameas_cases(self):
yield from () # sameas handling improved in trovesearch_denorm

def valuesearch_trailingslash_cases(self):
yield from () # trailing-slash handling improved in trovesearch_denorm

0 comments on commit 39df273

Please sign in to comment.