Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6576][ENG-6586] fix: duplicate value-search values #832

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 15 additions & 17 deletions share/search/index_strategy/trovesearch_denorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy):
CURRENT_STRATEGY_CHECKSUM = ChecksumIri(
checksumalgorithm_name='sha-256',
salt='TrovesearchDenormIndexStrategy',
hexdigest='e538bbc5966a6a289da9e5ba51ecde5ff29528bf07e940716ef8a888d6601916',
hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8',
)

# abstract method from IndexStrategy
Expand All @@ -73,7 +73,10 @@ def backfill_message_type(self):

# abstract method from Elastic8IndexStrategy
def index_settings(self):
return {}
return {
'number_of_shards': 5,
'number_of_replicas': 2,
}

# abstract method from Elastic8IndexStrategy
def index_mappings(self):
Expand Down Expand Up @@ -127,7 +130,6 @@ def _card_mappings(self):

def _iri_value_mappings(self):
return {
'value_iri': ts.KEYWORD_MAPPING,
'value_name': ts.KEYWORD_MAPPING,
'value_title': ts.KEYWORD_MAPPING,
'value_label': ts.KEYWORD_MAPPING,
Expand All @@ -137,7 +139,8 @@ def _iri_value_mappings(self):

def _paths_and_values_mappings(self):
return {
'focus_iri': ts.IRI_KEYWORD_MAPPING,
'single_focus_iri': ts.KEYWORD_MAPPING,
'focus_iri_synonyms': ts.KEYWORD_MAPPING,
'propertypaths_present': ts.KEYWORD_MAPPING,
# flattened properties (dynamic sub-properties with keyword values)
'iri_by_propertypath': ts.FLATTENED_MAPPING,
Expand Down Expand Up @@ -311,21 +314,23 @@ def _card_subdoc(self) -> dict:
def _iri_value_subdoc(self, iri: str) -> dict:
_shortwalk = self._fullwalk.shortwalk_from(iri)
return {
'value_iri': iri,
'value_iris': self._exact_and_suffuniq_iris(iri),
**self._paths_and_values(_shortwalk),
'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)),
'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)),
'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)),
'at_card_propertypaths': [
ts.propertypath_as_keyword(_path)
for _path in self._fullwalk.paths_by_iri[iri]
],
**self._paths_and_values(_shortwalk),
}

def _paths_and_values(self, walk: ts.GraphWalk):
return {
'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri),
'single_focus_iri': walk.focus_iri.rstrip('/'), # remove trailing slash, but keep the scheme
'focus_iri_synonyms': ts.suffuniq_iris(ts.iri_synonyms(
walk.focus_iri,
self.rdfdoc,
)),
'propertypaths_present': self._propertypaths_present(walk),
'iri_by_propertypath': self._iris_by_propertypath(walk),
'iri_by_depth': self._iris_by_depth(walk),
Expand Down Expand Up @@ -391,13 +396,6 @@ def _ints_by_propertypath(self, walk: ts.GraphWalk):
for _path, _value_set in walk.integer_values.items()
}

def _exact_and_suffuniq_iris(self, iri: str):
_synonyms = ts.iri_synonyms(iri, self.rdfdoc)
return {
'exact': list(_synonyms),
'suffuniq': ts.suffuniq_iris(_synonyms),
}

###
# normalizing search responses

Expand Down Expand Up @@ -598,7 +596,7 @@ def _iri_filter(self, search_filter) -> dict:

def _path_iri_query(self, path, suffuniq_iris):
if path == (OWL.sameAs,):
_field = f'{self.base_field}.focus_iri.suffuniq'
_field = f'{self.base_field}.focus_iri_synonyms'
elif is_globpath(path):
_field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}'
else:
Expand Down Expand Up @@ -801,7 +799,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d
'aggs': {
'agg_valuesearch_iris': {
'terms': {
'field': 'iri_value.value_iri',
'field': 'iri_value.single_focus_iri',
# WARNING: terribly hacky pagination (part one)
'size': cursor.start_offset + cursor.page_size + 1,
},
Expand Down
104 changes: 73 additions & 31 deletions tests/share/search/index_strategy/_common_trovesearch_tests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Iterable, Iterator
import dataclasses
from datetime import date, timedelta
import itertools
import math
from urllib.parse import urlencode

Expand Down Expand Up @@ -56,22 +55,14 @@ def test_for_smoke_with_daemon(self):

def test_cardsearch(self):
self._fill_test_data_for_querying()
_cardsearch_cases = itertools.chain(
self.cardsearch_cases(),
self.cardsearch_integer_cases(),
)
for _queryparams, _expected_focus_iris in _cardsearch_cases:
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
self._assert_cardsearch_iris(_queryparams, _expected_focus_iris)

def test_cardsearch_after_deletion(self):
_cards = self._fill_test_data_for_querying()
_deleted_focus_iris = {BLARG.b}
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
_cardsearch_cases = itertools.chain(
self.cardsearch_cases(),
self.cardsearch_integer_cases(),
)
for _queryparams, _expected_focus_iris in _cardsearch_cases:
for _queryparams, _expected_focus_iris in self.cardsearch_cases():
if isinstance(_expected_focus_iris, set):
_expected_focus_iris -= _deleted_focus_iris
else:
Expand Down Expand Up @@ -173,17 +164,17 @@ def test_valuesearch(self):

def test_valuesearch_after_deletion(self):
_cards = self._fill_test_data_for_querying()
_deleted_focus_iris = {BLARG.b}
_deleted_focus_iris = {BLARG.c}
self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris])
_cases = [
(
{'valueSearchPropertyPath': 'subject'},
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c}, # BLARG.subj_b no longer present
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_b}, # BLARG.subj_c no longer present
), (
{'valueSearchPropertyPath': 'dateCreated'},
{'1999', '2024'}, # 2012 no longer present
{'1999', '2012'}, # 2024 no longer present
), (
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'bbbb'},
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'cccc'},
set(), # none
)
]
Expand Down Expand Up @@ -296,7 +287,11 @@ def _fill_test_data_for_querying(self):
DCTERMS.created: {rdf.literal(date(2024, 12, 31))},
DCTERMS.creator: {BLARG.someone_else},
DCTERMS.title: {rdf.literal('cccc')},
DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc, BLARG.subj_c},
DCTERMS.subject: {
BLARG['subj_ac/'], # this one has an extra trailing slash
BLARG.subj_bc,
BLARG.subj_c,
},
DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')},
},
BLARG.someone_else: {
Expand Down Expand Up @@ -401,14 +396,6 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
{'cardSearchFilter[dcterms:replaces][is-absent]': ''},
set(),
)
yield (
{'cardSearchFilter[subject]': BLARG.subj_ac},
{BLARG.c, BLARG.a},
)
yield (
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
{BLARG.b},
)
yield (
{
'cardSearchFilter[subject]': BLARG.subj_bc,
Expand Down Expand Up @@ -510,6 +497,8 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str
{'cardSearchText': '"what is here"'},
{BLARG.b},
)
yield from self.cardsearch_trailingslash_cases()
yield from self.cardsearch_integer_cases()

def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
# cases that depend on integer values getting indexed
Expand All @@ -522,6 +511,17 @@ def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] |
[BLARG.c, BLARG.a, BLARG.b], # ordered list
)

def cardsearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]:
# cases that depend on trailing-slash ignorance
yield (
{'cardSearchFilter[subject]': BLARG.subj_ac},
{BLARG.c, BLARG.a},
)
yield (
{'cardSearchFilter[subject][none-of]': BLARG.subj_ac},
{BLARG.b},
)

def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
yield (
{'valueSearchPropertyPath': 'references'},
Expand All @@ -531,6 +531,26 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
{'valueSearchPropertyPath': 'dateCreated'},
{'1999', '2012', '2024'},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchFilter[resourceType]': BLARG.Thing,
},
{BLARG.b, BLARG.c},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchText': 'bbbb',
},
{BLARG.b},
)
yield from self.valuesearch_trailingslash_cases()
yield from self.valuesearch_sameas_cases()
# TODO: more

def valuesearch_trailingslash_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
# cases that depend on trailing-slash ignorance
yield (
{'valueSearchPropertyPath': 'subject'},
{BLARG.subj_a, BLARG.subj_ac, BLARG.subj_b, BLARG.subj_bc, BLARG.subj_c},
Expand All @@ -547,21 +567,43 @@ def valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]:
{'valueSearchPropertyPath': 'subject', 'cardSearchText': 'aaaa'},
{BLARG.subj_ac, BLARG.subj_a},
)

def valuesearch_sameas_cases(self):
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchFilter[resourceType]': BLARG.Thing,
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.a,
},
{BLARG.b, BLARG.c},
{BLARG.subj_ac, BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'references',
'valueSearchText': 'bbbb',
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.a_same,
},
{BLARG.b},
{BLARG.subj_ac, BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[sameAs]': BLARG.b_same,
},
{BLARG.subj_b, BLARG.subj_bc},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'valueSearchFilter[sameAs]': BLARG.subj_a,
},
{BLARG.subj_a},
)
yield (
{
'valueSearchPropertyPath': 'subject',
'cardSearchFilter[subject]': BLARG.subj_ac,
},
{BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc},
)
# TODO: more

def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]):
_messages_chunk = messages.MessagesChunk(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,12 @@ def get_index_strategy(self):

def cardsearch_integer_cases(self):
yield from () # integers not indexed by this strategy

def cardsearch_trailingslash_cases(self):
yield from () # trailing-slash handling improved in trovesearch_denorm

def valuesearch_sameas_cases(self):
yield from () # sameas handling improved in trovesearch_denorm

def valuesearch_trailingslash_cases(self):
yield from () # trailing-slash handling improved in trovesearch_denorm
Loading