From 7b68b96529864e94003519841aa15faac385b2e0 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Thu, 8 Aug 2024 14:19:54 -0400
Subject: [PATCH 01/14] agglomerated changes from quest/minimal-nest

---
 _TODO.txt                                     | 171 ++++
 api/search/views.py                           |   4 +-
 api/views/feeds.py                            |   8 +-
 project/settings.py                           |  45 +-
 share/admin/__init__.py                       |   7 +-
 share/admin/search.py                         |  39 +-
 share/bin/search.py                           |  12 +-
 share/checks.py                               |   8 +-
 share/models/feature_flag.py                  |   1 +
 share/search/__init__.py                      |   3 +-
 share/search/daemon.py                        |  14 +-
 share/search/index_messenger.py               |  18 +-
 share/search/index_strategy/__init__.py       |  96 +-
 share/search/index_strategy/_base.py          | 111 +--
 .../index_strategy/_trovesearch_util.py       | 231 +++++
 share/search/index_strategy/_util.py          |  10 +-
 share/search/index_strategy/elastic8.py       | 110 ++-
 .../search/index_strategy/sharev2_elastic5.py |   2 +-
 .../index_strategy/trove_indexcard_flats.py   |  74 +-
 .../index_strategy/trovesearch_excessive.py   |  93 ++
 .../index_strategy/trovesearch_indexcard.py   | 858 ++++++++++++++++++
 .../index_strategy/trovesearch_irivalues.py   |  99 ++
 share/search/messages.py                      |  10 +-
 share/tasks/__init__.py                       |   4 +-
 templates/admin/search-indexes.html           |  15 +-
 tests/_testutil.py                            |  12 -
 tests/api/test_elasticsearch.py               |   6 +-
 tests/api/test_feeds.py                       |   2 +-
 tests/share/bin/test_sharectl.py              |  20 +-
 tests/share/search/__init__.py                |  18 +
 tests/share/search/conftest.py                |  39 +-
 .../_common_trovesearch_tests.py              | 363 ++++++++
 .../index_strategy/_with_real_services.py     |  47 +-
 .../index_strategy/test_sharev2_elastic5.py   |  11 +-
 .../index_strategy/test_sharev2_elastic8.py   |   5 +-
 ...strategy.py => test_strategy_selection.py} |  51 +-
 .../test_trove_indexcard_flats.py             | 321 +------
 .../test_trovesearch_excessive.py             |  11 +
 .../test_trovesearch_indexcard.py             |  15 +
 .../test_trovesearch_iri_usage.py             |  12 +
 tests/share/search/test_admin_workflow.py     |  10 +-
 trove/models/indexcard.py                     |   4 +-
 trove/trovesearch/search_params.py            |  16 +-
 trove/trovesearch/search_response.py          |  18 +-
 trove/trovesearch/trovesearch_gathering.py    |  19 +-
 trove/views/search.py                         |   4 +-
 46 files changed, 2275 insertions(+), 772 deletions(-)
 create mode 100644 _TODO.txt
 create mode 100644 share/search/index_strategy/_trovesearch_util.py
 create mode 100644 share/search/index_strategy/trovesearch_excessive.py
 create mode 100644 share/search/index_strategy/trovesearch_indexcard.py
 create mode 100644 share/search/index_strategy/trovesearch_irivalues.py
 delete mode 100644 tests/_testutil.py
 create mode 100644 tests/share/search/__init__.py
 create mode 100644 tests/share/search/index_strategy/_common_trovesearch_tests.py
 rename tests/share/search/index_strategy/{test_base_index_strategy.py => test_strategy_selection.py} (55%)
 create mode 100644 tests/share/search/index_strategy/test_trovesearch_excessive.py
 create mode 100644 tests/share/search/index_strategy/test_trovesearch_indexcard.py
 create mode 100644 tests/share/search/index_strategy/test_trovesearch_iri_usage.py

diff --git a/_TODO.txt b/_TODO.txt
new file mode 100644
index 000000000..caac45940
--- /dev/null
+++ b/_TODO.txt
@@ -0,0 +1,171 @@
+using trove for a dashboard of metrics
+======================================
+
+on frontend...
+
+a dashboard has a consistent `cardSearchFilter` set
+
+for each metadata property (or property-path) of interest,
+make a request to `/trove/index-value-search` with that `valueSearchPropertyPath`
+and the dashboard's `cardSearchFilter` set
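+
+for example, one dashboard tile might fetch (a sketch only -- the
+property-path and filter iri below are made up, not from any real dashboard):
+
+    GET /trove/index-value-search
+        ?valueSearchPropertyPath=creator
+        &cardSearchFilter[publisher]=https://example.edu/
+
+each such response should list the distinct iri values used at that property
+(with usage counts) among the cards matching the dashboard's filter set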
+
+
+denormalized IndexStrategy
+==========================
+
+current mappings:
+  simple:
+    indexcard_uuid
+    focus_iri
+    suffuniq_focus_iri
+    source_record_identifier
+    source_config_label
+    iri_paths_present
+    iri_paths_present_suffuniq
+  flattened:
+    flat_iri_values
+    flat_iri_values_suffuniq
+  nested: (THE PROBLEM)
+    nested_iri...
+    nested_date...
+    nested_text...
+
+
+to denormalize for performance (removing (most) `nested` mappings)
+while supporting the existing api used by osf-search...
+
+edges to consider:
+- `cardSearchText[property.path]`
+  - dynamic template for text values per property-path (...to limited depth?)
+- `valueSearchFilter[resourceType]`
+  - dynamic template for iri values per resource-type?
+- `valueSearchText`
+  - ...new index for value-search?
+  - ...maybe can use the same dynamic fields added for `cardSearchText[property.path]`?
+    ...but how to keep the text associated with the iri value...
+  - ...could keep the old `nested` garbage around, but only use it when `valueSearchText`?
+- `cardSearchFilter[sameAs][iri-prefix]=https://orcid.org/`
+  - new filter operator
+- `cardSearchText[*.*.*]`, `cardSearchFilter[*.*.*]`
+  - dynamic templates for values by depth?
+
+
+possible future card-index mappings:
+  simple:
+    indexcard_uuid
+    suid.source_config_label
+    suid.source_record_identifier
+    focus_iri.exact
+    focus_iri.suffuniq
+    propertypaths_present
+  flattened:
+    iri_by_propertypath.exact.*
+    iri_by_propertypath.suffuniq.*
+    iri_by_propertypath_length.exact.*
+    iri_by_propertypath_length.suffuniq.*
+  dynamic: (used instead of the old nested fields for most queries)
+    dynamics.text_by_propertypath.*
+    dynamics.text_by_propertypath_length.*
+    dynamics.date_by_propertypath.*
+    (maybe) dynamics.number_by_propertypath.*
+  nested: (ONLY for index-value-search with `valueSearchText` or `valueSearchFilter[resourceType]`)
+    iri_usage
+      iri.exact
+      iri.suffuniq
+      propertypath
+      propertypath_length
+      type_iri.exact
+      type_iri.suffuniq
+      name_text
+      title_text
+      label_text
+      namelike_text (the three above combined)
+
+
+multiple strategies?
+====================
+after reluctantly accepting `nested` for certain value-searches... how about multiple index strategies?
+
+select a suitable index-strategy based on each query
+
+most queries go to a more constrained index-strategy with a smaller, faster,
+completely non-nested index (calling it "trovesearch_indexcard")
+
+queries that need the extra complexity go to a more complex index-strategy
+with a larger, slower index (calling it "trovesearch_excessive")
+
+however... even simple value-searches need to get metadata about each iri value
+(at least `rdf:type` and something name-like (`dcterms:title`, `foaf:name`, `rdfs:label`...))
+-- without the `nested` mapping, there's not a good way (that i see) to do that in a single query
+
+so how about a third index strategy just for looking up iri-value metadata?
+(calling it "trovesearch_irivalues")
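+
+(sketch of that per-query choice, simplified from the selection logic this
+patch adds in share/search/index_strategy/__init__.py -- the helper name
+here is made up, but `works_with_params` is real:)
+
+    def _pick_trovesearch_strategy_name(params: CardsearchParams) -> str:
+        # prefer the smaller, faster, non-nested index when the query allows
+        if TrovesearchIndexcardIndexStrategy.works_with_params(params):
+            return 'trovesearch_indexcard'
+        # fall back to nested mappings only when actually needed
+        return 'trovesearch_excessive'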
+
+
+trovesearch_indexcard (one per indexcard):
+  simple:
+    indexcard_iri
+    indexcard_pk
+    suid.source_config_label
+    suid.source_record_identifier
+    focus_iri.exact
+    focus_iri.suffuniq
+    propertypaths_present
+  flattened:
+    iri_by_propertypath.*
+    iri_by_depth.*
+  dynamic:
+    dynamics.text_by_propertypath.*
+    dynamics.text_by_depth.*
+    dynamics.date_by_propertypath.*
+
+
+trovesearch_irivalues (one per (indexcard, iri) pair):
+  simple:
+    iri.exact (includes sameAs synonyms)
+    iri.suffuniq (includes sameAs synonyms)
+    indexcard_iri
+    indexcard_pk
+    propertypath_from_focus
+    depth_from_focus
+  flattened:
+    iri_by_relative_propertypath.*
+    iri_by_relative_depth.*
+  dynamic:
+    dynamics.text_by_relative_propertypath.*
+    dynamics.text_by_relative_depth.*
+    dynamics.date_by_relative_propertypath.*
+
+
+trovesearch_excessive:
+  (all fields from trovesearch_indexcard, plus a nested field with
+  fields from (or similar to) trovesearch_irivalues)
+
+
+...ok maybe, but revisiting "trovesearch_irivalues (one per (indexcard, iri) pair)":
+that's a looot of documents, and awfully wasteful for the common case of commonly used iris,
+and trickier to remove docs for iri values no longer used
+
+returning to an old idea discarded from the first "index-card-search" implementation...
+how about an index with (only) one doc per referenced iri? would need to:
+- use IDENTIFIER_USAGE/BACKFILL_IDENTIFIER_USAGE messages;
+  emit after non-backfill indexcard indexing, perhaps deduped within each message chunk
+- the index strategy should, for each identifier message:
+  - query for indexcards that include that identifier,
+  - aggregate metadata included in those indexcards about that identifier,
+  - store a document describing that identifier and its usage
+
+important to account for erroneous sameAs assertions (make it easy to undo)
+
+revised trovesearch_irivalues (one per iri):
+  simple:
+    iri
+    used_at_propertypath
+  flattened:
+    iri_by_relative_propertypath.*
+    iri_by_relative_depth.*
+  dynamic:
+    dynamics.text_by_relative_propertypath.*
+    dynamics.text_by_relative_depth.*
+    dynamics.date_by_relative_propertypath.*

diff --git a/api/search/views.py b/api/search/views.py
index a8fc19cb9..12075a82d 100644
--- a/api/search/views.py
+++ b/api/search/views.py
@@ -8,7 +8,7 @@
 from api import authentication
 
 from share.search import exceptions
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 
 class Sharev2ElasticSearchView(views.APIView):
@@ -32,7 +32,7 @@ def _handle_request(self, request):
         if 'scroll' in queryparams:
             return http.HttpResponseForbidden(reason='Scroll is not supported.')
         try:
-            specific_index = IndexStrategy.get_for_sharev2_search(requested_index_strategy)
+            specific_index = index_strategy.get_index_for_sharev2_search(requested_index_strategy)
         except exceptions.IndexStrategyError as error:
             raise http.Http404(str(error))
         try:
diff --git a/api/views/feeds.py b/api/views/feeds.py
index 4934b2c28..417d479fa 100644
--- a/api/views/feeds.py
+++ b/api/views/feeds.py
@@ -10,7 +10,7 @@
 import pendulum
 import sentry_sdk
 
-from share.search import IndexStrategy
+from share.search import index_strategy
 from share.search.exceptions import IndexStrategyError
 from share.util.xml import strip_illegal_xml_chars
 
@@ -34,6 +34,8 @@ class MetadataRecordsRSS(Feed):
     description = 'Updates to the SHARE open dataset'
     author_name = 'SHARE'
 
+    _search_index: index_strategy.IndexStrategy.SpecificIndex
+
     def title(self, obj):
         query = json.dumps(obj.get('query', 'All'))
         return prepare_string('SHARE: Atom feed for query: {}'.format(query))
@@ -41,7 +43,7 @@ def title(self, obj):
     def get_object(self, request):
         self._order = request.GET.get('order')
         elastic_query = request.GET.get('elasticQuery')
-        self._index_strategy = IndexStrategy.get_for_sharev2_search(request.GET.get('indexStrategy'))
+        self._search_index = index_strategy.get_index_for_sharev2_search(request.GET.get('indexStrategy'))
 
         if self._order not in {'date_modified', 'date_updated', 'date_created', 'date_published'}:
             self._order = 'date_modified'
@@ -62,7 +64,7 @@ def get_object(self, request):
 
     def items(self, obj):
         try:
-            json_response = self._index_strategy.pls_handle_search__sharev2_backcompat(
+            json_response = self._search_index.pls_handle_search__sharev2_backcompat(
                 request_body=obj,
             )
         except IndexStrategyError:
diff --git a/project/settings.py b/project/settings.py
index d091e9e7c..0dafab53f 100644
--- a/project/settings.py
+++ b/project/settings.py
@@ -314,52 +314,15 @@ def split(string, delim):
     'TIMEOUT': int(os.environ.get('ELASTICSEARCH_TIMEOUT', '45')),
     'CHUNK_SIZE': int(os.environ.get('ELASTICSEARCH_CHUNK_SIZE', 2000)),
     'MAX_RETRIES': int(os.environ.get('ELASTICSEARCH_MAX_RETRIES', 7)),
-    'INDEX_STRATEGIES': {},  # populated below based on environment
 }
 ELASTICSEARCH5_URL = (
     os.environ.get('ELASTICSEARCH5_URL')
-    or os.environ.get('ELASTICSEARCH_URL')
+    or os.environ.get('ELASTICSEARCH_URL')  # backcompat
 )
-if ELASTICSEARCH5_URL:
-    ELASTICSEARCH['INDEX_STRATEGIES']['sharev2_elastic5'] = {
-        'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic5.Sharev2Elastic5IndexStrategy',
-        'CLUSTER_SETTINGS': {
-            'URL': ELASTICSEARCH5_URL,
-        },
-    }
 ELASTICSEARCH8_URL = os.environ.get('ELASTICSEARCH8_URL')
-if ELASTICSEARCH8_URL:
-    ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
-    ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
-    ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')
-    ELASTICSEARCH8_CLUSTER_SETTINGS = {
-        'URL': ELASTICSEARCH8_URL,
-        'AUTH': (
-            (ELASTICSEARCH8_USERNAME, ELASTICSEARCH8_SECRET)
-            if ELASTICSEARCH8_SECRET is not None
-            else None
-        ),
-        'CERT_PATH': ELASTICSEARCH8_CERT_PATH,
-    }
-    ELASTICSEARCH['INDEX_STRATEGIES'].update({
-        'sharev2_elastic8': {
-            'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy',
-            'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
-        },
-        'trove_indexcard_flats': {
-            'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy',
-            'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
-        },
-    })
-DEFAULT_INDEX_STRATEGY_FOR_LEGACY_SEARCH = (
-    'sharev2_elastic5'
-    if ELASTICSEARCH5_URL
-    else (
-        'sharev2_elastic8'
-        if ELASTICSEARCH8_URL
-        else None
-    )
-)
+ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
+ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
+ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')
 
 # Seconds, not an actual celery settings
 CELERY_RETRY_BACKOFF_BASE = int(os.environ.get('CELERY_RETRY_BACKOFF_BASE', 2 if DEBUG else 10))
diff --git a/share/admin/__init__.py b/share/admin/__init__.py
index 7d1b67a75..9e68fe2e9 100644
--- a/share/admin/__init__.py
+++ b/share/admin/__init__.py
@@ -17,7 +17,7 @@
 from share.admin.celery import CeleryTaskResultAdmin
 from share.admin.jobs import HarvestJobAdmin
 from share.admin.readonly import ReadOnlyAdmin
-from share.admin.search import search_indexes_view
+from share.admin.search import search_indexes_view, search_index_mappings_view
 from share.admin.util import TimeLimitedPaginator, linked_fk, linked_many, SourceConfigFilter
 from share.harvest.scheduler import HarvestScheduler
 from share.models import (
@@ -48,6 +48,11 @@ def get_urls(self):
             self.admin_view(search_indexes_view),
             name='search-indexes',
         ),
+        path(
+            'search-index-mappings/',
+            self.admin_view(search_index_mappings_view),
+            name='search-index-mappings',
+        ),
         *super().get_urls(),
     ]
diff --git a/share/admin/search.py b/share/admin/search.py
index ce7e3aafe..ea8a254e9 100644
--- a/share/admin/search.py
+++ b/share/admin/search.py
@@ -1,13 +1,13 @@
 import logging
 
-from django.http.response import HttpResponseRedirect
+from django.http.response import HttpResponseRedirect, JsonResponse
 from django.template.response import TemplateResponse
 from django.urls import reverse
 
 from share.admin.util import admin_url
 from share.models.index_backfill import IndexBackfill
 from share.search.index_messenger import IndexMessenger
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 
 logger = logging.getLogger(__name__)
@@ -20,11 +20,12 @@ def search_indexes_view(request):
             'admin/search-indexes.html',
             context={
                 'search_url_prefix': _search_url_prefix(),
+                'mappings_url_prefix': _mappings_url_prefix(),
                 'index_status_by_strategy': _index_status_by_strategy(),
             },
         )
     if request.method == 'POST':
-        _specific_index = IndexStrategy.get_specific_index(request.POST['specific_indexname'])
+        _specific_index = index_strategy.get_specific_index(request.POST['specific_indexname'])
         _pls_doer = PLS_DOERS[request.POST['pls_do']]
         _pls_doer(_specific_index)
         _redirect_id = (
@@ -35,24 +36,35 @@
     return HttpResponseRedirect('#'.join((request.path, _redirect_id)))
 
 
+def search_index_mappings_view(request, index_name):
+    _specific_index = index_strategy.get_specific_index(index_name)
+    _mappings = _specific_index.pls_get_mappings()
+    return JsonResponse(_mappings)
+
+
 def _search_url_prefix():
     api_url = reverse('api:search')
     return f'{api_url}?indexStrategy='  # append strategyname or indexname
 
 
+def _mappings_url_prefix():
+    # return reverse('admin:search-index-mappings', kwargs={'index_name': ''})
+    return '/admin/search-index-mappings/'
+
+
 def _index_status_by_strategy():
-    backfill_by_indexname = {
+    backfill_by_indexname: dict[str, IndexBackfill] = {
         backfill.specific_indexname: backfill
         for backfill in (
             IndexBackfill.objects
-            .filter(index_strategy_name__in=IndexStrategy.all_strategies_by_name().keys())
+            .filter(index_strategy_name__in=index_strategy.all_index_strategies().keys())
         )
     }
     status_by_strategy = {}
     _messenger = IndexMessenger()
-    for index_strategy in IndexStrategy.all_strategies():
-        current_index = index_strategy.for_current_index()
-        status_by_strategy[index_strategy.name] = {
+    for _index_strategy in index_strategy.all_index_strategies().values():
+        current_index = _index_strategy.for_current_index()
+        status_by_strategy[_index_strategy.name] = {
             'current': {
                 'status': current_index.pls_get_status(),
                 'backfill': _serialize_backfill(
@@ -62,7 +74,7 @@
             },
             'prior': sorted((
                 specific_index.pls_get_status()
-                for specific_index in index_strategy.each_specific_index()
+                for specific_index in _index_strategy.each_specific_index()
                 if not specific_index.is_current
             ), reverse=True),
             'queues': [
                 {
                     'name': _queue_name,
                     **_messenger.get_queue_stats(_queue_name),
                 }
                 for _queue_name in (
-                    index_strategy.urgent_messagequeue_name,
-                    index_strategy.nonurgent_messagequeue_name,
+                    _index_strategy.urgent_messagequeue_name,
+                    _index_strategy.nonurgent_messagequeue_name,
                 )
             ],
         }
     return status_by_strategy
 
 
-def _serialize_backfill(specific_index: IndexStrategy.SpecificIndex, backfill: IndexBackfill):
+def _serialize_backfill(
+    specific_index: index_strategy.IndexStrategy.SpecificIndex,
+    backfill: IndexBackfill | None,
+):
     if not specific_index.is_current:
         return {}
     if not backfill:
diff --git a/share/bin/search.py b/share/bin/search.py
index 80418440d..69f5c0eff 100644
--- a/share/bin/search.py
+++ b/share/bin/search.py
@@ -1,7 +1,7 @@
 from project.celery import app as celery_app
 
 from share.bin.util import command
-from share.search import IndexStrategy
+from share.search import index_strategy
 from share.search.exceptions import IndexStrategyError
 from share.search.daemon import IndexerDaemonControl
 
@@ -29,7 +29,7 @@ def purge(args, argv):
     Usage: {0} search purge ...
     """
     for index_name in args['']:
-        specific_index = IndexStrategy.get_specific_index(index_name)
+        specific_index = index_strategy.get_specific_index(index_name)
         specific_index.pls_delete()
@@ -43,18 +43,16 @@ def setup(args, argv):
     if _is_initial:
         _specific_indexes = [
             _index_strategy.for_current_index()
-            for _index_strategy in IndexStrategy.all_strategies()
+            for _index_strategy in index_strategy.all_index_strategies().values()
         ]
     else:
         _index_or_strategy_name = args['']
         try:
-            _specific_indexes = [
-                IndexStrategy.get_by_name(_index_or_strategy_name).for_current_index(),
-            ]
+            _specific_indexes = [index_strategy.get_specific_index(_index_or_strategy_name)]
         except IndexStrategyError:
             try:
                 _specific_indexes = [
-                    IndexStrategy.get_specific_index(_index_or_strategy_name),
+                    index_strategy.get_specific_index(_index_or_strategy_name),
                 ]
             except IndexStrategyError:
                 raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"')
diff --git a/share/checks.py b/share/checks.py
index 0a3ec321f..a53d2a228 100644
--- a/share/checks.py
+++ b/share/checks.py
@@ -2,18 +2,18 @@
 
 def check_all_index_strategies_current(app_configs, **kwargs):
-    from share.search import IndexStrategy
+    from share.search import index_strategy
     from share.search.exceptions import IndexStrategyError
     errors = []
-    for index_strategy in IndexStrategy.all_strategies():
+    for _index_strategy in index_strategy.all_index_strategies().values():
        try:
-            index_strategy.assert_strategy_is_current()
+            _index_strategy.assert_strategy_is_current()
        except IndexStrategyError as exception:
            errors.append(
                checks.Error(
                    'IndexStrategy changed without checksum confirmation!',
                    hint=str(exception),
-                    obj=index_strategy,
+                    obj=_index_strategy,
                    id='share.search.E001',
                )
            )
diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py
index df0903122..f2f9c57cc 100644
--- a/share/models/feature_flag.py
+++ b/share/models/feature_flag.py
@@ -31,6 +31,7 @@ class FeatureFlag(models.Model):
     IGNORE_SHAREV2_INGEST = 'ignore_sharev2_ingest'
     SUGGEST_CREATOR_FACET = 'suggest_creator_facet'
     FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed'
+    TROVESEARCH_POLYSTRAT = 'trovesearch_polystrat'
 
     # name _should_ be one of the constants above, but that is not enforced by `choices`
     name = models.TextField(unique=True)
diff --git a/share/search/__init__.py b/share/search/__init__.py
index ac51d920e..7f723488c 100644
--- a/share/search/__init__.py
+++ b/share/search/__init__.py
@@ -1,6 +1,5 @@
 from share.search.messages import MessageType, MessagesChunk
-from share.search.index_strategy import IndexStrategy
 from share.search.index_messenger import IndexMessenger
 
-__all__ = ('IndexStrategy', 'IndexMessenger', 'MessageType', 'MessagesChunk',)
+__all__ = ('IndexMessenger', 'MessageType', 'MessagesChunk',)
diff --git a/share/search/daemon.py b/share/search/daemon.py
index 58108a1f3..90aedb855 100644
--- a/share/search/daemon.py
+++ b/share/search/daemon.py
@@ -11,7 +11,12 @@
 from kombu.mixins import ConsumerMixin
 import sentry_sdk
 
-from share.search import exceptions, messages, IndexStrategy, IndexMessenger
+from share.search import (
+    exceptions,
+    messages,
+    index_strategy,
+    IndexMessenger,
+)
 
 
 logger = logging.getLogger(__name__)
@@ -52,7 +57,7 @@ def start_daemonthreads_for_strategy(self, index_strategy):
         return _daemon
 
     def start_all_daemonthreads(self):
-        for _index_strategy in IndexStrategy.all_strategies():
+        for _index_strategy in index_strategy.all_index_strategies().values():
             self.start_daemonthreads_for_strategy(_index_strategy)
 
     def stop_daemonthreads(self, *, wait=False):
@@ -176,7 +181,7 @@ def __repr__(self):
 
 @dataclasses.dataclass
 class MessageHandlingLoop:
-    index_strategy: IndexStrategy
+    index_strategy: index_strategy.IndexStrategy
     message_type: messages.MessageType
     stop_event: threading.Event
     local_message_queue: queue.Queue
@@ -243,7 +248,6 @@ def _get_daemon_messages(self):
         return daemon_messages_by_target_id
 
     def _handle_some_messages(self):
-        # each message corresponds to one action on this daemon's index
         start_time = time.time()
         doc_count, error_count = 0, 0
         daemon_messages_by_target_id = self._get_daemon_messages()
@@ -265,7 +269,7 @@ def _handle_some_messages(self):
                 logger.error('%sEncountered error: %s', self.log_prefix, message_response.error_text)
                 sentry_sdk.capture_message('error handling message', extras={'message_response': message_response})
             target_id = message_response.index_message.target_id
-            for daemon_message in daemon_messages_by_target_id.pop(target_id):
+            for daemon_message in daemon_messages_by_target_id.pop(target_id, ()):
                 daemon_message.ack()  # finally set it free
         if daemon_messages_by_target_id:  # should be empty by now
             logger.error('%sUnhandled messages?? %s', self.log_prefix, len(daemon_messages_by_target_id))
diff --git a/share/search/index_messenger.py b/share/search/index_messenger.py
index 7162ef533..0cd51293b 100644
--- a/share/search/index_messenger.py
+++ b/share/search/index_messenger.py
@@ -12,7 +12,7 @@
 
 from share.models import FeatureFlag
 from share.search.messages import MessagesChunk, MessageType
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 
 logger = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def __init__(self, *, celery_app=None, index_strategys=None):
             if celery_app is None
             else celery_app
         )
-        self.index_strategys = index_strategys or tuple(IndexStrategy.all_strategies())
+        self.index_strategys = index_strategys or tuple(index_strategy.all_index_strategies().values())
 
     def notify_indexcard_update(self, indexcards, *, urgent=False):
         self.send_messages_chunk(
@@ -62,18 +62,18 @@ def notify_suid_update(self, suid_ids, *, urgent=False):
         )
 
     def incoming_messagequeue_iter(self, channel) -> typing.Iterable[kombu.Queue]:
-        for index_strategy in self.index_strategys:
-            yield kombu.Queue(channel=channel, name=index_strategy.urgent_messagequeue_name)
-            yield kombu.Queue(channel=channel, name=index_strategy.nonurgent_messagequeue_name)
+        for _index_strategy in self.index_strategys:
+            yield kombu.Queue(channel=channel, name=_index_strategy.urgent_messagequeue_name)
+            yield kombu.Queue(channel=channel, name=_index_strategy.nonurgent_messagequeue_name)
 
     def outgoing_messagequeue_iter(self, connection, message_type: MessageType, urgent: bool) -> typing.Iterable[kombu.simple.SimpleQueue]:
-        for index_strategy in self.index_strategys:
-            if message_type in index_strategy.supported_message_types:
+        for _index_strategy in self.index_strategys:
+            if message_type in _index_strategy.supported_message_types:
                 yield connection.SimpleQueue(
                     name=(
-                        index_strategy.urgent_messagequeue_name
+                        _index_strategy.urgent_messagequeue_name
                         if urgent
-                        else index_strategy.nonurgent_messagequeue_name
+                        else _index_strategy.nonurgent_messagequeue_name
                     ),
                 )
diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py
index 6daa53848..2ea608bd1 100644
--- a/share/search/index_strategy/__init__.py
+++ b/share/search/index_strategy/__init__.py
@@ -1,4 +1,98 @@
+from __future__ import annotations
+import functools
+from types import MappingProxyType
+
+from django.conf import settings
+
+from share.search.exceptions import IndexStrategyError
+from share.models import FeatureFlag
+from trove.trovesearch import search_params
+from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy
+from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy
+from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy
+from .trovesearch_indexcard import TrovesearchIndexcardIndexStrategy
+from .trovesearch_irivalues import TrovesearchIrivaluesIndexStrategy
+from .trovesearch_excessive import TrovesearchExcessiveIndexStrategy
 from ._base import IndexStrategy
 
-__all__ = ('IndexStrategy',)
+__all__ = (
+    'IndexStrategy',
+    'all_index_strategies',
+    'get_index_for_sharev2_search',
+    'get_index_for_trovesearch',
+    'get_index_strategy',
+    'get_specific_index',
+)
+
+
+@functools.cache
+def all_index_strategies() -> MappingProxyType[str, IndexStrategy]:
+    return MappingProxyType({
+        _strategy.name: _strategy
+        for _strategy in _iter_all_index_strategies()
+    })
+
+
+def _iter_all_index_strategies():
+    if settings.ELASTICSEARCH5_URL:
+        yield Sharev2Elastic5IndexStrategy(name='sharev2_elastic5')
+    if settings.ELASTICSEARCH8_URL:
+        yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8')
+        yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats')
+        yield TrovesearchIndexcardIndexStrategy(name='trovesearch_indexcard')
+        yield TrovesearchIrivaluesIndexStrategy(name='trovesearch_irivalues')
+        yield TrovesearchExcessiveIndexStrategy(name='trovesearch_excessive')
+
+
+def get_index_strategy(strategyname: str) -> IndexStrategy:
+    try:
+        return all_index_strategies()[strategyname]
+    except KeyError:
+        raise IndexStrategyError(f'unknown index strategy "{strategyname}"')
+
+
+def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> IndexStrategy.SpecificIndex:
+    try:
+        _strategy = get_index_strategy(indexname_or_strategyname)
+        return (
+            _strategy.pls_get_default_for_searching()
+            if for_search
+            else _strategy.for_current_index()
+        )
+    except IndexStrategyError:
+        for _index_strategy in all_index_strategies().values():
+            try:
+                return _index_strategy.for_specific_index(indexname_or_strategyname)
+            except IndexStrategyError:
+                pass
+    raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"')
+
+
+def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex:
+    if requested_name:
+        _name = requested_name
+    elif (
+        settings.ELASTICSEARCH5_URL
+        and not FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT)
+    ):
+        _name = 'sharev2_elastic5'
+    elif settings.ELASTICSEARCH8_URL:
+        _name = 'sharev2_elastic8'
+    else:
+        raise IndexStrategyError('no available index for sharev2 search')
+    return get_specific_index(_name, for_search=True)
+
+
+def get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex:
+    if params.index_strategy_name:  # specific strategy requested
+        _name = params.index_strategy_name
+    elif not FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_POLYSTRAT):
+        _name = 'trove_indexcard_flats'
+    else:
+        _name = (
+            'trovesearch_indexcard'
+            if TrovesearchIndexcardIndexStrategy.works_with_params(params)
+            else 'trovesearch_excessive'
+        )
+    return get_specific_index(_name, for_search=True)
diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py
index a2b14f7b5..1f21aefd5 100644
--- a/share/search/index_strategy/_base.py
+++ b/share/search/index_strategy/_base.py
@@ -1,12 +1,8 @@
 import abc
-import importlib
 import logging
 import typing
 
-from django.conf import settings
-
 from share.search import messages
-from share.models.feature_flag import FeatureFlag
 from share.models.index_backfill import IndexBackfill
 from share.search.exceptions import IndexStrategyError
 from share.search.index_status import IndexStatus
@@ -40,111 +36,17 @@ class IndexStrategy(abc.ABC):
     * may know of version- or cluster-specific features
       (should include identifiers like version numbers in subclass name)
     '''
-    CURRENT_STRATEGY_CHECKSUM: ChecksumIri = None  # set on subclasses to protect against accidents
-
-    __all_strategys_by_name = None  # cache for cls.all_strategies_by_name()
-
-    @classmethod
-    def clear_strategy_cache(self):
-        self.__all_strategys_by_name = None
-
-    @classmethod
-    def all_strategies_by_name(cls) -> 'dict[str, IndexStrategy]':
-        if cls.__all_strategys_by_name is None:
-            cls.__all_strategys_by_name = {
-                name: cls._load_from_settings(name, index_strategy_settings)
-                for name, index_strategy_settings
-                in settings.ELASTICSEARCH['INDEX_STRATEGIES'].items()
-            }
-        return cls.__all_strategys_by_name
-
-    @classmethod
-    def all_strategies(cls) -> 'typing.Iterable[IndexStrategy]':
-        yield from cls.all_strategies_by_name().values()
-
-    @classmethod
-    def get_by_name(cls, index_strategy_name: str) -> 'IndexStrategy':
-        try:
-            return cls.all_strategies_by_name()[index_strategy_name]
-        except KeyError:
-            raise IndexStrategyError(f'unknown index strategy "{index_strategy_name}"')
-
-    @classmethod
-    def get_specific_index(cls, specific_indexname: str) -> 'IndexStrategy.SpecificIndex':
-        for index_strategy in cls.all_strategies():
-            try:
-                return index_strategy.for_specific_index(specific_indexname)
-            except IndexStrategyError:
-                pass
-        raise IndexStrategyError(f'unrecognized indexname "{specific_indexname}"')
-
-    @classmethod
-    def get_for_sharev2_search(cls, requested_name=None) -> 'IndexStrategy.SpecificIndex':
-        if requested_name:
-            _name = requested_name
-        else:
-            _name = (
-                'sharev2_elastic8'
-                if FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT)
-                else settings.DEFAULT_INDEX_STRATEGY_FOR_LEGACY_SEARCH
-            )
-        try:  # could be a strategy name
-            return cls.get_by_name(_name).pls_get_default_for_searching()
-        except IndexStrategyError:
-            try:  # could be a specific indexname
-                return cls.get_specific_index(_name)
-            except IndexStrategyError:
-                raise IndexStrategyError(f'unknown name: "{_name}"')
-
-    @classmethod
-    def get_for_trove_search(cls, requested_name=None) -> 'IndexStrategy.SpecificIndex':
-        if requested_name:
-            _name = requested_name
-        else:
-            _name = 'trove_indexcard_flats'
-        try:  # could be a strategy name
-            return cls.get_by_name(_name).pls_get_default_for_searching()
-        except IndexStrategyError:
-            try:  # could be a specific indexname
-                return cls.get_specific_index(_name)
-            except IndexStrategyError:
-                raise IndexStrategyError(f'unknown name: "{_name}"')
-
-    @classmethod
-    def _load_from_settings(cls, index_strategy_name, index_strategy_settings):
-        assert set(index_strategy_settings) == {'INDEX_STRATEGY_CLASS', 'CLUSTER_SETTINGS'}, (
-            'values in settings.ELASTICSEARCH[\'INDEX_STRATEGIES\'] must have keys: '
-            'INDEX_STRATEGY_CLASS, CLUSTER_SETTINGS'
-        )
-        class_path = index_strategy_settings['INDEX_STRATEGY_CLASS']
-        module_name, separator, class_name = class_path.rpartition('.')
-        if not separator:
-            raise IndexStrategyError(f'INDEX_STRATEGY_CLASS should be importable dotted-path to an IndexStrategy class; got "{class_path}"')
-        assert module_name.startswith('share.search.index_strategy.'), (
-            'for now, INDEX_STRATEGY_CLASS must start with "share.search.index_strategy."'
-            f' (got "{module_name}")'
-        )
-        index_strategy_class = getattr(importlib.import_module(module_name), class_name)
-        assert issubclass(index_strategy_class, cls)
-        return index_strategy_class(
-            name=index_strategy_name,
-            cluster_settings=index_strategy_settings['CLUSTER_SETTINGS'],
-        )
-
-    def __init__(self, name, cluster_settings):
+    CURRENT_STRATEGY_CHECKSUM: ChecksumIri  # set on subclasses to protect against accidents
+
+    def __init__(self, name):
         self.name = name
-        self.cluster_settings = cluster_settings
 
     def __repr__(self):
         return ''.join((
             self.__class__.__qualname__,
-            f'(name={self.name})'
+            f'(name="{self.name}")'
         ))
 
-    @property
-    def cluster_url(self):
-        return self.cluster_settings['URL']
-
     @property
     def nonurgent_messagequeue_name(self):
         return f'{self.name}.nonurgent'
@@ -188,7 +90,7 @@ def assert_strategy_is_current(self):
 ```''')
 
     def for_specific_index(self, specific_indexname) -> 'IndexStrategy.SpecificIndex':
-        return self.SpecificIndex(self, specific_indexname)
+        return self.SpecificIndex(self, specific_indexname)  # type: ignore[abstract]
 
     def for_current_index(self) -> 'IndexStrategy.SpecificIndex':
         return self.for_specific_index(self.current_indexname)
@@ -321,6 +223,9 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
     def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
         raise NotImplementedError
 
+    def pls_get_mappings(self) -> dict:
+        raise NotImplementedError
+
     # TODO someday:
     # def pls_handle_propertysearch(self, propertysearch_params: PropertysearchParams) -> PropertysearchResponse:
     #     raise NotImplementedError
diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py
new file mode 100644
index 000000000..2e97cea82
--- /dev/null
+++ b/share/search/index_strategy/_trovesearch_util.py
@@ -0,0 +1,231 @@
+from __future__ import annotations
+import base64
+from collections import defaultdict
+import contextlib
+import dataclasses
+import datetime
+import functools
+import json
+import logging
+import typing
+
+from django.db.models import Exists, OuterRef
+from primitive_metadata import primitive_rdf as rdf
+
+from trove import models as trove_db
+from trove.trovesearch.search_params import (
+    is_globpath,
+)
+from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
+from trove.vocab.namespaces import (
+    DCTERMS,
+    FOAF,
+    OSFMAP,
+    OWL,
+    RDFS,
+    SKOS,
+    TROVE,
+)
+from trove.vocab.osfmap import is_date_property
+
+
+_logger = logging.getLogger(__name__)
+
+
+###
+# type aliases
+
+Propertypath = tuple[str, ...]
+
+
+###
+# constants
+
+SKIPPABLE_PROPERTIES = (
+    OSFMAP.contains,  # too much, not helpful
+    OWL.sameAs,  # handled special
+)
+
+TITLE_PROPERTIES = (DCTERMS.title,)
+NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
+LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
+NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
+
+VALUESEARCH_MAX = 234
+CARDSEARCH_MAX = 9997
+
+KEYWORD_LENGTH_MAX = 8191  # skip keyword terms that might exceed lucene's internal limit
+# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
+KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX}
+FLATTENED_MAPPING = {'type': 'flattened', 'ignore_above': KEYWORD_LENGTH_MAX}
+TEXT_MAPPING = {
+    'type': 'text',
+    'index_options': 'offsets',  # for highlighting
+}
+IRI_KEYWORD_MAPPING = {
+    'type': 'object',
+    'properties': {  # for indexing iri values two ways:
+        'exact': KEYWORD_MAPPING,  # the exact iri value (e.g. "https://foo.example/bar/")
+        'suffuniq': KEYWORD_MAPPING,  # "sufficiently unique" (e.g. "://foo.example/bar")
+    },
+}
+
+
+###
+# utilities
+
+def latest_rdf_for_indexcard_pks(indexcard_pks):
+    return (
+        trove_db.LatestIndexcardRdf.objects
+        .filter(indexcard_id__in=indexcard_pks)
+        .filter(Exists(
+            trove_db.DerivedIndexcard.objects
+            .filter(upriver_indexcard_id=OuterRef('indexcard_id'))
+            .filter(deriver_identifier__in=(
+                trove_db.ResourceIdentifier.objects
+                .queryset_for_iri(TROVE['derive/osfmap_json'])
+            ))
+        ))
+        .exclude(indexcard__deleted__isnull=False)
+        .select_related('indexcard__source_record_suid__source_config')
+        .prefetch_related('indexcard__focus_identifier_set')
+    )
+
+
+def propertypath_as_keyword(path: Propertypath) -> str:
+    return json.dumps(path if is_globpath(path) else [
+        get_sufficiently_unique_iri(_iri)
+        for _iri in path
+    ])
+
+
+def propertypath_as_field_name(path: Propertypath) -> str:
+    _path_keyword = propertypath_as_keyword(path)
+    return base64.urlsafe_b64encode(_path_keyword.encode()).decode()
+
+
+def suffuniq_iris(iris: typing.Iterable[str]) -> list[str]:
+    # deduplicates, may reorder
+    return list({
+        get_sufficiently_unique_iri(_iri)
+        for _iri in iris
+    })
+
+
+@dataclasses.dataclass
+class GraphWalk:
+    rdfdoc: rdf.RdfGraph
+    focus_iri: str
+    recursive: bool = True
+    iri_values: dict[Propertypath, set[str]] = dataclasses.field(
+        default_factory=lambda: defaultdict(set),
+    )
+    text_values: dict[Propertypath, set[rdf.Literal]] = dataclasses.field(
+        default_factory=lambda: defaultdict(set),
+    )
+    date_values: dict[Propertypath, set[datetime.date]] = dataclasses.field(
+        default_factory=lambda: defaultdict(set),
+    )
+    paths_walked: set[Propertypath] = dataclasses.field(default_factory=set)
+    _visiting: set[str] = dataclasses.field(default_factory=set)
+
+    def __post_init__(self):
+        for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri):
+            self.paths_walked.add(_walk_path)
+            if isinstance(_walk_obj, str):
+                self.iri_values[_walk_path].add(_walk_obj)
+            elif isinstance(_walk_obj, datetime.date):
+                self.date_values[_walk_path].add(_walk_obj)
+            elif is_date_property(_walk_path[-1]):
+                try:
+                    _parsed_date = datetime.date.fromisoformat(_walk_obj.unicode_value)
+                except ValueError:
+                    _logger.debug('skipping malformatted date "%s"', _walk_obj.unicode_value)
+                else:
+                    self.date_values[_walk_path].add(_parsed_date)
+            elif isinstance(_walk_obj, rdf.Literal):
+                self.text_values[_walk_path].add(_walk_obj.unicode_value)
+
+    def shortwalk(self, from_iri: str) -> GraphWalk:
+        return GraphWalk(
+            self.rdfdoc,
+            from_iri,
+            recursive=False,
+        )
+
+    def _walk_from_subject(
+        self,
+        iri_or_blanknode: str | rdf.Blanknode,
+        path_so_far: tuple[str, ...] = (),
+    ) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
+        '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object
+        '''
+        with self._visit(iri_or_blanknode):
+            _twoples = (
+                iri_or_blanknode
+                if isinstance(iri_or_blanknode, frozenset)
+                else self.rdfdoc.tripledict.get(iri_or_blanknode, {})
+            )
+            for _next_steps, _obj in walk_twoples(_twoples):
+                _path = (*path_so_far, *_next_steps)
+                yield (_path, _obj)
+                if self.recursive and isinstance(_obj, str) and (_obj not in self._visiting):
+                    # step further for iri or blanknode
+                    yield from self._walk_from_subject(_obj, path_so_far=_path)
+
+    @functools.cached_property
+    def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]:
+        _paths_by_iri: defaultdict[str, set[Propertypath]] = defaultdict(set)
+        for _path, _iris in self.iri_values.items():
+            for _iri in _iris:
+                _paths_by_iri[_iri].add(_path)
+        return _paths_by_iri
+
+    def iri_synonyms(self, iri: str) -> set[str]:
+        # note: extremely limited inference -- assumes objects of owl:sameAs are not used as subjects
+        _synonyms = (
+            _synonym
+            for _synonym in self.rdfdoc.q(iri, OWL.sameAs)
+            if is_worthwhile_iri(_synonym)
+        )
+        return {iri, *_synonyms}
+
+    def iris_synonyms(self, iris: typing.Iterable[str]) -> set[str]:
+        return {
+            _synonym
+            for _iri in iris
+            for _synonym in self.iri_synonyms(_iri)
+        }
+
+    @contextlib.contextmanager
+    def _visit(self, focus_obj):
+        assert focus_obj not in self._visiting
+        self._visiting.add(focus_obj)
+        yield
+        self._visiting.discard(focus_obj)
+
+
+def walk_twoples(
+    twoples: rdf.RdfTwopleDictionary | rdf.Blanknode,
+) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
+    if isinstance(twoples, frozenset):
+        _iter_twoples = (
+            (_pred, _obj)
+            for _pred, _obj in twoples
+            if _pred not in SKIPPABLE_PROPERTIES
+        )
+    else:
+        _iter_twoples = (
+            (_pred, _obj)
+            for _pred, _obj_set in twoples.items()
+            if _pred not in SKIPPABLE_PROPERTIES
+            for _obj in _obj_set
+        )
+    for _pred, _obj in _iter_twoples:
+        _path = (_pred,)
+        if isinstance(_obj, frozenset):
+            for _innerpath, _innerobj in walk_twoples(_obj):
+                _fullpath = (*_path, *_innerpath)
+                yield (_fullpath, _innerobj)
+        else:
+            yield (_path, _obj)
diff --git a/share/search/index_strategy/_util.py b/share/search/index_strategy/_util.py
index ffa9999f6..5b3586006 100644
--- a/share/search/index_strategy/_util.py
+++ b/share/search/index_strategy/_util.py
@@ -2,6 +2,7 @@
 import dataclasses
 import datetime
 import json
+import typing
 
 
 def timestamp_to_readable_datetime(timestamp_in_milliseconds):
@@ -16,10 +17,13 @@ def timestamp_to_readable_datetime(timestamp_in_milliseconds):
 
 def encode_cursor_dataclass(dataclass_instance) -> str:
     _as_json = json.dumps(dataclasses.astuple(dataclass_instance))
-    _cursor_bytes = base64.b64encode(_as_json.encode())
+    _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode())
     return _cursor_bytes.decode()
 
 
-def decode_cursor_dataclass(cursor: str, dataclass_class) -> dict:
-    _as_list = json.loads(base64.b64decode(cursor))
+_SomeDataclass = typing.TypeVar('_SomeDataclass')
+
+
+def decode_cursor_dataclass(cursor: str, dataclass_class: type[_SomeDataclass]) -> _SomeDataclass:
+    _as_list = json.loads(base64.urlsafe_b64decode(cursor))
     return dataclass_class(*_as_list)
diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py
index 4fded9788..7e772e41f 100644
--- a/share/search/index_strategy/elastic8.py
+++ b/share/search/index_strategy/elastic8.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
 import abc
 import collections
+import dataclasses
+from http import HTTPStatus
 import logging
 import typing
@@ -26,10 +29,14 @@ def __init__(self, *args, **kwargs):
         should_sniff = settings.ELASTICSEARCH['SNIFF']
         timeout = settings.ELASTICSEARCH['TIMEOUT']
         self.es8_client = elasticsearch8.Elasticsearch(
-            self.cluster_url,
+            settings.ELASTICSEARCH8_URL,
             # security:
-            ca_certs=self.cluster_settings.get('CERT_PATH'),
-            basic_auth=self.cluster_settings.get('AUTH'),
+            ca_certs=settings.ELASTICSEARCH8_CERT_PATH,
+            basic_auth=(
+                (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET)
+                if settings.ELASTICSEARCH8_SECRET is not None
+                else None
+            ),
             # retry:
             retry_on_timeout=True,
             request_timeout=timeout,
@@ -57,6 +64,13 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk) -> typin
         # yield (message_target_id, elastic_action) pairs
         raise NotImplementedError
 
+    def before_chunk(
+        self,
+        messages_chunk: messages.MessagesChunk,
+        indexnames: typing.Iterable[str],
+    ) -> None:
+        pass  # implement when needed
+
     ###
     # helper methods for subclasses to use (or override)
 
@@ -109,45 +123,56 @@ def each_specific_index(self):
 
     def pls_handle_messages_chunk(self, messages_chunk):
         self.assert_message_type(messages_chunk.message_type)
         if messages_chunk.message_type.is_backfill:
-            indexnames = [self.current_indexname]
+            _indexnames = {self.current_indexname}
         else:
-            indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live)
-        _targetid_by_docid = {}
-        done_counter = collections.Counter()
-        bulk_stream = streaming_bulk(
+            _indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live)
+        self.before_chunk(messages_chunk, _indexnames)
+        _action_tracker = _ActionTracker()
+        _bulk_stream = streaming_bulk(
             self.es8_client,
-            self._elastic_actions_with_index(messages_chunk, indexnames, _targetid_by_docid),
+            self._elastic_actions_with_index(messages_chunk, _indexnames, _action_tracker),
             raise_on_error=False,
             max_retries=settings.ELASTICSEARCH['MAX_RETRIES'],
         )
-        for (_ok, _response) in bulk_stream:
+        for (_ok, _response) in _bulk_stream:
             (_op_type, _response_body) = next(iter(_response.items()))
             _status = _response_body.get('status')
             _docid = _response_body['_id']
+            _indexname = _response_body['_index']
             _is_done = _ok or (_op_type == 'delete' and _status == 404)
-            _message_target_id = _targetid_by_docid[_docid]
-            done_counter[_message_target_id] += 1
-            if done_counter[_message_target_id] >= len(indexnames):
+            if _is_done:
+                _action_tracker.action_done(_indexname, _docid)
+            else:
+                _action_tracker.action_errored(_indexname, _docid)
+                # yield error responses immediately
                 yield messages.IndexMessageResponse(
-                    is_done=_is_done,
-                    index_message=messages.IndexMessage(messages_chunk.message_type, _message_target_id),
-                    status_code=_response_body.get('status'),
-                    error_text=(
-                        None
-                        if _ok
-                        else str(_response_body)
-                    )
+                    is_done=False,
+                    index_message=messages.IndexMessage(
+                        messages_chunk.message_type,
+                        _action_tracker.get_message_id(_docid),
+                    ),
+                    status_code=_status,
+                    error_text=str(_response_body),
                 )
+        # yield successes after the whole chunk completes
+        # (since one message may involve several actions)
+        for _messageid in _action_tracker.all_done_messages():
+            yield messages.IndexMessageResponse(
+                is_done=True,
+                index_message=messages.IndexMessage(messages_chunk.message_type, _messageid),
+                status_code=HTTPStatus.OK.value,
+                error_text=None,
+            )
 
     # abstract method from IndexStrategy
-    def pls_make_default_for_searching(self, specific_index: 'SpecificIndex'):
+    def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex):
         self._set_indexnames_for_alias(
             self._alias_for_searching,
             {specific_index.indexname},
         )
 
     # abstract method from IndexStrategy
-    def pls_get_default_for_searching(self) -> 'SpecificIndex':
+    def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex:
         # a SpecificIndex for an alias will work fine for searching, but
         # will error if you try to invoke lifecycle hooks
         return self.for_specific_index(self._alias_for_searching)
@@ -166,12 +191,13 @@
     def _alias_for_searching(self):
         return f'{self.indexname_prefix}search'
 
     def _alias_for_keeping_live(self):
         return f'{self.indexname_prefix}live'
 
-    def _elastic_actions_with_index(self, messages_chunk, indexnames, targetid_by_docid):
+    def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker):
         if not indexnames:
             raise ValueError('cannot index to no indexes')
         for _message_target_id, _elastic_action in self.build_elastic_actions(messages_chunk):
-            targetid_by_docid[_elastic_action['_id']] = _message_target_id
+            _docid = _elastic_action['_id']
             for _indexname in indexnames:
+                action_tracker.add_action(_message_target_id, _indexname, _docid)
                 yield {
                     **_elastic_action,
                     '_index': _indexname,
                 }
@@ -325,3 +351,37 @@ def pls_stop_keeping_live(self):
             alias_name=self.index_strategy._alias_for_keeping_live,
         )
         logger.warning('%r: no longer kept live', self)
+
+    def pls_get_mappings(self):
+        return self.index_strategy.es8_client.indices.get_mapping(index=self.indexname).body
+
+
+@dataclasses.dataclass
+class _ActionTracker:
+    messageid_by_docid: dict[str, int] = dataclasses.field(default_factory=dict)
+    actions_by_messageid: dict[int, set[tuple[str, str]]] = dataclasses.field(
+        default_factory=lambda: collections.defaultdict(set),
+    )
+    errored_messageids: set[int] = dataclasses.field(default_factory=set)
+
+    def add_action(self, message_id: int, index_name: str, doc_id: str):
+        self.messageid_by_docid[doc_id] = message_id
+        self.actions_by_messageid[message_id].add((index_name, doc_id))
+
+    def action_done(self, index_name: str, doc_id: str):
+        _messageid = self.messageid_by_docid[doc_id]
+        _message_actions = self.actions_by_messageid[_messageid]
+        _message_actions.discard((index_name, doc_id))
+
+    def action_errored(self, index_name: str, doc_id: str):
+        _messageid = self.messageid_by_docid[doc_id]
+        self.errored_messageids.add(_messageid)
+
+    def get_message_id(self, doc_id: str):
+        return self.messageid_by_docid[doc_id]
+
+    def all_done_messages(self):
+        for _messageid, _actions in self.actions_by_messageid.items():
+            if _messageid not in self.errored_messageids:
+                assert not _actions
+                yield _messageid
diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py
index 951921f56..13edb4881 100644
--- a/share/search/index_strategy/sharev2_elastic5.py
+++ b/share/search/index_strategy/sharev2_elastic5.py
@@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         should_sniff = settings.ELASTICSEARCH['SNIFF']
         self.es5_client = elasticsearch5.Elasticsearch(
-            self.cluster_url,
+            settings.ELASTICSEARCH5_URL,
             retry_on_timeout=True,
             timeout=settings.ELASTICSEARCH['TIMEOUT'],
             # sniff before doing anything
diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py
index e2b879d24..a88946b45 100644
--- a/share/search/index_strategy/trove_indexcard_flats.py
+++ b/share/search/index_strategy/trove_indexcard_flats.py
@@ -345,18 +345,14 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
 
     def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
         _cursor = _SimpleCursor.from_page_param(valuesearch_params.page)
-        _is_date_search = all(
-            is_date_property(_path[-1])
-            for _path in valuesearch_params.valuesearch_propertypath_set
-        )
+        _is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1])
         _search_kwargs = dict(
             query=self._cardsearch_query(
                 valuesearch_params.cardsearch_filter_set,
                 valuesearch_params.cardsearch_textsegment_set,
-                additional_filters=[{'terms': {'iri_paths_present': [
-                    iri_path_as_keyword(_path)
-                    for _path in valuesearch_params.valuesearch_propertypath_set
-                ]}}],
+                additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword(
+                    valuesearch_params.valuesearch_propertypath,
+                )}}],
             ),
             size=0,  # ignore cardsearch hits; just want the aggs
             aggs=(
@@ -451,46 +447,14 @@ def _cardsearch_aggs(self, cardsearch_params):
                 ],
                 'size': len(cardsearch_params.related_property_paths),
             }}
-        if cardsearch_params.unnamed_iri_values:
-            _aggs['global_agg'] = {
-                'global': {},
-                'aggs': {
-                    'filtervalue_info': {
-                        'nested': {'path': 'nested_iri'},
-                        'aggs': {
-                            'iri_values': {
-                                'terms': {
-                                    'field': 'nested_iri.iri_value',
-                                    'include': list(cardsearch_params.unnamed_iri_values),
-                                    'size': len(cardsearch_params.unnamed_iri_values),
-                                },
-                                'aggs': {
-                                    'type_iri': {'terms': {
-                                        'field': 'nested_iri.value_type_iri',
-                                    }},
-                                    'name_text': {'terms': {
-                                        'field': 'nested_iri.value_name_text.raw',
-                                    }},
-                                    'title_text': {'terms': {
-                                        'field': 'nested_iri.value_title_text.raw',
-                                    }},
-                                    'label_text': {'terms': {
-                                        'field': 'nested_iri.value_label_text.raw',
-                                    }},
-                                },
-                            },
-                        },
-                    },
-                },
-            }
         return _aggs
 
     def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: '_SimpleCursor'):
         _nested_iri_bool = {
-            'filter': [{'terms': {'nested_iri.suffuniq_path_from_focus': [
-                iri_path_as_keyword(_path, suffuniq=True)
-                for _path in valuesearch_params.valuesearch_propertypath_set
-            ]}}],
+            'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword(
+                valuesearch_params.valuesearch_propertypath,
+                suffuniq=True,
+            )}}],
             'must': [],
             'must_not': [],
             'should': [],
@@ -552,11 +516,11 @@ def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams):
             'nested': {'path': 'nested_date'},
             'aggs': {
                 'value_at_propertypath': {
-                    'filter': {'terms': {
-                        'nested_date.suffuniq_path_from_focus': [
-                            iri_path_as_keyword(_path, suffuniq=True)
-                            for _path in valuesearch_params.valuesearch_propertypath_set
-                        ],
+                    'filter': {'term': {
+                        'nested_date.suffuniq_path_from_focus': iri_path_as_keyword(
+                            valuesearch_params.valuesearch_propertypath,
+                            suffuniq=True,
+                        ),
                     }},
                     'aggs': {
                         'count_by_year': {
@@ -760,7 +724,7 @@ def _cardsearch_response(
                 _uuid: _i
                 for (_i, _uuid) in enumerate(cursor.first_page_uuids)
             }
-            _results.sort(key=lambda _r: _uuid_index[_r.card_uuid()])
+            _results.sort(key=lambda _r: _uuid_index[_r.card_uuid])
         else:
             _should_start_reproducible_randomness = (
                 cursor.random_sort
@@ -773,16 +737,9 @@ def _cardsearch_response(
             )
             if _should_start_reproducible_randomness:
                 cursor.first_page_uuids = tuple(
-                    _result.card_uuid()
+                    _result.card_uuid
                     for _result in _results
                 )
-        _filtervalue_info = []
-        if cardsearch_params.unnamed_iri_values:
-            _filtervalue_agg = es8_response['aggregations']['global_agg']['filtervalue_info']['iri_values']
-            _filtervalue_info.extend(
-                self._valuesearch_iri_result(_iri_bucket)
-                for _iri_bucket in _filtervalue_agg['buckets']
-            )
         _relatedproperty_list = []
         if cardsearch_params.related_property_paths:
             _relatedproperty_list.extend(
@@ -803,7 +760,6 @@ def _cardsearch_response(
                 else cursor.result_count
             ),
             search_result_page=_results,
-            filtervalue_info=_filtervalue_info,
             related_propertypath_results=_relatedproperty_list,
             next_page_cursor=cursor.next_cursor(),
             prev_page_cursor=cursor.prev_cursor(),
diff --git a/share/search/index_strategy/trovesearch_excessive.py b/share/search/index_strategy/trovesearch_excessive.py
new file mode 100644
index 000000000..1cb6ae6b2
--- /dev/null
+++ b/share/search/index_strategy/trovesearch_excessive.py
@@ -0,0 +1,93 @@
+import typing
+
+from primitive_metadata import primitive_rdf as rdf
+
+from share.util.checksum_iri import ChecksumIri
+
+from . import _trovesearch_util as ts
+from .trovesearch_indexcard import TrovesearchIndexcardIndexStrategy as IndexcardStrategy
+
+
+class TrovesearchExcessiveIndexStrategy(IndexcardStrategy):
+    '''a more complicated version of the "indexcard" trovesearch strategy
+
+    for `index-value-search` queries that the flatter index can't handle
+    '''
+    CURRENT_STRATEGY_CHECKSUM = ChecksumIri(
+        checksumalgorithm_name='sha-256',
+        salt='TrovesearchExcessiveIndexStrategy',
+        hexdigest='...',
+    )
+
+    # override TrovesearchIndexcardIndexStrategy
+    def index_mappings(self):
+        _mappings = super().index_mappings()
+        _namelike_text_mapping = {
+            **ts.TEXT_MAPPING,
+            'fields': {'keyword': ts.KEYWORD_MAPPING},
+            'copy_to': 'iri_usage.namelike_text',
+        }
+        # add nested properties
+        # (warning: SLOW, use only when needed (and do be sure to question that need))
+        _mappings['properties']['iri_usage'] = {
+            'type': 'nested',
+            'properties': {
+                'iri': ts.IRI_KEYWORD_MAPPING,  # include sameAs
+                'propertypath_from_focus': ts.KEYWORD_MAPPING,
+                'depth_from_focus': ts.KEYWORD_MAPPING,
+                # flattened properties (dynamic sub-properties with keyword values)
+                'relative_iri_by_propertypath': ts.FLATTENED_MAPPING,
+                'relative_iri_by_depth': ts.FLATTENED_MAPPING,
+                # text properties (only a few)
+                'name_text': _namelike_text_mapping,
+                'title_text': _namelike_text_mapping,
+                'label_text': _namelike_text_mapping,
+                'namelike_text': {'type': 'text'},
+            },
+        }
+        return _mappings
+
+    # override TrovesearchIndexcardIndexStrategy
+    class _SourcedocBuilder(IndexcardStrategy._SourcedocBuilder):
+        # override TrovesearchIndexcardIndexStrategy._SourcedocBuilder
+        def build(self):
+            _sourcedoc = super().build()
+            _sourcedoc['iri_usage'] = self._nested_iri_usages()
+            return _sourcedoc
+
+        def _nested_iri_usages(self) -> list:
+            return list(filter(bool, (
+                self._iri_usage_sourcedoc(_iri, _paths)
+                for _iri, _paths in self._fullwalk.paths_by_iri.items()
+            )))
+
+        def _iri_usage_sourcedoc(self, iri: str, paths: set[ts.Propertypath]) -> dict | None:
+            _shortwalk = self._fullwalk.shortwalk(iri)
+            return {
+                'iri': self._exact_and_suffuniq_iris([iri], _shortwalk),
+                'propertypath_from_focus': list(map(ts.propertypath_as_keyword, paths)),
+                'depth_from_focus': list(map(len, paths)),
+                'iri_by_propertypath': self._iris_by_propertypath(_shortwalk),
+                'iri_by_depth': self._iris_by_depth(_shortwalk),
+                'dynamics': {
+                    'text_by_propertypath': self._texts_by_propertypath(_shortwalk),
+                    'text_by_depth': self._texts_by_depth(_shortwalk),
+                    'date_by_propertypath': self._dates_by_propertypath(_shortwalk),
+                },
+            }
+
+        def _gather_text_values(self, focus_iri, pathset) -> typing.Iterator[str]:
+            for _obj in self.rdfdoc.q(focus_iri, pathset):
+                if isinstance(_obj, rdf.Literal):
+                    yield _obj.unicode_value
+
+    # override TrovesearchIndexcardIndexStrategy
+    class _ValuesearchQueryBuilder(IndexcardStrategy._ValuesearchQueryBuilder):
+        ...
+
+        # override _CardsearchQueryBuilder
+        def _additional_cardsearch_filters(self) -> list[dict]:
+            # TODO: consider
+            return [{'term': {'propertypaths_present': ts.propertypath_as_keyword(
+                self.params.valuesearch_propertypath
+            )}}]
diff --git a/share/search/index_strategy/trovesearch_indexcard.py b/share/search/index_strategy/trovesearch_indexcard.py
new file mode 100644
index 000000000..89ee0f2a0
--- /dev/null
+++ b/share/search/index_strategy/trovesearch_indexcard.py
@@ -0,0 +1,858 @@
+from __future__ import annotations
+import base64
+from collections import abc, defaultdict
+import dataclasses
+import functools
+import json
+import logging
+import re
+from typing import Iterable, ClassVar, Iterator
+
+from django.conf import settings
+import elasticsearch8
+from primitive_metadata import primitive_rdf as rdf
+
+from share.search import exceptions
+from share.search import messages
+from share.search.index_strategy.elastic8 import Elastic8IndexStrategy
+from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass
+from share.util.checksum_iri import ChecksumIri
+from trove import models as trove_db
+from trove.trovesearch.search_params import (
+    CardsearchParams,
+    ValuesearchParams,
+    SearchFilter,
+    Textsegment,
+    PageParam,
+    is_globpath,
+)
+from trove.trovesearch.search_response import (
+    CardsearchResponse,
+    ValuesearchResponse,
+    TextMatchEvidence,
+    CardsearchResult,
+    ValuesearchResult,
+    PropertypathUsage,
+)
+from trove.vocab.osfmap import is_date_property
+from trove.vocab.namespaces import TROVE
+from . import _trovesearch_util as ts
+
+
+logger = logging.getLogger(__name__)
+
+
+class TrovesearchIndexcardIndexStrategy(Elastic8IndexStrategy):
+    CURRENT_STRATEGY_CHECKSUM = ChecksumIri(
+        checksumalgorithm_name='sha-256',
+        salt='TrovesearchIndexcardIndexStrategy',
+        hexdigest='...',
+    )
+
+    @classmethod
+    def works_with_params(cls, params: CardsearchParams):
+        return (
+            not isinstance(params, ValuesearchParams)
+            or (  # constraints on valuesearch:
+                not params.valuesearch_textsegment_set
+                and all(
+                    _filter.is_sameas_filter()
+                    for _filter in params.valuesearch_filter_set
+                )
+            )
+        )
+
+    # abstract method from IndexStrategy
+    @property
+    def supported_message_types(self):
+        return {
+            messages.MessageType.UPDATE_INDEXCARD,
+            messages.MessageType.BACKFILL_INDEXCARD,
+        }
+
+    # abstract method from IndexStrategy
+    @property
+    def backfill_message_type(self):
+        return messages.MessageType.BACKFILL_INDEXCARD
+
+    # abstract method from Elastic8IndexStrategy
+    def index_settings(self):
+        return {}
+
+    # abstract method from Elastic8IndexStrategy
+    def index_mappings(self):
+        return {
+            'dynamic': 'false',
+            'properties': {
+                # simple keyword properties
+                'indexcard_iri': ts.KEYWORD_MAPPING,
+                'indexcard_pk': ts.KEYWORD_MAPPING,
+                'suid': {
+                    'type': 'object',
+                    'properties': {
+                        'source_config_label': ts.KEYWORD_MAPPING,
+                        'source_record_identifier': ts.KEYWORD_MAPPING,
+                    },
+                },
+                'focus_iri': ts.IRI_KEYWORD_MAPPING,
+                'propertypaths_present': ts.KEYWORD_MAPPING,
+                # flattened properties (dynamic sub-properties with keyword values)
+                'iri_by_propertypath': ts.FLATTENED_MAPPING,
+                'iri_by_depth': ts.FLATTENED_MAPPING,
+                # dynamic properties (see dynamic_templates, below)
+                'dynamics': {
+                    'type': 'object',
+                    'properties': {
+                        'text_by_propertypath': {'type': 'object', 'dynamic': True},
'dynamic': True}, + 'text_by_depth': {'type': 'object', 'dynamic': True}, + 'date_by_propertypath': {'type': 'object', 'dynamic': True}, + }, + }, + }, + 'dynamic_templates': [ + {'dynamic_text_by_path': { + 'path_match': 'dynamics.text_by_propertypath.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_text_by_depth': { + 'path_match': 'dynamics.text_by_depth.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_date': { + 'path_match': 'dynamics.date_by_propertypath.*', + 'mapping': { + 'type': 'date', + 'format': 'strict_date_optional_time', + }, + }}, + ], + } + + # abstract method from Elastic8IndexStrategy + def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + _indexcard_rdf_qs = ( + ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) + .select_related('indexcard__source_record_suid__source_config') + ) + _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk) + for _indexcard_rdf in _indexcard_rdf_qs: + _docbuilder = self._SourcedocBuilder(_indexcard_rdf) + if not _docbuilder.should_skip(): # if skipped, will be deleted + _indexcard_pk = _indexcard_rdf.indexcard_id + _index_action = self.build_index_action( + doc_id=str(_indexcard_pk), + doc_source=_docbuilder.build(), + ) + _remaining_indexcard_pks.discard(_indexcard_pk) + yield _indexcard_pk, _index_action + # delete any that don't have "latest" rdf and derived osfmap_json + for _indexcard_pk in _remaining_indexcard_pks: + yield _indexcard_pk, self.build_delete_action(_indexcard_pk) + + ### + # implement abstract IndexStrategy.SpecificIndex + + class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + return self.index_strategy.es8_client.search( + index=self.indexname, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse: + _querybuilder = self.index_strategy._CardsearchQueryBuilder(cardsearch_params) + _search_kwargs = _querybuilder.build() + _cursor = _querybuilder.cardsearch_cursor + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.index_strategy.es8_client.search( + index=self.indexname, + source=False, # no need to get _source, identifiers are enough + docvalue_fields=['indexcard_iri'], + highlight={ # TODO: only one field gets highlighted? 
+ 'require_field_match': False, + 'fields': {'dynamics.text_by_propertypath.*': {}}, + }, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self.index_strategy._cardsearch_response(cardsearch_params, _es8_response, _cursor) + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: + _querybuilder = self.index_strategy._ValuesearchQueryBuilder(valuesearch_params) + _search_kwargs = _querybuilder.build() + _cursor = _querybuilder.valuesearch_cursor + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.index_strategy.es8_client.search( + index=self.indexname, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self.index_strategy._valuesearch_response(valuesearch_params, _es8_response, _cursor) + + ### + # building sourcedocs + + @dataclasses.dataclass + class _SourcedocBuilder: + '''build an elasticsearch sourcedoc for an rdf document + ''' + indexcard_rdf: trove_db.IndexcardRdf + indexcard: trove_db.Indexcard = dataclasses.field(init=False) + rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) + focus_iri: str = dataclasses.field(init=False) + + def __post_init__(self) -> None: + self.indexcard = self.indexcard_rdf.indexcard + self.rdfdoc = rdf.RdfGraph(self.indexcard_rdf.as_rdf_tripledict()) + self.focus_iri = self.indexcard_rdf.focus_iri + + def should_skip(self) -> bool: + _suid = self.indexcard.source_record_suid + return ( + # skip cards that belong to an obsolete suid with a later duplicate + _suid.has_forecompat_replacement() + # ...or that are without some value for name/title/label + or not any(self.rdfdoc.q(self.focus_iri, ts.NAMELIKE_PROPERTIES)) + ) + + def build(self) -> dict: + _sourcedoc = { + 'indexcard_iri': self.indexcard.get_iri(), + 'indexcard_pk': str(self.indexcard.pk), + 'suid': { + 'source_record_identifier': self.indexcard.source_record_suid.identifier, + 'source_config_label': self.indexcard.source_record_suid.source_config.label, + }, + 'focus_iri': self._exact_and_suffuniq_iris([self.focus_iri], self._fullwalk), + 'propertypaths_present': self._propertypaths_present(self._fullwalk), + 'iri_by_propertypath': self._iris_by_propertypath(self._fullwalk), + 'iri_by_depth': self._iris_by_depth(self._fullwalk), + 'dynamics': { + 'text_by_propertypath': self._texts_by_propertypath(self._fullwalk), + 'text_by_depth': self._texts_by_depth(self._fullwalk), + 'date_by_propertypath': self._dates_by_propertypath(self._fullwalk), + }, + } + return _sourcedoc + + @functools.cached_property + def _fullwalk(self) -> ts.GraphWalk: + return ts.GraphWalk(self.rdfdoc, self.focus_iri) + + def _propertypaths_present(self, walk: ts.GraphWalk): + return [ + ts.propertypath_as_keyword(_path) + for _path in walk.paths_walked + ] + + def _iris_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): ts.suffuniq_iris(walk.iris_synonyms(_iris)) + for _path, _iris in walk.iri_values.items() + } + + def _iris_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _iris in walk.iri_values.items(): + _by_depth[len(_path)].update(_iris) + return { + _depth_field_name(_depth): ts.suffuniq_iris(walk.iris_synonyms(_iris)) + for _depth, _iris in 
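the per-propertypath field names used above come from
`ts.propertypath_as_field_name` (defined in `_trovesearch_util`, not shown
in this patch); judging by its inverse `_parse_path_field_name` near the end
of this module, a sketch of what it presumably does:

    import base64
    import json

    def propertypath_as_field_name(path: tuple[str, ...]) -> str:
        # urlsafe base64 of the json-encoded path, so any tuple of iris
        # becomes a single mapping-safe field name
        _as_json = json.dumps(list(path))
        return base64.urlsafe_b64encode(_as_json.encode()).decode()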
_by_depth.items() + } + + def _texts_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): list(_value_set) + for _path, _value_set in walk.text_values.items() + } + + def _texts_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _value_set in walk.text_values.items(): + _by_depth[len(_path)].update(_value_set) + return { + _depth_field_name(_depth): list(_value_set) + for _depth, _value_set in _by_depth.items() + } + + def _dates_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): [ + _date.isoformat() + for _date in _value_set + ] + for _path, _value_set in walk.date_values.items() + } + + def _exact_and_suffuniq_iris(self, iris: Iterable[str], walk: ts.GraphWalk): + _synonyms = walk.iris_synonyms(iris) + return { + 'exact': list(_synonyms), + 'suffuniq': ts.suffuniq_iris(_synonyms), + } + + ### + # building queries + + @dataclasses.dataclass + class _CardsearchQueryBuilder: + params: CardsearchParams + + def build(self): + return { + 'query': self._cardsearch_query(), + 'aggs': self._cardsearch_aggs(), + 'sort': list(self._cardsearch_sorts()) or None, + 'from_': self.cardsearch_cursor.cardsearch_start_index(), + 'size': self.cardsearch_cursor.page_size, + } + + @functools.cached_property + def cardsearch_cursor(self): + return _CardsearchCursor.from_cardsearch_params(self.params) + + @property + def relevance_matters(self) -> bool: + return not self.cardsearch_cursor.random_sort + + def _cardsearch_query(self) -> dict: + _bool_query = { + 'filter': self._additional_cardsearch_filters(), + 'must': [], + 'must_not': [], + 'should': [], + } + # iri-keyword filters + for _searchfilter in self.params.cardsearch_filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator.is_date_operator(): + _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + # text-based queries + for _boolkey, _textquery in self._cardsearch_text_boolparts(): + _bool_query[_boolkey].append(_textquery) + return self._wrap_bool_query(_bool_query) + + def _wrap_bool_query(self, bool_query_innards) -> dict: + # note: may modify bool_query_innards in-place + _cursor = self.cardsearch_cursor + if not _cursor or not _cursor.random_sort: + # no need for randomness + return {'bool': bool_query_innards} + if not _cursor.first_page_pks: + # independent random sample + return { + 'function_score': { + 'query': {'bool': bool_query_innards}, + 'boost_mode': 'replace', + 'random_score': {}, # default random_score is fast and unpredictable + }, + } + _firstpage_filter = {'terms': {'indexcard_pk': _cursor.first_page_pks}} + if _cursor.is_first_page(): + # returning to a first page previously visited + bool_query_innards['filter'].append(_firstpage_filter) + return {'bool': bool_query_innards} + # get a subsequent page using reproducible 
randomness + bool_query_innards['must_not'].append(_firstpage_filter) + return { + 'function_score': { + 'query': {'bool': bool_query_innards}, + 'boost_mode': 'replace', + 'random_score': { + 'seed': ''.join(_cursor.first_page_pks), + 'field': 'indexcard_pk', + }, + }, + } + + def _additional_cardsearch_filters(self) -> list[dict]: + return [] # for overriding + + def _cardsearch_aggs(self): + _aggs = {} + if self.params.related_property_paths: + _aggs['agg_related_propertypath_usage'] = {'terms': { + 'field': 'propertypaths_present', + 'include': [ + ts.propertypath_as_keyword(_path) + for _path in self.params.related_property_paths + ], + 'size': len(self.params.related_property_paths), + }} + return _aggs + + def _cardsearch_presence_query(self, search_filter) -> dict: + return _any_query([ + self._cardsearch_path_presence_query(_path) + for _path in search_filter.propertypath_set + ]) + + def _cardsearch_path_presence_query(self, path: ts.Propertypath): + return {'term': {'propertypaths_present': ts.propertypath_as_keyword(path)}} + + def _cardsearch_iri_filter(self, search_filter) -> dict: + _iris = ts.suffuniq_iris(search_filter.value_set) + return _any_query([ + self._cardsearch_path_iri_query(_path, _iris) + for _path in search_filter.propertypath_set + ]) + + def _cardsearch_path_iri_query(self, path, suffuniq_iris): + _field = ( + f'iri_by_propertypath.{ts.propertypath_as_field_name(path)}' + if not is_globpath(path) + else f'iri_by_depth.{_depth_field_name(len(path))}' + ) + return {'terms': {_field: suffuniq_iris}} + + def _cardsearch_date_filter(self, search_filter): + return _any_query([ + self._date_filter_for_path(_path, search_filter.operator, search_filter.value_set) + for _path in search_filter.propertypath_set + ]) + + def _date_filter_for_path(self, path, filter_operator, value_set): + _field = f'dynamics.date_by_propertypath.{ts.propertypath_as_field_name(path)}' + if filter_operator == SearchFilter.FilterOperator.BEFORE: + _value = min(value_set) # rely on string-comparable isoformat + return {'range': {_field: {'lt': _daterange_value(_value)}}} + elif filter_operator == SearchFilter.FilterOperator.AFTER: + _value = max(value_set) # rely on string-comparable isoformat + return {'range': {_field: {'gt': _daterange_value(_value)}}} + elif filter_operator == SearchFilter.FilterOperator.AT_DATE: + return _any_query([ + {'range': {_field: {'gte': _filtervalue, 'lte': _filtervalue}}} + for _filtervalue in map(_daterange_value, value_set) + ]) + else: + raise ValueError(f'invalid date filter operator (got {filter_operator})') + + def _cardsearch_sorts(self): + for _sortparam in self.params.sort_list: + _pathfield = ts.propertypath_as_field_name((_sortparam.property_iri,)) + _fieldpath = f'dynamics.date_by_propertypath.{_pathfield}' + _order = 'desc' if _sortparam.descending else 'asc' + yield {_fieldpath: _order} + + def _cardsearch_text_boolparts(self) -> Iterator[tuple[str, dict]]: + for _textsegment in self.params.cardsearch_textsegment_set: + if _textsegment.is_negated: + yield 'must_not', self._exact_text_query(_textsegment) + elif not _textsegment.is_fuzzy: + yield 'must', self._exact_text_query(_textsegment) + else: + yield 'must', self._fuzzy_text_must_query(_textsegment) + if self.relevance_matters: + yield 'should', self._fuzzy_text_should_query(_textsegment) + + def _text_field_name(self, propertypath: ts.Propertypath): + return ( + f'dynamics.text_by_propertypath.{ts.propertypath_as_field_name(propertypath)}' + if not is_globpath(propertypath) + else 
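a sketch of the page-two query `_wrap_bool_query` builds, assuming the first
(unseeded) page returned cards with pks '11', '22', '33':

    _page_two_query = {
        'function_score': {
            'query': {'bool': {
                'filter': [],  # plus the original query's filters
                'must_not': [{'terms': {'indexcard_pk': ['11', '22', '33']}}],
            }},
            'boost_mode': 'replace',
            'random_score': {
                'seed': '112233',  # ''.join(first_page_pks)
                'field': 'indexcard_pk',
            },
        },
    }

excluding the remembered first-page pks and seeding `random_score` with them
keeps every subsequent page consistent with the order the visitor first saw.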
f'dynamics.text_by_depth.{_depth_field_name(len(propertypath))}' + ) + + def _exact_text_query(self, textsegment: Textsegment) -> dict: + # TODO: textsegment.is_openended (prefix query) + return _any_query([ + {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} + for _path in textsegment.propertypath_set + ]) + + def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: + # TODO: textsegment.is_openended (prefix query) + return _any_query([ + {'match': { + self._text_field_name(_path): { + 'query': textsegment.text, + 'fuzziness': 'AUTO', + # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) + }, + }} + for _path in textsegment.propertypath_set + ]) + + def _fuzzy_text_should_query(self, textsegment: Textsegment): + _slop = len(textsegment.text.split()) + return _any_query([ + {'match_phrase': { + self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop}, + }} + for _path in textsegment.propertypath_set + ]) + + class _ValuesearchQueryBuilder(_CardsearchQueryBuilder): + params: ValuesearchParams + + # override _CardsearchQueryBuilder + def build(self): + if self._is_date_valuesearch(): + _aggs = self._valuesearch_date_aggs() + else: + _aggs = self._valuesearch_iri_aggs() + return dict( + query=self._cardsearch_query(), + size=0, # ignore cardsearch hits; just want the aggs + aggs=_aggs, + ) + + @functools.cached_property + def valuesearch_cursor(self): + return _SimpleCursor.from_page_param(self.params.page) + + # override _CardsearchQueryBuilder + @property + def relevance_matters(self) -> bool: + return False # valuesearch always ordered by count + + def _is_date_valuesearch(self) -> bool: + return is_date_property(self.params.valuesearch_propertypath[-1]) + + def _valuesearch_iri_aggs(self): + _propertypath = self.params.valuesearch_propertypath + _field = f'iri_by_propertypath.{ts.propertypath_as_field_name(_propertypath)}' + _terms_agg: dict = {'field': _field} + _specific_iris = list(set(self.params.valuesearch_iris())) + if _specific_iris: + _terms_agg['include'] = _specific_iris + _terms_agg['size'] = len(_specific_iris) + return {'agg_valuesearch_iris': {'terms': _terms_agg}} + + def _valuesearch_date_aggs(self): + _propertypath = self.params.valuesearch_propertypath + _field = f'date_by_propertypath.{ts.propertypath_as_field_name(_propertypath)}' + _aggs = { + 'agg_valuesearch_dates': { + 'date_histogram': { + 'field': _field, + 'calendar_interval': 'year', + 'format': 'yyyy', + 'order': {'_key': 'desc'}, + 'min_doc_count': 1, + }, + }, + } + return _aggs + + ### + # normalizing search responses + + def _valuesearch_response( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: '_SimpleCursor', + ) -> ValuesearchResponse: + _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') + if _iri_aggs: + _buckets = _iri_aggs['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly inefficient pagination (part two) + _page_end_index = cursor.start_index + cursor.page_size + _bucket_page = _buckets[cursor.start_index:_page_end_index] # discard prior pages + cursor.result_count = ( + -1 # "many more" + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count + ) + return ValuesearchResponse( + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + next_page_cursor=cursor.next_cursor(), + prev_page_cursor=cursor.prev_cursor(), + 
first_page_cursor=cursor.first_cursor(), + ) + else: # assume date + _year_buckets = ( + es8_response['aggregations'] + ['agg_valuesearch_dates'] + ['buckets'] + ) + return ValuesearchResponse( + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + ) + + def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=iri_bucket['key'], + # TODO: get type and text somehow + value_type=_bucketlist(iri_bucket.get('type_iri', [])), + name_text=_bucketlist(iri_bucket.get('name_text', [])), + title_text=_bucketlist(iri_bucket.get('title_text', [])), + label_text=_bucketlist(iri_bucket.get('label_text', [])), + match_count=iri_bucket['doc_count'], + ) + + def _valuesearch_date_result(self, date_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) + + def _cardsearch_response( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: '_CardsearchCursor', + ) -> CardsearchResponse: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.result_count = -1 # "too many" + else: # exact (and small) count + cursor.result_count = _es8_total['value'] + if cursor.random_sort and not cursor.is_first_page(): + # account for the filtered-out first page + assert cursor.result_count is not None + cursor.result_count += len(cursor.first_page_pks) + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['fields']['indexcard_iri'][0] + _results.append(CardsearchResult( + card_iri=_card_iri, + card_pk=_es8_hit['_id'], + text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), + )) + if cursor.is_first_page() and cursor.first_page_pks: + # revisiting first page; reproduce original random order + _ordering_by_id = { + _id: _i + for (_i, _id) in enumerate(cursor.first_page_pks) + } + _results.sort(key=lambda _r: _ordering_by_id[_r.card_pk]) + else: + _should_start_reproducible_randomness = ( + cursor.random_sort + and cursor.is_first_page() + and not cursor.first_page_pks + and not cursor.has_many_more() + and any( + not _filter.is_type_filter() # look for a non-default filter + for _filter in cardsearch_params.cardsearch_filter_set + ) + ) + if _should_start_reproducible_randomness: + cursor.first_page_pks = tuple(_result.card_pk for _result in _results) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths + ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['agg_related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchResponse( + total_result_count=( + TROVE['ten-thousands-and-more'] + if cursor.has_many_more() + else cursor.result_count + ), + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + next_page_cursor=cursor.next_cursor(), + prev_page_cursor=cursor.prev_cursor(), + first_page_cursor=cursor.first_cursor(), + ) + + def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: + for 
_field, _snippets in es8_hit.get('highlight', {}).items(): + (_, _, _encoded_path) = _field.rpartition('.') + _property_path = _parse_path_field_name(_encoded_path) + for _snippet in _snippets: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=rdf.literal(_snippet), + card_iri=card_iri, + ) + + +### +# assorted helper functions + +def _bucketlist(agg_result: dict) -> list[str]: + return [ + _bucket['key'] + for _bucket in agg_result['buckets'] + ] + + +def _daterange_value(datevalue: str): + _cleanvalue = datevalue.strip() + if re.fullmatch(r'\d{4,}', _cleanvalue): + return f'{_cleanvalue}||/y' + if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/M' + if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/d' + raise ValueError(f'bad date value "{datevalue}"') + + +def _depth_field_name(depth: int) -> str: + return f'depth{depth}' + + +def _parse_path_field_name(path_field_name: str) -> ts.Propertypath: + # inverse of propertypath_as_field_name + _list = json.loads(base64.urlsafe_b64decode(path_field_name.encode()).decode()) + assert isinstance(_list, list) + assert all(isinstance(_item, str) for _item in _list) + return tuple(_list) + + +def _any_query(queries: abc.Collection[dict]): + if len(queries) == 1: + (_query,) = queries + return _query + return {'bool': {'should': list(queries), 'minimum_should_match': 1}} + + +def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[ts.Propertypath], nested_path: str): + _suffuniq_iri_paths = [] + _glob_path_lengths = [] + for _path in propertypath_set: + if is_globpath(_path): + _glob_path_lengths.append(len(_path)) + else: + _suffuniq_iri_paths.append(ts.propertypath_as_keyword(_path)) + if _suffuniq_iri_paths and _glob_path_lengths: + return {'bool': { + 'minimum_should_match': 1, + 'should': [ + {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, + {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, + ], + }} + if _glob_path_lengths: + return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} + return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} + + +@dataclasses.dataclass +class _SimpleCursor: + start_index: int + page_size: int + result_count: int | None # use -1 to indicate "many more" + + MAX_INDEX: ClassVar[int] = ts.VALUESEARCH_MAX + + @classmethod + def from_page_param(cls, page: PageParam) -> '_SimpleCursor': + if page.cursor: + return decode_cursor_dataclass(page.cursor, cls) + assert page.size is not None + return cls( + start_index=0, + page_size=page.size, + result_count=None, # should be set when results are in + ) + + def next_cursor(self) -> str | None: + if not self.result_count: + return None + _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) + return ( + encode_cursor_dataclass(_next) + if _next.is_valid_cursor() + else None + ) + + def prev_cursor(self) -> str | None: + _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) + return ( + encode_cursor_dataclass(_prev) + if _prev.is_valid_cursor() + else None + ) + + def first_cursor(self) -> str | None: + if self.is_first_page(): + return None + return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) + + def is_first_page(self) -> bool: + return self.start_index == 0 + + def has_many_more(self) -> bool: + return self.result_count == -1 + + def max_index(self) -> int: + return ( + self.MAX_INDEX + if self.has_many_more() + 
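a usage sketch of the cursor round-trip above (the `PageParam` constructor
call is an assumption here; its actual signature lives in
`trove.trovesearch.search_params`):

    _cursor = _SimpleCursor.from_page_param(PageParam(size=13, cursor=None))
    _cursor.result_count = 40  # set once results are in
    _encoded = _cursor.next_cursor()  # opaque string for start_index=13
    assert decode_cursor_dataclass(_encoded, _SimpleCursor).start_index == 13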
else min(self.result_count or 0, self.MAX_INDEX) + ) + + def is_valid_cursor(self) -> bool: + return 0 <= self.start_index < self.max_index() + + +@dataclasses.dataclass +class _CardsearchCursor(_SimpleCursor): + random_sort: bool # how to sort by relevance to nothingness? randomness! + first_page_pks: tuple[str, ...] = () + + MAX_INDEX: ClassVar[int] = ts.CARDSEARCH_MAX + + @classmethod + def from_cardsearch_params(cls, params: CardsearchParams) -> '_CardsearchCursor': + if params.page.cursor: + return decode_cursor_dataclass(params.page.cursor, cls) + assert params.page.size is not None + return cls( + start_index=0, + page_size=params.page.size, + result_count=None, # should be set when results are in + random_sort=( + not params.sort_list + and not params.cardsearch_textsegment_set + ), + ) + + def cardsearch_start_index(self) -> int: + if self.is_first_page() or not self.random_sort: + return self.start_index + return self.start_index - len(self.first_page_pks) + + def first_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() + + def prev_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() diff --git a/share/search/index_strategy/trovesearch_irivalues.py b/share/search/index_strategy/trovesearch_irivalues.py new file mode 100644 index 000000000..7d40a1860 --- /dev/null +++ b/share/search/index_strategy/trovesearch_irivalues.py @@ -0,0 +1,99 @@ +import typing + +from share.search import messages +from share.search.index_strategy.elastic8 import Elastic8IndexStrategy +from share.util.checksum_iri import ChecksumIri +from . import _trovesearch_util as ts + + +class TrovesearchMentionsIndexStrategy(Elastic8IndexStrategy): + CURRENT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TrovesearchMentionsIndexStrategy', + hexdigest='...', + ) + + # abstract method from IndexStrategy + @property + def supported_message_types(self): + return { + messages.MessageType.UPDATE_INDEXCARD, + messages.MessageType.BACKFILL_INDEXCARD, + } + + # abstract method from IndexStrategy + @property + def backfill_message_type(self): + return messages.MessageType.BACKFILL_INDEXCARD + + # abstract method from Elastic8IndexStrategy + def index_settings(self): + return {} + + # abstract method from Elastic8IndexStrategy + def index_mappings(self): + return { + 'dynamic': 'false', + 'properties': { + 'iri': ts.IRI_KEYWORD_MAPPING, # include sameAs + 'indexcard_iri': ts.KEYWORD_MAPPING, + 'indexcard_pk': ts.KEYWORD_MAPPING, + 'propertypath_from_focus': ts.KEYWORD_MAPPING, + 'depth_from_focus': ts.KEYWORD_MAPPING, + # flattened properties (dynamic sub-properties with keyword values) + 'iri_by_relative_propertypath': ts.FLATTENED_MAPPING, + 'iri_by_relative_depth': ts.FLATTENED_MAPPING, + # dynamic properties (see dynamic_templates, below) + 'dynamics': { + 'type': 'object', + 'properties': { + 'text_by_relative_propertypath': {'type': 'object', 'dynamic': True}, + 'text_by_relative_depth': {'type': 'object', 'dynamic': True}, + 'date_by_relative_propertypath': {'type': 'object', 'dynamic': True}, + }, + }, + }, + 'dynamic_templates': [ + {'dynamic_text_by_path': { + 'path_match': 'dynamics.text_by_relative_propertypath.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_text_by_depth': { + 'path_match': 'dynamics.text_by_relative_depth.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_date': { + 'path_match': 
'dynamics.date_by_relative_propertypath.*',
+                'mapping': {
+                    'type': 'date',
+                    'format': 'strict_date_optional_time',
+                },
+            }},
+        ],
+    }
+
+    def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: typing.Iterable[str]):
+        if messages_chunk.message_type in (
+            messages.MessageType.UPDATE_INDEXCARD,
+            messages.MessageType.BACKFILL_INDEXCARD,
+        ):
+            self.es8_client.delete_by_query(
+                index=list(indexnames),
+                query={'terms': {'indexcard_pk': messages_chunk.target_ids_chunk}},
+            )
+
+    # abstract method from Elastic8IndexStrategy
+    def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
+        _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk)
+        for _indexcard_rdf in _indexcard_rdf_qs:
+            for _doc_id, _iri_usage_doc in self._build_iri_usage_docs(_indexcard_rdf):
+                _index_action = self.build_index_action(_doc_id, _iri_usage_doc)
+                # yield (message-target-id, action) pairs, as the sibling strategies do
+                yield _indexcard_rdf.indexcard_id, _index_action
+
+    def _build_iri_usage_docs(self, indexcard_rdf: trove_db.IndexcardRdf):
+        _graphwalk = ts.GraphWalk(
+            rdf.RdfGraph(indexcard_rdf.as_rdf_tripledict()),
+            indexcard_rdf.focus_iri,
+        )
+        # TODO: skip iris already in a static thesaurus
+        ...
diff --git a/share/search/messages.py b/share/search/messages.py
index 3eeda7204..7010a010c 100644
--- a/share/search/messages.py
+++ b/share/search/messages.py
@@ -18,11 +18,8 @@ class MessageType(enum.Enum):
     # for indexcard-based indexes:
     UPDATE_INDEXCARD = 'update-indexcard'
     BACKFILL_INDEXCARD = 'backfill-indexcard'
-    # for identifier-based indexes: (TODO: remove?)
-    IDENTIFIER_INDEXED = 'identifier-indexed'
-    BACKFILL_IDENTIFIER = 'backfill-identifier'
     # for aggregating identifier usage across index cards:
-    IDENTIFIER_USED = 'identifier-used'
+    IDENTIFIER_USAGE = 'identifier-used'
     BACKFILL_IDENTIFIER_USAGE = 'backfill-identifier-usage'
 
     @classmethod
@@ -44,9 +41,7 @@ class IntMessageType(enum.IntEnum):
     BACKFILL_SUID = 6
     UPDATE_INDEXCARD = 7
     BACKFILL_INDEXCARD = 8
-    IDENTIFIER_INDEXED = 9
-    BACKFILL_IDENTIFIER = 10
-    IDENTIFIER_USED = 11
+    IDENTIFIER_USAGE = 11
     BACKFILL_IDENTIFIER_USAGE = 12
 
 
@@ -61,7 +56,6 @@ def _enum_keys(an_enum_class):
 BACKFILL_MESSAGE_TYPES = {
     MessageType.BACKFILL_SUID,
     MessageType.BACKFILL_INDEXCARD,
-    MessageType.BACKFILL_IDENTIFIER,
     MessageType.BACKFILL_IDENTIFIER_USAGE,
 }
diff --git a/share/tasks/__init__.py b/share/tasks/__init__.py
index c78bd4ada..4c4baecbe 100644
--- a/share/tasks/__init__.py
+++ b/share/tasks/__init__.py
@@ -9,7 +9,7 @@
 from share.harvest.scheduler import HarvestScheduler
 from share import models as db
 from share.search.index_messenger import IndexMessenger
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 from share.search.messages import MessageType
 from share.tasks.jobs import HarvestJobConsumer
 from share.util.source_stat import SourceStatus
@@ -61,7 +61,7 @@ def schedule_index_backfill(self, index_backfill_pk):
     _index_backfill = db.IndexBackfill.objects.get(pk=index_backfill_pk)
     _index_backfill.pls_note_scheduling_has_begun()
     try:
-        _index_strategy = IndexStrategy.get_by_name(_index_backfill.index_strategy_name)
+        _index_strategy = index_strategy.get_index_strategy(_index_backfill.index_strategy_name)
         _messenger = IndexMessenger(celery_app=self.app, index_strategys=[_index_strategy])
         _messagetype = _index_strategy.backfill_message_type
         assert _messagetype in _index_strategy.supported_message_types
diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html
index 4a1940835..9c3e0ac27 100644
--- a/templates/admin/search-indexes.html
+++
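for reference, a minimal sketch of the strategy lookup the updated
`schedule_index_backfill` now performs (strategy name hypothetical):

    _index_strategy = index_strategy.get_index_strategy('trovesearch_indexcard')
    assert _index_strategy.backfill_message_type in _index_strategy.supported_message_types
    # celery_app as in project.celery
    _messenger = IndexMessenger(celery_app=celery_app, index_strategys=[_index_strategy])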
b/templates/admin/search-indexes.html @@ -40,7 +40,7 @@

current index: {{index {% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} - {% trans "backfill" %} + {% trans "links" %} {{ indexes.current.status.creation_date|default:"--" }} @@ -89,10 +89,11 @@

current index: {{index {% if indexes.current.backfill.backfill_admin_url %}

-          {{ indexes.current.backfill.backfill_status }}
+          {% trans "backfill" %}: {{ indexes.current.backfill.backfill_status }}

- {% else %} - -- + {% endif %} + {% if indexes.current.status.creation_date %} +

{% trans "mappings" %}

{% endif %} @@ -148,7 +149,11 @@

prior indexes

{% endif %} - {{ index_status.specific_indexname }} + {{ index_status.specific_indexname }} + {% if index_status.creation_date %} +

({% trans "mappings" %})

+ {% endif %} + {% endfor %} diff --git a/tests/_testutil.py b/tests/_testutil.py deleted file mode 100644 index 40bbc2f9f..000000000 --- a/tests/_testutil.py +++ /dev/null @@ -1,12 +0,0 @@ -from unittest import mock - - -def patch_feature_flag(*flag_names, up=True): - from share.models.feature_flag import FeatureFlag - _old_isup = FeatureFlag.objects.flag_is_up - - def _patched_isup(flag_name): - if flag_name in flag_names: - return up - return _old_isup(flag_name) - return mock.patch.object(FeatureFlag.objects, 'flag_is_up', new=_patched_isup) diff --git a/tests/api/test_elasticsearch.py b/tests/api/test_elasticsearch.py index cb2510ffe..13e6688f5 100644 --- a/tests/api/test_elasticsearch.py +++ b/tests/api/test_elasticsearch.py @@ -52,10 +52,10 @@ def test_search(self): '/api/v2/search/creativeworks/_search?q=foo', '/api/v2/search/creativeworks/_search/?q=foo', ) - with mock.patch('api.search.views.IndexStrategy') as mock_IndexStrategy: + with mock.patch('api.search.views.index_strategy') as _mock_index_strategy_module: mock_handle_search = ( - mock_IndexStrategy - .get_for_sharev2_search + _mock_index_strategy_module + .get_index_for_sharev2_search .return_value .pls_handle_search__sharev2_backcompat ) diff --git a/tests/api/test_feeds.py b/tests/api/test_feeds.py index a08cb1069..49a016664 100644 --- a/tests/api/test_feeds.py +++ b/tests/api/test_feeds.py @@ -52,7 +52,7 @@ def fake_items(self, Graph): json.loads(formatted_item) for formatted_item in formatted_items ] - with mock.patch('api.views.feeds.IndexStrategy.get_for_sharev2_search') as mock_get_for_searching: + with mock.patch('api.views.feeds.index_strategy.get_index_for_sharev2_search') as mock_get_for_searching: mock_strategy = mock_get_for_searching.return_value mock_strategy.pls_handle_search__sharev2_backcompat.return_value = { 'hits': { diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index d8f557b17..e39c6140c 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -8,6 +8,8 @@ from share.bin.util import execute_cmd import share.version +from tests.share.search import patch_index_strategies + def run_sharectl(*args): """run sharectl, assert that it returned as expected, and return its stdout @@ -39,7 +41,7 @@ def test_purge(self, indexnames): def _get_specific_index(indexname): return mock_specific_indexes[indexname] - with mock.patch('share.bin.search.IndexStrategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: + with mock.patch('share.bin.search.index_strategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: run_sharectl('search', 'purge', *indexnames) assert mock_get_specific.mock_calls == [ mock.call(indexname) @@ -49,20 +51,20 @@ def _get_specific_index(indexname): mock_specific_index.pls_delete.assert_called_once_with() def test_setup_initial(self, settings): - expected_indexes = ['baz', 'bar', 'foo'] - mock_index_strategys = [ - mock.Mock() - for _ in expected_indexes - ] - with mock.patch('share.bin.search.IndexStrategy.all_strategies', return_value=mock_index_strategys): + _expected_indexes = ['baz', 'bar', 'foo'] + _mock_index_strategys = { + _name: mock.Mock() + for _name in _expected_indexes + } + with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') - for mock_index_strategy in mock_index_strategys: + for mock_index_strategy in _mock_index_strategys.values(): mock_specific_index = mock_index_strategy.for_current_index.return_value assert 
mock_specific_index.pls_setup.mock_calls == [mock.call(skip_backfill=True)]
 
     def test_setup_index(self):
         mock_index_strategy = mock.Mock()
-        with mock.patch('share.bin.search.IndexStrategy.get_by_name', return_value=mock_index_strategy):
+        with mock.patch('share.bin.search.index_strategy.get_index_strategy', return_value=mock_index_strategy):
             run_sharectl('search', 'setup', 'foo')
         mock_current_index = mock_index_strategy.for_current_index.return_value
         assert mock_current_index.pls_setup.mock_calls == [mock.call(skip_backfill=False)]
diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py
new file mode 100644
index 000000000..fb9a98ebc
--- /dev/null
+++ b/tests/share/search/__init__.py
@@ -0,0 +1,16 @@
+import contextlib
+from unittest import mock
+
+from share.search import index_strategy
+
+
+@contextlib.contextmanager
+def patch_index_strategies(strategies: dict[str, index_strategy.IndexStrategy]):
+    index_strategy.all_index_strategies.cache_clear()
+    with mock.patch.object(
+        index_strategy,
+        'all_index_strategies',
+        return_value=strategies,
+    ):
+        yield
+    index_strategy.all_index_strategies.cache_clear()
diff --git a/tests/share/search/conftest.py b/tests/share/search/conftest.py
index 65fe44825..b87757372 100644
--- a/tests/share/search/conftest.py
+++ b/tests/share/search/conftest.py
@@ -4,35 +4,10 @@
 
 
 @pytest.fixture
-def fake_elastic_strategies(settings):
-    settings.ELASTICSEARCH = {
-        **settings.ELASTICSEARCH,
-        'INDEX_STRATEGIES': {
-            'my_es5_strategy': {
-                'CLUSTER_SETTINGS': {'URL': 'blah'},
-                'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic5.Sharev2Elastic5IndexStrategy',
-            },
-            'my_es8_strategy': {
-                'CLUSTER_SETTINGS': {'URL': 'bleh'},
-                'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy',
-            },
-            'another_es8_strategy': {
-                'CLUSTER_SETTINGS': {'URL': 'bluh'},
-                'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy',
-            },
-        },
-    }
-    return tuple(settings.ELASTICSEARCH['INDEX_STRATEGIES'].keys())
-
-
-@pytest.fixture
-def mock_elastic_clients(fake_elastic_strategies):
-    with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5') as es5_mockpackage:
-        with mock.patch('share.search.index_strategy.elastic8.elasticsearch8') as es8_mockpackage:
-            es5_mockclient = es5_mockpackage.Elasticsearch.return_value
-            es8_mockclient = es8_mockpackage.Elasticsearch.return_value
-            yield {
-                'my_es5_strategy': es5_mockclient,
-                'my_es8_strategy': es8_mockclient,
-                'another_es8_strategy': es8_mockclient,
-            }
+def mock_elastic_clients(settings):
+    # set elastic urls to non-empty but non-usable values
+    settings.ELASTICSEARCH5_URL = 'fake://bleh'
+    settings.ELASTICSEARCH8_URL = 'fake://bluh'
+    with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5'):
+        with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'):
+            yield
diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py
new file mode 100644
index 000000000..b2a2bbec5
--- /dev/null
+++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py
@@ -0,0 +1,363 @@
+from typing import Iterable, Iterator
+from datetime import date
+import itertools
+from urllib.parse import urlencode
+
+from primitive_metadata import primitive_rdf as rdf
+
+from tests import factories
+from share.search import messages
+from trove import models as trove_db
+from
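a usage sketch for the `patch_index_strategies` helper added above (names
hypothetical):

    def test_with_patched_strategies():
        _mock_strategy = mock.Mock()
        with patch_index_strategies({'my_fake_strategy': _mock_strategy}):
            assert index_strategy.all_index_strategies() == {'my_fake_strategy': _mock_strategy}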
trove.trovesearch.search_params import CardsearchParams, ValuesearchParams +from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF +from ._with_real_services import RealElasticTestCase + + +BLARG = rdf.IriNamespace('https://blarg.example/blarg/') + + +class CommonTrovesearchTests(RealElasticTestCase): + _indexcard_focus_by_uuid: dict[str, str] + + def setUp(self): + super().setUp() + self._indexcard_focus_by_uuid = {} + + def test_for_smoke_without_daemon(self): + _indexcard = self._create_indexcard( + focus_iri=BLARG.hello, + rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + ) + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id], + ) + self._assert_happypath_without_daemon( + _messages_chunk, + expected_doc_count=1, + ) + + def test_for_smoke_with_daemon(self): + _indexcard = self._create_indexcard( + focus_iri=BLARG.hello, + rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + ) + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id], + ) + self._assert_happypath_with_daemon( + _messages_chunk, + expected_doc_count=1, + ) + + def test_cardsearch(self): + self._fill_test_data_for_querying() + for _queryparams, _expected_result_iris in self.cardsearch_cases(): + _cardsearch_params = CardsearchParams.from_querystring(urlencode(_queryparams)) + assert isinstance(_cardsearch_params, CardsearchParams) + _cardsearch_response = self.current_index.pls_handle_cardsearch(_cardsearch_params) + # assumes all results fit on one page + _actual_result_iris = { + self._indexcard_focus_by_uuid[_result.card_uuid] + for _result in _cardsearch_response.search_result_page + } + self.assertEqual(_expected_result_iris, _actual_result_iris) + + def test_valuesearch(self): + self._fill_test_data_for_querying() + _valuesearch_cases = itertools.chain( + self.valuesearch_simple_cases(), + self.valuesearch_complex_cases(), + ) + for _queryparams, _expected_values in _valuesearch_cases: + _valuesearch_params = ValuesearchParams.from_querystring(urlencode(_queryparams)) + assert isinstance(_valuesearch_params, ValuesearchParams) + _valuesearch_response = self.current_index.pls_handle_valuesearch(_valuesearch_params) + # assumes all results fit on one page + _actual_values = { + _result.value_iri or _result.value_value + for _result in _valuesearch_response.search_result_page + } + self.assertEqual(_expected_values, _actual_values) + + def _fill_test_data_for_querying(self): + self._index_indexcards([ + self._create_indexcard(BLARG.a, { + BLARG.a: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, + DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('aaaa')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, + DCTERMS.references: {BLARG.b, BLARG.c}, + DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... 
nothing valued is here.', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.b: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.references: {BLARG.c}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }), + self._create_indexcard(BLARG.b, { + BLARG.b: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.b_same}, + DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.references: {BLARG.c}, + DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }), + self._create_indexcard(BLARG.c, { + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, + DCTERMS.creator: {BLARG.someone_else}, + DCTERMS.title: {rdf.literal('cccc')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, + }, + BLARG.someone_else: { + FOAF.name: {rdf.literal('some one else')}, + }, + }), + ]) + + def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + # using data from _fill_test_data_for_querying + yield ( + {'cardSearchFilter[creator]': BLARG.someone}, + {BLARG.a, BLARG.b}, + ) + yield ( + {'cardSearchFilter[creator]': ','.join((BLARG.someone_else, BLARG.someone))}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[resourceType]': BLARG.Thing}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[resourceType]': BLARG.Nothing}, + set(), + ) + yield ( + {'cardSearchFilter[references]': BLARG.b}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references]': BLARG.c}, + {BLARG.a, BLARG.b}, + ) + yield ( + {'cardSearchFilter[references.references]': BLARG.c}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references][is-present]': ''}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references.subject][is-present]': ''}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references][is-absent]': ''}, + {BLARG.c, BLARG.b}, + ) + yield ( + {'cardSearchFilter[references.references.subject][is-absent]': ''}, + {BLARG.c, BLARG.b}, + ) + yield ( + {'cardSearchFilter[subject]': BLARG.subj_ac}, + {BLARG.c, BLARG.a}, + ) + yield ( + {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, + {BLARG.b}, + ) + yield ( + { + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchFilter[creator]': BLARG.someone, + }, + {BLARG.b}, + ) + yield ( + { + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*]': 'cccc', + }, + {BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[resourceType]': ','.join((BLARG.Thing, BLARG.Another, BLARG.Nothing)), + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*,creator.name]': 'else', + }, + {BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[resourceType]': BLARG.Nothing, + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*,creator.name]': 'else', + 
}, + set(), + ) + yield ( + {'cardSearchText[*,creator.name]': 'some'}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[dateCreated]': '1999', + 'cardSearchText[*]': '', + }, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[dateCreated]': '1999-12'}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[dateCreated]': '1999-11'}, + set(), + ) + yield ( + {'cardSearchFilter[dateCreated]': '2012-12-31'}, + {BLARG.b}, + ) + yield ( + {'cardSearchFilter[dateCreated][after]': '2030'}, + set(), + ) + yield ( + {'cardSearchFilter[dateCreated][after]': '2011'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[dateCreated][before]': '2012-12'}, + {BLARG.a}, + ) + yield ( + {'cardSearchText': 'bbbb'}, + {BLARG.b}, + ) + yield ( + {'cardSearchText': '-bbbb'}, + {BLARG.a, BLARG.c}, + ) + yield ( + {'cardSearchText': 'danger'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchText': 'dangre'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchText': '"dangre"'}, + set(), + ) + yield ( + {'cardSearchText': 'danger -repulsive'}, + {BLARG.c}, + ) + yield ( + {'cardSearchText': '"nothing valued is here"'}, + {BLARG.a}, + ) + yield ( + {'cardSearchText': '"nothing valued here"'}, + set(), + ) + yield ( + {'cardSearchText': '"what is here"'}, + {BLARG.b}, + ) + + def valuesearch_simple_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + yield ( + {'valueSearchPropertyPath': 'references'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'valueSearchPropertyPath': 'dateCreated'}, + {'1999', '2012', '2024'}, + ) + # TODO: more + + def valuesearch_complex_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchFilter[resourceType]': BLARG.Thing, + }, + {BLARG.b, BLARG.c}, + ) + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchText': 'bbbb', + }, + {BLARG.b}, + ) + # TODO: more + + def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id for _indexcard in indexcards], + ) + self.assertTrue(all( + _response.is_done + for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) + )) + self.current_index.pls_refresh() + + def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: + _suid = factories.SourceUniqueIdentifierFactory() + _raw = factories.RawDatumFactory( + suid=_suid, + ) + _indexcard = trove_db.Indexcard.objects.create( + source_record_suid=_suid, + ) + # an osfmap_json card is required for indexing, but not used in these tests + trove_db.DerivedIndexcard.objects.create( + upriver_indexcard=_indexcard, + deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(TROVE['derive/osfmap_json']), + ) + trove_db.LatestIndexcardRdf.objects.create( + from_raw_datum=_raw, + indexcard=_indexcard, + focus_iri=focus_iri, + rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + turtle_checksum_iri='foo', # not enforced + ) + self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri + return _indexcard diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 3a88879e5..46f133121 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -1,15 +1,13 @@ import contextlib -import unittest from unittest import mock -from django.test import 
override_settings, TransactionTestCase -from django.conf import settings +from django.test import TransactionTestCase from django.db import connections from project.celery import app as celery_app from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy # base class for testing IndexStrategy subclasses with actual elasticsearch. @@ -19,18 +17,12 @@ class RealElasticTestCase(TransactionTestCase): serialized_rollback = True # for TransactionTestCase; restore db after # required for subclasses - strategy_name_for_real: str - strategy_name_for_test: str - - @classmethod - def setUpClass(cls): - cls.__original_es_settings = settings.ELASTICSEARCH + def get_index_strategy(self) -> index_strategy.IndexStrategy: + raise NotImplementedError(f'{self.__class__} must implement `get_index_strategy`') def setUp(self): super().setUp() self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) - self.enterContext(self._settings_for_test()) - IndexStrategy.clear_strategy_cache() self.index_strategy = self.get_index_strategy() self.index_messenger = IndexMessenger( celery_app=celery_app, @@ -43,7 +35,6 @@ def setUp(self): def tearDown(self): super().tearDown() self.current_index.pls_delete() - IndexStrategy.clear_strategy_cache() # HACK: copied from TransactionTestCase._fixture_setup; restores db # to the state from before TransactionTestCase clobbered it (relies # on how django 3.2 implements `serialized_rollback = True`, above) @@ -57,43 +48,15 @@ def enterContext(self, context_manager): self.addCleanup(lambda: context_manager.__exit__(None, None, None)) return result - def get_index_strategy(self): - return IndexStrategy.get_by_name(self.strategy_name_for_test) - @contextlib.contextmanager def _daemon_up(self): - _daemon_control = IndexerDaemonControl( - celery_app, - daemonthread_context=self._settings_for_test, # will be called in daemonthread - ) + _daemon_control = IndexerDaemonControl(celery_app) _daemon_control.start_daemonthreads_for_strategy(self.get_index_strategy()) try: yield _daemon_control finally: _daemon_control.stop_daemonthreads(wait=True) - @contextlib.contextmanager - def _settings_for_test(self): - try: - _real_strategy_settings = ( - self.__original_es_settings - ['INDEX_STRATEGIES'] - [self.strategy_name_for_real] - ) - except KeyError: - raise unittest.SkipTest( - f'index strategy "{self.strategy_name_for_real}" not configured in' - " ELASTICSEARCH['INDEX_STRATEGIES'] (perhaps missing env)" - ) - _new_es_settings = { - **self.__original_es_settings, - 'INDEX_STRATEGIES': { # wipe out all configured strategies - self.strategy_name_for_test: _real_strategy_settings, - } - } - with override_settings(ELASTICSEARCH=_new_es_settings): - yield - # for test methods on subclasses to call: def _assert_happypath_without_daemon(self, messages_chunk, expected_doc_count): _responses = list(self.index_strategy.pls_handle_messages_chunk(messages_chunk)) diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 729eab0fb..6b1618301 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -1,19 +1,20 @@ import json +import unittest + +from django.conf import settings from tests import factories from share.search import messages +from 
share.search.index_strategy.sharev2_elastic5 import Sharev2Elastic5IndexStrategy from share.util import IDObfuscator from ._with_real_services import RealElasticTestCase +@unittest.skipUnless(settings.ELASTICSEARCH5_URL, 'missing ELASTICSEARCH5_URL setting') class TestSharev2Elastic5(RealElasticTestCase): # for RealElasticTestCase - strategy_name_for_real = 'sharev2_elastic5' - strategy_name_for_test = 'test_sharev2_elastic5' - - # override method from RealElasticTestCase def get_index_strategy(self): - index_strategy = super().get_index_strategy() + index_strategy = Sharev2Elastic5IndexStrategy('test_sharev2_elastic5') if not index_strategy.STATIC_INDEXNAME.startswith('test_'): index_strategy.STATIC_INDEXNAME = f'test_{index_strategy.STATIC_INDEXNAME}' return index_strategy diff --git a/tests/share/search/index_strategy/test_sharev2_elastic8.py b/tests/share/search/index_strategy/test_sharev2_elastic8.py index 0385cece3..7b1c76845 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic8.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic8.py @@ -2,14 +2,15 @@ from tests import factories from share.search import messages +from share.search.index_strategy.sharev2_elastic8 import Sharev2Elastic8IndexStrategy from share.util import IDObfuscator from ._with_real_services import RealElasticTestCase class TestSharev2Elastic8(RealElasticTestCase): # for RealElasticTestCase - strategy_name_for_real = 'sharev2_elastic8' - strategy_name_for_test = 'test_sharev2_elastic8' + def get_index_strategy(self): + return Sharev2Elastic8IndexStrategy('test_sharev2_elastic8') def setUp(self): super().setUp() diff --git a/tests/share/search/index_strategy/test_base_index_strategy.py b/tests/share/search/index_strategy/test_strategy_selection.py similarity index 55% rename from tests/share/search/index_strategy/test_base_index_strategy.py rename to tests/share/search/index_strategy/test_strategy_selection.py index d53cd37af..a7c8f60bb 100644 --- a/tests/share/search/index_strategy/test_base_index_strategy.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -1,30 +1,40 @@ +# TODO: update import pytest from share.search.exceptions import IndexStrategyError from share.search.index_strategy import ( + all_index_strategies, + get_index_strategy, + get_specific_index, + get_index_for_sharev2_search, IndexStrategy, sharev2_elastic5, sharev2_elastic8, + trove_indexcard_flats, + trovesearch_indexcard, + trovesearch_excessive, ) @pytest.fixture -def expected_strategy_classes(fake_elastic_strategies): +def expected_strategy_classes(): return { - 'my_es5_strategy': sharev2_elastic5.Sharev2Elastic5IndexStrategy, - 'my_es8_strategy': sharev2_elastic8.Sharev2Elastic8IndexStrategy, - 'another_es8_strategy': sharev2_elastic8.Sharev2Elastic8IndexStrategy, + 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, + 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, + 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, + 'trovesearch_indexcard': trovesearch_indexcard.TrovesearchIndexcardIndexStrategy, + 'trovesearch_excessive': trovesearch_excessive.TrovesearchExcessiveIndexStrategy, } class TestBaseIndexStrategy: - def test_get_by_name(self, mock_elastic_clients, expected_strategy_classes): + def test_get_index_strategy(self, mock_elastic_clients, expected_strategy_classes): for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - index_strategy = IndexStrategy.get_by_name(strategy_name) + index_strategy = 
get_index_strategy(strategy_name) assert isinstance(index_strategy, expected_strategy_class) - def test_all_strategies(self, mock_elastic_clients, expected_strategy_classes): - all_strategys = tuple(IndexStrategy.all_strategies()) + def test_all_index_strategies(self, mock_elastic_clients, expected_strategy_classes): + all_strategys = tuple(all_index_strategies().values()) assert len(all_strategys) == len(expected_strategy_classes) strategy_names = {index_strategy.name for index_strategy in all_strategys} assert strategy_names == set(expected_strategy_classes.keys()) @@ -34,36 +44,29 @@ def test_all_strategies(self, mock_elastic_clients, expected_strategy_classes): assert issubclass(index_strategy.SpecificIndex, IndexStrategy.SpecificIndex) assert index_strategy.SpecificIndex is not IndexStrategy.SpecificIndex - def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes, fake_elastic_strategies): + def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes): for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - indexname_prefix = IndexStrategy.get_by_name(strategy_name).indexname_prefix + indexname_prefix = get_index_strategy(strategy_name).indexname_prefix specific_indexname = ''.join((indexname_prefix, 'foo')) - specific_index = IndexStrategy.get_specific_index(specific_indexname) + specific_index = get_specific_index(specific_indexname) assert isinstance(specific_index.index_strategy, expected_strategy_class) assert isinstance(specific_index, expected_strategy_class.SpecificIndex) assert specific_index.indexname == specific_indexname bad_indexname = 'foo_foo' # assumed to not start with index prefix with pytest.raises(IndexStrategyError): - IndexStrategy.get_specific_index(bad_indexname) + get_specific_index(bad_indexname) @pytest.mark.django_db - def test_get_by_request(self, mock_elastic_clients, fake_elastic_strategies): - IndexStrategy.clear_strategy_cache() - for strategy_name in mock_elastic_clients.keys(): - index_strategy = IndexStrategy.get_by_name(strategy_name) + def test_get_by_request(self, mock_elastic_clients): + for strategy_name, index_strategy in all_index_strategies().items(): good_requests = [ strategy_name, index_strategy.current_indexname, ''.join((index_strategy.indexname_prefix, 'foo')), ] for good_request in good_requests: - specific_index = IndexStrategy.get_for_sharev2_search(good_request) + specific_index = get_index_for_sharev2_search(good_request) assert isinstance(specific_index, index_strategy.SpecificIndex) assert specific_index.index_strategy is index_strategy - # bad calls: - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search('bad-request') - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search() - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search(requested_name=None) + with pytest.raises(IndexStrategyError): + get_index_for_sharev2_search('bad-request') diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py index be321a710..3cf84ec82 100644 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ b/tests/share/search/index_strategy/test_trove_indexcard_flats.py @@ -1,320 +1,9 @@ -from typing import Iterable, Iterator -from datetime import date -from urllib.parse import urlencode +from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy -from 
primitive_metadata import primitive_rdf as rdf +from . import _common_trovesearch_tests -from tests import factories -from share.search import messages -from trove import models as trove_db -from trove.trovesearch.search_params import CardsearchParams -from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF -from ._with_real_services import RealElasticTestCase - -BLARG = rdf.IriNamespace('https://blarg.example/blarg/') - - -class TestTroveIndexcardFlats(RealElasticTestCase): +class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): # for RealElasticTestCase - strategy_name_for_real = 'trove_indexcard_flats' - strategy_name_for_test = 'test_trove_indexcard_flats' - - _indexcard_focus_by_uuid: dict[str, str] - - def setUp(self): - super().setUp() - self._indexcard_focus_by_uuid = {} - - def test_for_smoke_without_daemon(self): - _indexcard = self._create_indexcard( - focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, - ) - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id], - ) - self._assert_happypath_without_daemon( - _messages_chunk, - expected_doc_count=1, - ) - - def test_for_smoke_with_daemon(self): - _indexcard = self._create_indexcard( - focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, - ) - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id], - ) - self._assert_happypath_with_daemon( - _messages_chunk, - expected_doc_count=1, - ) - - def test_cardsearch(self): - self._fill_test_data_for_querying() - for _queryparams, _expected_result_iris in self._cardsearch_cases(): - _cardsearch_params = CardsearchParams.from_querystring(urlencode(_queryparams)) - _cardsearch_response = self.current_index.pls_handle_cardsearch(_cardsearch_params) - # assumes all results fit on one page - _actual_result_iris = { - self._indexcard_focus_by_uuid[_result.card_uuid()] - for _result in _cardsearch_response.search_result_page - } - self.assertEqual(_expected_result_iris, _actual_result_iris) - - def _fill_test_data_for_querying(self): - self._index_indexcards([ - self._create_indexcard(BLARG.a, { - BLARG.a: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, - DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('aaaa')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, - DCTERMS.references: {BLARG.b, BLARG.c}, - DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.b: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.references: {BLARG.c}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, - }, - }), - self._create_indexcard(BLARG.b, { - BLARG.b: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.b_same}, - DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.references: {BLARG.c}, - DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. 
', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, - }, - }), - self._create_indexcard(BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, - DCTERMS.creator: {BLARG.someone_else}, - DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, - }, - BLARG.someone_else: { - FOAF.name: {rdf.literal('some one else')}, - }, - }), - ]) - - def _cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: - # using data from _fill_test_data_for_querying - yield ( - {'cardSearchFilter[creator]': BLARG.someone}, - {BLARG.a, BLARG.b}, - ) - yield ( - {'cardSearchFilter[creator]': ','.join((BLARG.someone_else, BLARG.someone))}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[resourceType]': BLARG.Thing}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[resourceType]': BLARG.Nothing}, - set(), - ) - yield ( - {'cardSearchFilter[references]': BLARG.b}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references]': BLARG.c}, - {BLARG.a, BLARG.b}, - ) - yield ( - {'cardSearchFilter[references.references]': BLARG.c}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references][is-present]': ''}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references.subject][is-present]': ''}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references][is-absent]': ''}, - {BLARG.c, BLARG.b}, - ) - yield ( - {'cardSearchFilter[references.references.subject][is-absent]': ''}, - {BLARG.c, BLARG.b}, - ) - yield ( - {'cardSearchFilter[subject]': BLARG.subj_ac}, - {BLARG.c, BLARG.a}, - ) - yield ( - {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, - {BLARG.b}, - ) - yield ( - { - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchFilter[creator]': BLARG.someone, - }, - {BLARG.b}, - ) - yield ( - { - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*]': 'cccc', - }, - {BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[resourceType]': ','.join((BLARG.Thing, BLARG.Another, BLARG.Nothing)), - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*,creator.name]': 'else', - }, - {BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[resourceType]': BLARG.Nothing, - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*,creator.name]': 'else', - }, - set(), - ) - yield ( - {'cardSearchText[*,creator.name]': 'some'}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[dateCreated]': '1999', - 'cardSearchText[*]': '', - }, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[dateCreated]': '1999-12'}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[dateCreated]': '1999-11'}, - set(), - ) - yield ( - {'cardSearchFilter[dateCreated]': '2012-12-31'}, - {BLARG.b}, - ) - yield ( - {'cardSearchFilter[dateCreated][after]': '2030'}, - set(), - ) - yield ( - {'cardSearchFilter[dateCreated][after]': '2011'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[dateCreated][before]': '2012-12'}, - {BLARG.a}, - ) - yield ( - {'cardSearchText': 'bbbb'}, - {BLARG.b}, - ) - yield ( - {'cardSearchText': '-bbbb'}, - {BLARG.a, BLARG.c}, - ) - yield ( - {'cardSearchText': 'danger'}, - {BLARG.b, 
BLARG.c}, - ) - yield ( - {'cardSearchText': 'dangre'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchText': '"dangre"'}, - set(), - ) - yield ( - {'cardSearchText': 'danger -repulsive'}, - {BLARG.c}, - ) - yield ( - {'cardSearchText': '"nothing valued is here"'}, - {BLARG.a}, - ) - yield ( - {'cardSearchText': '"nothing valued here"'}, - set(), - ) - yield ( - {'cardSearchText': '"what is here"'}, - {BLARG.b}, - ) - - def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id for _indexcard in indexcards], - ) - self.assertTrue(all( - _response.is_done - for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) - )) - self.current_index.pls_refresh() - - def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: - _suid = factories.SourceUniqueIdentifierFactory() - _raw = factories.RawDatumFactory( - suid=_suid, - ) - _indexcard = trove_db.Indexcard.objects.create( - source_record_suid=_suid, - ) - # an osfmap_json card is required for indexing, but not used in these tests - trove_db.DerivedIndexcard.objects.create( - upriver_indexcard=_indexcard, - deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(TROVE['derive/osfmap_json']), - ) - trove_db.LatestIndexcardRdf.objects.create( - from_raw_datum=_raw, - indexcard=_indexcard, - focus_iri=focus_iri, - rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), - turtle_checksum_iri='foo', # not enforced - ) - self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri - return _indexcard + def get_index_strategy(self): + return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') diff --git a/tests/share/search/index_strategy/test_trovesearch_excessive.py b/tests/share/search/index_strategy/test_trovesearch_excessive.py new file mode 100644 index 000000000..490be63f4 --- /dev/null +++ b/tests/share/search/index_strategy/test_trovesearch_excessive.py @@ -0,0 +1,11 @@ +import unittest + +from share.search.index_strategy.trovesearch_excessive import TrovesearchExcessiveIndexStrategy +from . import _common_trovesearch_tests + + +#@unittest.skip('wip') +class TestTrovesearchExcessive(_common_trovesearch_tests.CommonTrovesearchTests): + # for RealElasticTestCase + def get_index_strategy(self): + return TrovesearchExcessiveIndexStrategy('test_trovesearch_excessive') diff --git a/tests/share/search/index_strategy/test_trovesearch_indexcard.py b/tests/share/search/index_strategy/test_trovesearch_indexcard.py new file mode 100644 index 000000000..16d12cd55 --- /dev/null +++ b/tests/share/search/index_strategy/test_trovesearch_indexcard.py @@ -0,0 +1,15 @@ +import unittest + +from share.search.index_strategy.trovesearch_indexcard import TrovesearchIndexcardIndexStrategy +from . 
import _common_trovesearch_tests + + +#@unittest.skip('wip') +class TestTrovesearchIndexcard(_common_trovesearch_tests.CommonTrovesearchTests): + # for RealElasticTestCase + def get_index_strategy(self): + return TrovesearchIndexcardIndexStrategy('test_trovesearch_indexcard') + + # override CommonTrovesearchTests + def valuesearch_complex_cases(self): + yield from () # "complex" valuesearches are the ones this indexcard strategy can't handle diff --git a/tests/share/search/index_strategy/test_trovesearch_iri_usage.py b/tests/share/search/index_strategy/test_trovesearch_iri_usage.py new file mode 100644 index 000000000..a1b51119e --- /dev/null +++ b/tests/share/search/index_strategy/test_trovesearch_iri_usage.py @@ -0,0 +1,12 @@ +import unittest + +from share.search.index_strategy.trovesearch_irivalues import TrovesearchIrivaluesIndexStrategy +from . import _common_trovesearch_tests + + +#@unittest.skip('wip') +class TestTrovesearchIriUsage(_common_trovesearch_tests.CommonTrovesearchTests): + # for RealElasticTestCase + def get_index_strategy(self): + return TrovesearchIrivaluesIndexStrategy('test_trovesearch_irivalues') + diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index 2460f5ccc..6a1ee9a03 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -4,18 +4,18 @@ import pytest from share.models import ShareUser -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy @pytest.mark.django_db -def test_admin_search_indexes_view(fake_elastic_strategies, mock_elastic_clients): +def test_admin_search_indexes_view(mock_elastic_clients): credentials = {'username': 'test-test-test', 'password': 'password-password'} ShareUser.objects.create_superuser(**credentials) client = Client() client.login(**credentials) with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): resp = client.get('/admin/search-indexes') - for strategy_name in fake_elastic_strategies: - index_strategy = IndexStrategy.get_by_name(strategy_name) - expected_header = f'
current index: {index_strategy.current_indexname}
' + for strategy_name in index_strategy.all_index_strategies(): + _index_strategy = index_strategy.get_index_strategy(strategy_name) + expected_header = f'
current index: {_index_strategy.current_indexname}
' assert expected_header.encode() in resp.content diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py index 5d4ca9441..21005d1e9 100644 --- a/trove/models/indexcard.py +++ b/trove/models/indexcard.py @@ -7,7 +7,6 @@ from primitive_metadata import primitive_rdf as rdf from share import models as share_db # TODO: break this dependency -from share.search.index_messenger import IndexMessenger from share.util.checksum_iri import ChecksumIri from trove.exceptions import DigestiveError from trove.models.resource_identifier import ResourceIdentifier @@ -46,6 +45,7 @@ def save_indexcards_from_tripledicts( .filter(id__in=_seen_focus_identifier_ids.intersection(_focus_identifier_ids)) ) raise DigestiveError(f'duplicate focus iris: {list(_duplicates)}') + _seen_focus_identifier_ids.update(_focus_identifier_ids) _indexcards.append(_indexcard) # cards seen previously on this suid (but not this time) treated as deleted for _indexcard_to_delete in ( @@ -220,6 +220,8 @@ def pls_delete(self): .filter(upriver_indexcard=self) .delete() ) + # TODO: rearrange to avoid local import + from share.search.index_messenger import IndexMessenger IndexMessenger().notify_indexcard_update([self]) def __repr__(self): diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 14d3a6673..9c459375f 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -473,7 +473,7 @@ def to_querydict(self) -> QueryDict: class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch - valuesearch_propertypath_set: frozenset[tuple[str, ...]] + valuesearch_propertypath: tuple[str, ...] valuesearch_textsegment_set: frozenset[Textsegment] valuesearch_filter_set: frozenset[SearchFilter] @@ -485,14 +485,14 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { **super().parse_queryparams(queryparams), - 'valuesearch_propertypath_set': _parse_propertypath_set(_raw_propertypath, allow_globs=False), + 'valuesearch_propertypath': _parse_propertypath(_raw_propertypath, allow_globs=False), 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } def to_querydict(self): _querydict = super().to_querydict() - _querydict['valueSearchPropertyPath'] = propertypath_set_key(self.valuesearch_propertypath_set) + _querydict['valueSearchPropertyPath'] = propertypath_key(self.valuesearch_propertypath) for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: @@ -512,7 +512,15 @@ def valuesearch_type_iris(self): ### -# local helpers +# helper functions + +def is_globpath(path: tuple[str, ...]) -> bool: + return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) + + +def make_globpath(length: int) -> tuple[str, ...]: + return ONE_GLOB_PROPERTYPATH * length + def propertypathstep_key(pathstep: str) -> str: if pathstep == GLOB_PATHSTEP: diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py index bd7a25f28..ee09f5c0d 100644 --- a/trove/trovesearch/search_response.py +++ b/trove/trovesearch/search_response.py @@ -26,13 +26,16 @@ class TextMatchEvidence: class CardsearchResult: text_match_evidence: 
Iterable[TextMatchEvidence] card_iri: str + card_uuid: str = '' + card_pk: str = '' # TODO: use or remove - def card_uuid(self): - # card iri has the uuid at the end - return primitive_rdf.iri_minus_namespace( - self.card_iri, - namespace=trove_indexcard_namespace(), - ) + def __post_init__(self): + if not self.card_uuid: + # card iri has the uuid at the end + self.card_uuid = primitive_rdf.iri_minus_namespace( + self.card_iri, + namespace=trove_indexcard_namespace(), + ) @dataclasses.dataclass @@ -42,13 +45,12 @@ class CardsearchResponse: next_page_cursor: Optional[str] prev_page_cursor: Optional[str] first_page_cursor: Optional[str] - filtervalue_info: Iterable['ValuesearchResult'] related_propertypath_results: Iterable['PropertypathUsage'] @dataclasses.dataclass class PropertypathUsage: - property_path: tuple[str] + property_path: tuple[str, ...] usage_count: int diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 31bf85d48..e027ef7a9 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -79,13 +79,13 @@ @trovesearch_by_indexstrategy.gatherer(TROVE.propertyPath, focustype_iris={TROVE.Valuesearch}) def gather_valuesearch_propertypath(focus, *, search_params, **kwargs): - yield from _multi_propertypath_twoples(search_params.valuesearch_propertypath_set) + yield from _single_propertypath_twoples(search_params.valuesearch_propertypath) @trovesearch_by_indexstrategy.gatherer(TROVE.valueSearchFilter) def gather_valuesearch_filter(focus, *, search_params, **kwargs): for _filter in search_params.valuesearch_filter_set: - yield (TROVE.valueSearchFilter, _filter_as_blanknode(_filter, {})) + yield (TROVE.valueSearchFilter, _filter_as_blanknode(_filter)) @trovesearch_by_indexstrategy.gatherer( @@ -131,13 +131,9 @@ def gather_cardsearch(focus, *, specific_index, search_params, **kwargs): ] if _relatedproperty_list: yield (TROVE.relatedPropertyList, sequence(_relatedproperty_list)) - # filter-values from search params, with any additional info - _valueinfo_by_iri = {} - for _filtervalue in _cardsearch_resp.filtervalue_info: - _value_info = _valuesearch_result_as_json(_filtervalue) - _valueinfo_by_iri[_filtervalue.value_iri] = _value_info + # filter-values from search params for _filter in search_params.cardsearch_filter_set: - yield (TROVE.cardSearchFilter, _filter_as_blanknode(_filter, _valueinfo_by_iri)) + yield (TROVE.cardSearchFilter, _filter_as_blanknode(_filter)) @trovesearch_by_indexstrategy.gatherer( @@ -261,7 +257,7 @@ def gather_card(focus, *, deriver_iri, **kwargs): ### # local helpers -def _filter_as_blanknode(search_filter, valueinfo_by_iri) -> frozenset: +def _filter_as_blanknode(search_filter) -> frozenset: _filter_twoples = [ (TROVE.filterType, search_filter.operator.value), *_multi_propertypath_twoples(search_filter.propertypath_set), @@ -269,10 +265,7 @@ def _filter_as_blanknode(search_filter, valueinfo_by_iri) -> frozenset: if not search_filter.operator.is_valueless_operator(): for _value in search_filter.value_set: if search_filter.operator.is_iri_operator(): - _valueinfo = ( - valueinfo_by_iri.get(_value) - or _osfmap_or_unknown_iri_as_json(_value) - ) + _valueinfo = _osfmap_or_unknown_iri_as_json(_value) else: _valueinfo = literal_json({'@value': _value}) _filter_twoples.append((TROVE.filterValue, _valueinfo)) diff --git a/trove/views/search.py b/trove/views/search.py index c303eb5b0..4173fd5e7 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -4,7 +4,7 @@ 
from django.views import View from primitive_metadata import gather -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy from trove import exceptions as trove_exceptions from trove.trovesearch.search_params import ( CardsearchParams, @@ -79,7 +79,7 @@ def _parse_request(request: http.HttpRequest, renderer, search_params_dataclass) _search_params = search_params_dataclass.from_querystring( request.META['QUERY_STRING'], ) - _specific_index = IndexStrategy.get_for_trove_search(_search_params.index_strategy_name) + _specific_index = index_strategy.get_index_for_trovesearch(_search_params) # TODO: 404 for unknown strategy _search_gathering = trovesearch_by_indexstrategy.new_gathering({ 'search_params': _search_params, From 30d30e44ea8a10c8906a96810272b4d17cafb592 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 22 Oct 2024 14:24:10 -0400 Subject: [PATCH 02/14] wip (removing unneeded changes) --- _TODO.txt | 171 ---- share/admin/search.py | 1 - share/models/feature_flag.py | 1 - share/search/index_strategy/__init__.py | 14 +- .../index_strategy/trovesearch_excessive.py | 93 -- .../index_strategy/trovesearch_indexcard.py | 858 ------------------ .../index_strategy/trovesearch_irivalues.py | 99 -- share/search/messages.py | 6 - templates/admin/search-indexes.html | 7 +- tests/share/search/__init__.py | 2 - .../search/index_strategy/test_elastic8.py | 8 +- .../index_strategy/test_strategy_selection.py | 4 - .../test_trovesearch_excessive.py | 11 - .../test_trovesearch_indexcard.py | 15 - .../test_trovesearch_iri_usage.py | 12 - 15 files changed, 8 insertions(+), 1294 deletions(-) delete mode 100644 _TODO.txt delete mode 100644 share/search/index_strategy/trovesearch_excessive.py delete mode 100644 share/search/index_strategy/trovesearch_indexcard.py delete mode 100644 share/search/index_strategy/trovesearch_irivalues.py delete mode 100644 tests/share/search/index_strategy/test_trovesearch_excessive.py delete mode 100644 tests/share/search/index_strategy/test_trovesearch_indexcard.py delete mode 100644 tests/share/search/index_strategy/test_trovesearch_iri_usage.py diff --git a/_TODO.txt b/_TODO.txt deleted file mode 100644 index caac45940..000000000 --- a/_TODO.txt +++ /dev/null @@ -1,171 +0,0 @@ -using trove for a dashboard of metrics -====================================== - -on frontend... - -a dashboard has a consistent `cardSearchFilter` set - -for each metadata property (or property-path) of interest, -make a request to `/trove/index-value-search` with that `valueSearchPropertyPath` -and the dashboard's `cardSearchFilter` set - - - -denormalized IndexStrategy -========================== - -current mappings: - simple: - indexcard_uuid - focus_iri - suffuniq_focus_iri - source_record_identifier - source_config_label - iri_paths_present - iri_paths_present_suffuniq - flattened: - flat_iri_values - flat_iri_values_suffuniq - nested: (THE PROBLEM) - nested_iri... - nested_date... - nested_text... - - -to denormalize for performance (removing (most) `nested` mappings) -while supporting existing api used by osf-search... - -edges to consider: -- `cardSearchText[property.path]` - - dynamic template for text values per property-path (...to limited depth?) -- `valueSearchFilter[resourceType]` - - dynamic template for iri values per resource-type? -- `valueSearchText` - - ...new index for value-search? - - ...maybe can use the same dynamic fields added for `cardSearchText[property.path]`? - ...but how to keep the text associated with the iri value... 
- - ...could keep the old `nested` garbage around, but only use it when `valueSearchText`? -- `cardSearchFilter[sameAs][iri-prefix]=https://orcid.org/` - - new filter operator -- `cardSearchText[*.*.*]`, `cardSearchFilter[*.*.*]` - - dynamic templates for values by depth? - - -possible future card-index mappings: - simple: - indexcard_uuid - suid.source_config_label - suid.source_record_identifier - focus_iri.exact - focus_iri.suffuniq - propertypaths_present - flattened: - iri_by_propertypath.exact.* - iri_by_propertypath.suffuniq.* - iri_by_propertypath_length.exact.* - iri_by_propertypath_length.suffuniq.* - dynamic: (used instead of the old nested fields for most queries) - dynamics.text_by_propertypath.* - dynamics.text_by_propertypath_length.* - dynamics.date_by_propertypath.* - (maybe) dynamics.number_by_propertypath.* - nested: (ONLY for index-value-search with `valueSearchText` or `valueSearchFilter[resourceType]`) - iri_usage - iri.exact - iri.suffuniq - propertypath - propertypath_length - type_iri.exact - type_iri.suffuniq - name_text - title_text - label_text - namelike_text (combined three) - - -multiple strategies? -==================== -after reluctantly accepting `nested` for certain value-searches... how about multiple index strategies? - -select suitable index-strategy based on query - -most queries go to a more constrained index-strategy with a smaller, faster, -completely non-nested index (calling it "trovesearch_indexcard") - -queries that need the extra complexity go to a more complex index-strategy -with larger, slower index (calling it "trovesearch_excessive") - -however... even simple value-searches need to get metadata about each iri value -(at least `rdf:type` and something name-like (`dcterms:title`, `foaf:name`, `rdfs:label`...)) --- without the `nested` mapping, there's not a good way (that i see) to do that in a single query - -so how about a third index strategy just for looking up iri-value metadata? -(calling it "trovesearch_irivalues") - - -trovesearch_indexcard (one per indexcard): - simple: - indexcard_iri - indexcard_pk - suid.source_config_label - suid.source_record_identifier - focus_iri.exact - focus_iri.suffuniq - propertypaths_present - flattened: - iri_by_propertypath.* - iri_by_depth.* - dynamic: - dynamics.text_by_propertypath.* - dynamics.text_by_depth.* - dynamics.date_by_propertypath.* - - -trovesearch_irivalues (one per (indexcard, iri) pair) - simple: - iri.exact (includes sameAs synonyms) - iri.suffuniq (includes sameAs synonyms) - indexcard_iri - indexcard_pk - propertypath_from_focus - depth_from_focus - flattened: - iri_by_relative_propertypath.* - iri_by_relative_depth.* - dynamic: - dynamics.text_by_relative_propertypath.* - dynamics.text_by_relative_depth.* - dynamics.date_by_relative_propertypath.* - - -trovesearch_excessive: - (all fields from trovesearch_indexcard, plus a nested field with - fields from (or similar to) trovesearch_irivalues) - - -...ok maybe, but revisiting "trovesearch_irivalues (one per (indexcard, iri) pair)", -that's a looot of documents, and awful wasteful for the common case of commonly used iris, -and trickier to remove docs for iri values no longer used - -returning to an old idea discarded from the first "index-card-search" implementation... -how about an index with (only) one doc per referenced iri? 
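(a quick sketch of one such per-iri doc; `_propertypaths_to` is a hypothetical walk helper, `propertypath_as_keyword` the existing trovesearch one:)

    def _iri_usage_doc(iri, indexcards):
        # aggregate, across every indexcard that mentions `iri`,
        # the propertypaths it was used at
        # (cf. `used_at_propertypath` in the revised mapping below)
        return {
            'iri': iri,
            'used_at_propertypath': sorted({
                propertypath_as_keyword(_path)
                for _indexcard in indexcards
                for _path in _propertypaths_to(_indexcard, iri)  # hypothetical
            }),
        }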
would need to: -- use IDENTIFIER_USAGE/BACKFILL_IDENTIFIER_USAGE messages - emit after non-backfill indexcard indexing, perhaps deduped within each message chunk -- index strategy should, for each identifier message: - query for indexcards that include that identifier, - aggregate metadata included in those indexcards about that identifier, - store document describing that identifier and its usage - -important to account for erroneous sameAs assertions (make it easy to undo) - -revised trovesearch_irivalues (one per iri) - simple: - iri - used_at_propertypath - flattened: - iri_by_relative_propertypath.* - iri_by_relative_depth.* - dynamic: - dynamics.text_by_relative_propertypath.* - dynamics.text_by_relative_depth.* - dynamics.date_by_relative_propertypath.* diff --git a/share/admin/search.py b/share/admin/search.py index ea8a254e9..fbf2446b0 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -48,7 +48,6 @@ def _search_url_prefix(): def _mappings_url_prefix(): - # return reverse('admin:search-index-mappings', kwargs={'index_name': ''}) return '/admin/search-index-mappings/' diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index f2f9c57cc..df0903122 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -31,7 +31,6 @@ class FeatureFlag(models.Model): IGNORE_SHAREV2_INGEST = 'ignore_sharev2_ingest' SUGGEST_CREATOR_FACET = 'suggest_creator_facet' FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' - TROVESEARCH_POLYSTRAT = 'trovesearch_polystrat' # name _should_ be one of the constants above, but that is not enforced by `choices` name = models.TextField(unique=True) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 2ea608bd1..a9330b71d 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -10,9 +10,6 @@ from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy -from .trovesearch_indexcard import TrovesearchIndexcardIndexStrategy -from .trovesearch_irivalues import TrovesearchIrivaluesIndexStrategy -from .trovesearch_excessive import TrovesearchExcessiveIndexStrategy from ._base import IndexStrategy @@ -40,9 +37,6 @@ def _iter_all_index_strategies(): if settings.ELASTICSEARCH8_URL: yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8') yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats') - yield TrovesearchIndexcardIndexStrategy(name='trovesearch_indexcard') - yield TrovesearchIrivaluesIndexStrategy(name='trovesearch_irivalues') - yield TrovesearchExcessiveIndexStrategy(name='trovesearch_excessive') def get_index_strategy(strategyname: str) -> IndexStrategy: @@ -87,12 +81,6 @@ def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificI def get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: if params.index_strategy_name: # specific strategy requested _name = params.index_strategy_name - elif not FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_POLYSTRAT): - _name = 'trove_indexcard_flats' else: - _name = ( - 'trovesearch_indexcard' - if TrovesearchIndexcardIndexStrategy.works_with_params(params) - else 'trovesearch_excessive' - ) + _name = 'trove_indexcard_flats' return get_specific_index(_name, for_search=True) diff --git a/share/search/index_strategy/trovesearch_excessive.py 
b/share/search/index_strategy/trovesearch_excessive.py deleted file mode 100644 index 1cb6ae6b2..000000000 --- a/share/search/index_strategy/trovesearch_excessive.py +++ /dev/null @@ -1,93 +0,0 @@ -import typing - -from primitive_metadata import primitive_rdf as rdf - -from share.util.checksum_iri import ChecksumIri - -from . import _trovesearch_util as ts -from .trovesearch_indexcard import TrovesearchIndexcardIndexStrategy as IndexcardStrategy - - -class TrovesearchExcessiveIndexStrategy(IndexcardStrategy): - '''a more complicated version of the "indexcard" trovesearch strategy - - for `index-value-search` queries that the flatter index can't handle - ''' - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TrovesearchExcessiveIndexStrategy', - hexdigest='...', - ) - - # override TrovesearchIndexcardIndexStrategy - def index_mappings(self): - _mappings = super().index_mappings() - _namelike_text_mapping = { - **ts.TEXT_MAPPING, - 'fields': {'keyword': ts.KEYWORD_MAPPING}, - 'copy_to': 'iri_usage.namelike_text', - } - # add nested properties - # (warning: SLOW, use only when needed (and do be sure to question that need)) - _mappings['properties']['iri_usage'] = { - 'type': 'nested', - 'properties': { - 'iri': ts.IRI_KEYWORD_MAPPING, # include sameAs - 'propertypath_from_focus': ts.KEYWORD_MAPPING, - 'depth_from_focus': ts.KEYWORD_MAPPING, - # flattened properties (dynamic sub-properties with keyword values) - 'relative_iri_by_propertypath': ts.FLATTENED_MAPPING, - 'relative_iri_by_depth': ts.FLATTENED_MAPPING, - # text properties (only a few) - 'name_text': _namelike_text_mapping, - 'title_text': _namelike_text_mapping, - 'label_text': _namelike_text_mapping, - 'namelike_text': {'type': 'text'}, - }, - } - return _mappings - - # override TrovesearchIndexcardIndexStrategy - class _SourcedocBuilder(IndexcardStrategy._SourcedocBuilder): - # override TrovesearchIndexcardIndexStrategy._SourcedocBuilder - def build(self): - _sourcedoc = super().build() - _sourcedoc['iri_usage'] = self._nested_iri_usages() - return _sourcedoc - - def _nested_iri_usages(self) -> list: - return list(filter(bool, ( - self._iri_usage_sourcedoc(_iri, _paths) - for _iri, _paths in self._fullwalk.paths_by_iri.items() - ))) - - def _iri_usage_sourcedoc(self, iri: str, paths: set[ts.Propertypath]) -> dict | None: - _shortwalk = self._fullwalk.shortwalk(iri) - return { - 'iri': self._exact_and_suffuniq_iris([iri], _shortwalk), - 'propertypath_from_focus': list(map(ts.propertypath_as_keyword, paths)), - 'depth_from_focus': list(map(len, paths)), - 'iri_by_propertypath': self._iris_by_propertypath(_shortwalk), - 'iri_by_depth': self._iris_by_depth(_shortwalk), - 'dynamics': { - 'text_by_propertypath': self._texts_by_propertypath(_shortwalk), - 'text_by_depth': self._texts_by_depth(_shortwalk), - 'date_by_propertypath': self._dates_by_propertypath(_shortwalk), - }, - } - - def _gather_text_values(self, focus_iri, pathset) -> typing.Iterator[str]: - for _obj in self.rdfdoc.q(focus_iri, pathset): - if isinstance(_obj, rdf.Literal): - yield _obj.unicode_value - - # override TrovesearchIndexcardIndexStrategy - class _ValuesearchQueryBuilder(IndexcardStrategy._ValuesearchQueryBuilder): - ... 
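    # note: `_additional_cardsearch_filters` is spliced into the `filter` clause
    # of `_cardsearch_query` (see the indexcard strategy below), so the
    # `propertypaths_present` term returned here restricts a value-search to
    # cards that use the searched propertypath at all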
- - # override _CardsearchQueryBuilder - def _additional_cardsearch_filters(self) -> list[dict]: - # TODO: consider - return [{'term': {'propertypaths_present': ts.propertypath_as_keyword( - self.params.valuesearch_propertypath - )}}] diff --git a/share/search/index_strategy/trovesearch_indexcard.py b/share/search/index_strategy/trovesearch_indexcard.py deleted file mode 100644 index 89ee0f2a0..000000000 --- a/share/search/index_strategy/trovesearch_indexcard.py +++ /dev/null @@ -1,858 +0,0 @@ -from __future__ import annotations -import base64 -from collections import abc, defaultdict -import dataclasses -import functools -import json -import logging -import re -from typing import Iterable, ClassVar, Iterator - -from django.conf import settings -import elasticsearch8 -from primitive_metadata import primitive_rdf as rdf - -from share.search import exceptions -from share.search import messages -from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass -from share.util.checksum_iri import ChecksumIri -from trove import models as trove_db -from trove.trovesearch.search_params import ( - CardsearchParams, - ValuesearchParams, - SearchFilter, - Textsegment, - PageParam, - is_globpath, -) -from trove.trovesearch.search_response import ( - CardsearchResponse, - ValuesearchResponse, - TextMatchEvidence, - CardsearchResult, - ValuesearchResult, - PropertypathUsage, -) -from trove.vocab.osfmap import is_date_property -from trove.vocab.namespaces import TROVE -from . import _trovesearch_util as ts - - -logger = logging.getLogger(__name__) - - -class TrovesearchIndexcardIndexStrategy(Elastic8IndexStrategy): - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TrovesearchIndexcardIndexStrategy', - hexdigest='...', - ) - - @classmethod - def works_with_params(cls, params: CardsearchParams): - return ( - not isinstance(params, ValuesearchParams) - or ( # constraints on valuesearch: - not params.valuesearch_textsegment_set - and all( - _filter.is_sameas_filter() - for _filter in params.valuesearch_filter_set - ) - ) - ) - - # abstract method from IndexStrategy - @property - def supported_message_types(self): - return { - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - } - - # abstract method from IndexStrategy - @property - def backfill_message_type(self): - return messages.MessageType.BACKFILL_INDEXCARD - - # abstract method from Elastic8IndexStrategy - def index_settings(self): - return {} - - # abstract method from Elastic8IndexStrategy - def index_mappings(self): - return { - 'dynamic': 'false', - 'properties': { - # simple keyword properties - 'indexcard_iri': ts.KEYWORD_MAPPING, - 'indexcard_pk': ts.KEYWORD_MAPPING, - 'suid': { - 'type': 'object', - 'properties': { - 'source_config_label': ts.KEYWORD_MAPPING, - 'source_record_identifier': ts.KEYWORD_MAPPING, - }, - }, - 'focus_iri': ts.IRI_KEYWORD_MAPPING, - 'propertypaths_present': ts.KEYWORD_MAPPING, - # flattened properties (dynamic sub-properties with keyword values) - 'iri_by_propertypath': ts.FLATTENED_MAPPING, - 'iri_by_depth': ts.FLATTENED_MAPPING, - # dynamic properties (see dynamic_templates, below) - 'dynamics': { - 'type': 'object', - 'properties': { - 'text_by_propertypath': {'type': 'object', 'dynamic': True}, - 'text_by_depth': {'type': 'object', 'dynamic': True}, - 'date_by_propertypath': {'type': 'object', 'dynamic': True}, - }, - }, - }, - 
'dynamic_templates': [ - {'dynamic_text_by_path': { - 'path_match': 'dynamics.text_by_propertypath.*', - 'mapping': ts.TEXT_MAPPING, - }}, - {'dynamic_text_by_depth': { - 'path_match': 'dynamics.text_by_depth.*', - 'mapping': ts.TEXT_MAPPING, - }}, - {'dynamic_date': { - 'path_match': 'dynamics.date_by_propertypath.*', - 'mapping': { - 'type': 'date', - 'format': 'strict_date_optional_time', - }, - }}, - ], - } - - # abstract method from Elastic8IndexStrategy - def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - _indexcard_rdf_qs = ( - ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - .select_related('indexcard__source_record_suid__source_config') - ) - _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - _docbuilder = self._SourcedocBuilder(_indexcard_rdf) - if not _docbuilder.should_skip(): # if skipped, will be deleted - _indexcard_pk = _indexcard_rdf.indexcard_id - _index_action = self.build_index_action( - doc_id=str(_indexcard_pk), - doc_source=_docbuilder.build(), - ) - _remaining_indexcard_pks.discard(_indexcard_pk) - yield _indexcard_pk, _index_action - # delete any that don't have "latest" rdf and derived osfmap_json - for _indexcard_pk in _remaining_indexcard_pks: - yield _indexcard_pk, self.build_delete_action(_indexcard_pk) - - ### - # implement abstract IndexStrategy.SpecificIndex - - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - return self.index_strategy.es8_client.search( - index=self.indexname, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) - - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse: - _querybuilder = self.index_strategy._CardsearchQueryBuilder(cardsearch_params) - _search_kwargs = _querybuilder.build() - _cursor = _querybuilder.cardsearch_cursor - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - source=False, # no need to get _source, identifiers are enough - docvalue_fields=['indexcard_iri'], - highlight={ # TODO: only one field gets highlighted? 
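                    # ('fields' accepts a wildcard pattern and es8 groups returned
                    #  snippets per concrete field, so more than one field can yield
                    #  highlights; 'require_field_match': False also allows highlights
                    #  from fields the query did not directly match)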
- 'require_field_match': False, - 'fields': {'dynamics.text_by_propertypath.*': {}}, - }, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._cardsearch_response(cardsearch_params, _es8_response, _cursor) - - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: - _querybuilder = self.index_strategy._ValuesearchQueryBuilder(valuesearch_params) - _search_kwargs = _querybuilder.build() - _cursor = _querybuilder.valuesearch_cursor - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._valuesearch_response(valuesearch_params, _es8_response, _cursor) - - ### - # building sourcedocs - - @dataclasses.dataclass - class _SourcedocBuilder: - '''build an elasticsearch sourcedoc for an rdf document - ''' - indexcard_rdf: trove_db.IndexcardRdf - indexcard: trove_db.Indexcard = dataclasses.field(init=False) - rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) - focus_iri: str = dataclasses.field(init=False) - - def __post_init__(self) -> None: - self.indexcard = self.indexcard_rdf.indexcard - self.rdfdoc = rdf.RdfGraph(self.indexcard_rdf.as_rdf_tripledict()) - self.focus_iri = self.indexcard_rdf.focus_iri - - def should_skip(self) -> bool: - _suid = self.indexcard.source_record_suid - return ( - # skip cards that belong to an obsolete suid with a later duplicate - _suid.has_forecompat_replacement() - # ...or that are without some value for name/title/label - or not any(self.rdfdoc.q(self.focus_iri, ts.NAMELIKE_PROPERTIES)) - ) - - def build(self) -> dict: - _sourcedoc = { - 'indexcard_iri': self.indexcard.get_iri(), - 'indexcard_pk': str(self.indexcard.pk), - 'suid': { - 'source_record_identifier': self.indexcard.source_record_suid.identifier, - 'source_config_label': self.indexcard.source_record_suid.source_config.label, - }, - 'focus_iri': self._exact_and_suffuniq_iris([self.focus_iri], self._fullwalk), - 'propertypaths_present': self._propertypaths_present(self._fullwalk), - 'iri_by_propertypath': self._iris_by_propertypath(self._fullwalk), - 'iri_by_depth': self._iris_by_depth(self._fullwalk), - 'dynamics': { - 'text_by_propertypath': self._texts_by_propertypath(self._fullwalk), - 'text_by_depth': self._texts_by_depth(self._fullwalk), - 'date_by_propertypath': self._dates_by_propertypath(self._fullwalk), - }, - } - return _sourcedoc - - @functools.cached_property - def _fullwalk(self) -> ts.GraphWalk: - return ts.GraphWalk(self.rdfdoc, self.focus_iri) - - def _propertypaths_present(self, walk: ts.GraphWalk): - return [ - ts.propertypath_as_keyword(_path) - for _path in walk.paths_walked - ] - - def _iris_by_propertypath(self, walk: ts.GraphWalk): - return { - ts.propertypath_as_field_name(_path): ts.suffuniq_iris(walk.iris_synonyms(_iris)) - for _path, _iris in walk.iri_values.items() - } - - def _iris_by_depth(self, walk: ts.GraphWalk): - _by_depth: dict[int, set[str]] = defaultdict(set) - for _path, _iris in walk.iri_values.items(): - _by_depth[len(_path)].update(_iris) - return { - _depth_field_name(_depth): ts.suffuniq_iris(walk.iris_synonyms(_iris)) - for _depth, _iris in 
_by_depth.items() - } - - def _texts_by_propertypath(self, walk: ts.GraphWalk): - return { - ts.propertypath_as_field_name(_path): list(_value_set) - for _path, _value_set in walk.text_values.items() - } - - def _texts_by_depth(self, walk: ts.GraphWalk): - _by_depth: dict[int, set[str]] = defaultdict(set) - for _path, _value_set in walk.text_values.items(): - _by_depth[len(_path)].update(_value_set) - return { - _depth_field_name(_depth): list(_value_set) - for _depth, _value_set in _by_depth.items() - } - - def _dates_by_propertypath(self, walk: ts.GraphWalk): - return { - ts.propertypath_as_field_name(_path): [ - _date.isoformat() - for _date in _value_set - ] - for _path, _value_set in walk.date_values.items() - } - - def _exact_and_suffuniq_iris(self, iris: Iterable[str], walk: ts.GraphWalk): - _synonyms = walk.iris_synonyms(iris) - return { - 'exact': list(_synonyms), - 'suffuniq': ts.suffuniq_iris(_synonyms), - } - - ### - # building queries - - @dataclasses.dataclass - class _CardsearchQueryBuilder: - params: CardsearchParams - - def build(self): - return { - 'query': self._cardsearch_query(), - 'aggs': self._cardsearch_aggs(), - 'sort': list(self._cardsearch_sorts()) or None, - 'from_': self.cardsearch_cursor.cardsearch_start_index(), - 'size': self.cardsearch_cursor.page_size, - } - - @functools.cached_property - def cardsearch_cursor(self): - return _CardsearchCursor.from_cardsearch_params(self.params) - - @property - def relevance_matters(self) -> bool: - return not self.cardsearch_cursor.random_sort - - def _cardsearch_query(self) -> dict: - _bool_query = { - 'filter': self._additional_cardsearch_filters(), - 'must': [], - 'must_not': [], - 'should': [], - } - # iri-keyword filters - for _searchfilter in self.params.cardsearch_filter_set: - if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - # text-based queries - for _boolkey, _textquery in self._cardsearch_text_boolparts(): - _bool_query[_boolkey].append(_textquery) - return self._wrap_bool_query(_bool_query) - - def _wrap_bool_query(self, bool_query_innards) -> dict: - # note: may modify bool_query_innards in-place - _cursor = self.cardsearch_cursor - if not _cursor or not _cursor.random_sort: - # no need for randomness - return {'bool': bool_query_innards} - if not _cursor.first_page_pks: - # independent random sample - return { - 'function_score': { - 'query': {'bool': bool_query_innards}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_filter = {'terms': {'indexcard_pk': _cursor.first_page_pks}} - if _cursor.is_first_page(): - # returning to a first page previously visited - bool_query_innards['filter'].append(_firstpage_filter) - return {'bool': bool_query_innards} - # get a subsequent page using reproducible 
randomness - bool_query_innards['must_not'].append(_firstpage_filter) - return { - 'function_score': { - 'query': {'bool': bool_query_innards}, - 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(_cursor.first_page_pks), - 'field': 'indexcard_pk', - }, - }, - } - - def _additional_cardsearch_filters(self) -> list[dict]: - return [] # for overriding - - def _cardsearch_aggs(self): - _aggs = {} - if self.params.related_property_paths: - _aggs['agg_related_propertypath_usage'] = {'terms': { - 'field': 'propertypaths_present', - 'include': [ - ts.propertypath_as_keyword(_path) - for _path in self.params.related_property_paths - ], - 'size': len(self.params.related_property_paths), - }} - return _aggs - - def _cardsearch_presence_query(self, search_filter) -> dict: - return _any_query([ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ]) - - def _cardsearch_path_presence_query(self, path: ts.Propertypath): - return {'term': {'propertypaths_present': ts.propertypath_as_keyword(path)}} - - def _cardsearch_iri_filter(self, search_filter) -> dict: - _iris = ts.suffuniq_iris(search_filter.value_set) - return _any_query([ - self._cardsearch_path_iri_query(_path, _iris) - for _path in search_filter.propertypath_set - ]) - - def _cardsearch_path_iri_query(self, path, suffuniq_iris): - _field = ( - f'iri_by_propertypath.{ts.propertypath_as_field_name(path)}' - if not is_globpath(path) - else f'iri_by_depth.{_depth_field_name(len(path))}' - ) - return {'terms': {_field: suffuniq_iris}} - - def _cardsearch_date_filter(self, search_filter): - return _any_query([ - self._date_filter_for_path(_path, search_filter.operator, search_filter.value_set) - for _path in search_filter.propertypath_set - ]) - - def _date_filter_for_path(self, path, filter_operator, value_set): - _field = f'dynamics.date_by_propertypath.{ts.propertypath_as_field_name(path)}' - if filter_operator == SearchFilter.FilterOperator.BEFORE: - _value = min(value_set) # rely on string-comparable isoformat - return {'range': {_field: {'lt': _daterange_value(_value)}}} - elif filter_operator == SearchFilter.FilterOperator.AFTER: - _value = max(value_set) # rely on string-comparable isoformat - return {'range': {_field: {'gt': _daterange_value(_value)}}} - elif filter_operator == SearchFilter.FilterOperator.AT_DATE: - return _any_query([ - {'range': {_field: {'gte': _filtervalue, 'lte': _filtervalue}}} - for _filtervalue in map(_daterange_value, value_set) - ]) - else: - raise ValueError(f'invalid date filter operator (got {filter_operator})') - - def _cardsearch_sorts(self): - for _sortparam in self.params.sort_list: - _pathfield = ts.propertypath_as_field_name((_sortparam.property_iri,)) - _fieldpath = f'dynamics.date_by_propertypath.{_pathfield}' - _order = 'desc' if _sortparam.descending else 'asc' - yield {_fieldpath: _order} - - def _cardsearch_text_boolparts(self) -> Iterator[tuple[str, dict]]: - for _textsegment in self.params.cardsearch_textsegment_set: - if _textsegment.is_negated: - yield 'must_not', self._exact_text_query(_textsegment) - elif not _textsegment.is_fuzzy: - yield 'must', self._exact_text_query(_textsegment) - else: - yield 'must', self._fuzzy_text_must_query(_textsegment) - if self.relevance_matters: - yield 'should', self._fuzzy_text_should_query(_textsegment) - - def _text_field_name(self, propertypath: ts.Propertypath): - return ( - f'dynamics.text_by_propertypath.{ts.propertypath_as_field_name(propertypath)}' - if not is_globpath(propertypath) - else 
f'dynamics.text_by_depth.{_depth_field_name(len(propertypath))}' - ) - - def _exact_text_query(self, textsegment: Textsegment) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} - for _path in textsegment.propertypath_set - ]) - - def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match': { - self._text_field_name(_path): { - 'query': textsegment.text, - 'fuzziness': 'AUTO', - # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - for _path in textsegment.propertypath_set - ]) - - def _fuzzy_text_should_query(self, textsegment: Textsegment): - _slop = len(textsegment.text.split()) - return _any_query([ - {'match_phrase': { - self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop}, - }} - for _path in textsegment.propertypath_set - ]) - - class _ValuesearchQueryBuilder(_CardsearchQueryBuilder): - params: ValuesearchParams - - # override _CardsearchQueryBuilder - def build(self): - if self._is_date_valuesearch(): - _aggs = self._valuesearch_date_aggs() - else: - _aggs = self._valuesearch_iri_aggs() - return dict( - query=self._cardsearch_query(), - size=0, # ignore cardsearch hits; just want the aggs - aggs=_aggs, - ) - - @functools.cached_property - def valuesearch_cursor(self): - return _SimpleCursor.from_page_param(self.params.page) - - # override _CardsearchQueryBuilder - @property - def relevance_matters(self) -> bool: - return False # valuesearch always ordered by count - - def _is_date_valuesearch(self) -> bool: - return is_date_property(self.params.valuesearch_propertypath[-1]) - - def _valuesearch_iri_aggs(self): - _propertypath = self.params.valuesearch_propertypath - _field = f'iri_by_propertypath.{ts.propertypath_as_field_name(_propertypath)}' - _terms_agg: dict = {'field': _field} - _specific_iris = list(set(self.params.valuesearch_iris())) - if _specific_iris: - _terms_agg['include'] = _specific_iris - _terms_agg['size'] = len(_specific_iris) - return {'agg_valuesearch_iris': {'terms': _terms_agg}} - - def _valuesearch_date_aggs(self): - _propertypath = self.params.valuesearch_propertypath - _field = f'date_by_propertypath.{ts.propertypath_as_field_name(_propertypath)}' - _aggs = { - 'agg_valuesearch_dates': { - 'date_histogram': { - 'field': _field, - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, - }, - } - return _aggs - - ### - # normalizing search responses - - def _valuesearch_response( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: '_SimpleCursor', - ) -> ValuesearchResponse: - _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') - if _iri_aggs: - _buckets = _iri_aggs['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_index + cursor.page_size - _bucket_page = _buckets[cursor.start_index:_page_end_index] # discard prior pages - cursor.result_count = ( - -1 # "many more" - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchResponse( - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - 
first_page_cursor=cursor.first_cursor(), - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations'] - ['agg_valuesearch_dates'] - ['buckets'] - ) - return ValuesearchResponse( - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - ) - - def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: - return ValuesearchResult( - value_iri=iri_bucket['key'], - # TODO: get type and text somehow - value_type=_bucketlist(iri_bucket.get('type_iri', [])), - name_text=_bucketlist(iri_bucket.get('name_text', [])), - title_text=_bucketlist(iri_bucket.get('title_text', [])), - label_text=_bucketlist(iri_bucket.get('label_text', [])), - match_count=iri_bucket['doc_count'], - ) - - def _valuesearch_date_result(self, date_bucket) -> ValuesearchResult: - return ValuesearchResult( - value_iri=None, - value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], - ) - - def _cardsearch_response( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: '_CardsearchCursor', - ) -> CardsearchResponse: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.result_count = -1 # "too many" - else: # exact (and small) count - cursor.result_count = _es8_total['value'] - if cursor.random_sort and not cursor.is_first_page(): - # account for the filtered-out first page - assert cursor.result_count is not None - cursor.result_count += len(cursor.first_page_pks) - _results = [] - for _es8_hit in es8_response['hits']['hits']: - _card_iri = _es8_hit['fields']['indexcard_iri'][0] - _results.append(CardsearchResult( - card_iri=_card_iri, - card_pk=_es8_hit['_id'], - text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), - )) - if cursor.is_first_page() and cursor.first_page_pks: - # revisiting first page; reproduce original random order - _ordering_by_id = { - _id: _i - for (_i, _id) in enumerate(cursor.first_page_pks) - } - _results.sort(key=lambda _r: _ordering_by_id[_r.card_pk]) - else: - _should_start_reproducible_randomness = ( - cursor.random_sort - and cursor.is_first_page() - and not cursor.first_page_pks - and not cursor.has_many_more() - and any( - not _filter.is_type_filter() # look for a non-default filter - for _filter in cardsearch_params.cardsearch_filter_set - ) - ) - if _should_start_reproducible_randomness: - cursor.first_page_pks = tuple(_result.card_pk for _result in _results) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['agg_related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchResponse( - total_result_count=( - TROVE['ten-thousands-and-more'] - if cursor.has_many_more() - else cursor.result_count - ), - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - first_page_cursor=cursor.first_cursor(), - ) - - def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: - for 
_field, _snippets in es8_hit.get('highlight', {}).items(): - (_, _, _encoded_path) = _field.rpartition('.') - _property_path = _parse_path_field_name(_encoded_path) - for _snippet in _snippets: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=rdf.literal(_snippet), - card_iri=card_iri, - ) - - -### -# assorted helper functions - -def _bucketlist(agg_result: dict) -> list[str]: - return [ - _bucket['key'] - for _bucket in agg_result['buckets'] - ] - - -def _daterange_value(datevalue: str): - _cleanvalue = datevalue.strip() - if re.fullmatch(r'\d{4,}', _cleanvalue): - return f'{_cleanvalue}||/y' - if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/M' - if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/d' - raise ValueError(f'bad date value "{datevalue}"') - - -def _depth_field_name(depth: int) -> str: - return f'depth{depth}' - - -def _parse_path_field_name(path_field_name: str) -> ts.Propertypath: - # inverse of propertypath_as_field_name - _list = json.loads(base64.urlsafe_b64decode(path_field_name.encode()).decode()) - assert isinstance(_list, list) - assert all(isinstance(_item, str) for _item in _list) - return tuple(_list) - - -def _any_query(queries: abc.Collection[dict]): - if len(queries) == 1: - (_query,) = queries - return _query - return {'bool': {'should': list(queries), 'minimum_should_match': 1}} - - -def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[ts.Propertypath], nested_path: str): - _suffuniq_iri_paths = [] - _glob_path_lengths = [] - for _path in propertypath_set: - if is_globpath(_path): - _glob_path_lengths.append(len(_path)) - else: - _suffuniq_iri_paths.append(ts.propertypath_as_keyword(_path)) - if _suffuniq_iri_paths and _glob_path_lengths: - return {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, - {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, - ], - }} - if _glob_path_lengths: - return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} - return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} - - -@dataclasses.dataclass -class _SimpleCursor: - start_index: int - page_size: int - result_count: int | None # use -1 to indicate "many more" - - MAX_INDEX: ClassVar[int] = ts.VALUESEARCH_MAX - - @classmethod - def from_page_param(cls, page: PageParam) -> '_SimpleCursor': - if page.cursor: - return decode_cursor_dataclass(page.cursor, cls) - assert page.size is not None - return cls( - start_index=0, - page_size=page.size, - result_count=None, # should be set when results are in - ) - - def next_cursor(self) -> str | None: - if not self.result_count: - return None - _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) - return ( - encode_cursor_dataclass(_next) - if _next.is_valid_cursor() - else None - ) - - def prev_cursor(self) -> str | None: - _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) - return ( - encode_cursor_dataclass(_prev) - if _prev.is_valid_cursor() - else None - ) - - def first_cursor(self) -> str | None: - if self.is_first_page(): - return None - return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) - - def is_first_page(self) -> bool: - return self.start_index == 0 - - def has_many_more(self) -> bool: - return self.result_count == -1 - - def max_index(self) -> int: - return ( - self.MAX_INDEX - if self.has_many_more() - 
else min(self.result_count or 0, self.MAX_INDEX) - ) - - def is_valid_cursor(self) -> bool: - return 0 <= self.start_index < self.max_index() - - -@dataclasses.dataclass -class _CardsearchCursor(_SimpleCursor): - random_sort: bool # how to sort by relevance to nothingness? randomness! - first_page_pks: tuple[str, ...] = () - - MAX_INDEX: ClassVar[int] = ts.CARDSEARCH_MAX - - @classmethod - def from_cardsearch_params(cls, params: CardsearchParams) -> '_CardsearchCursor': - if params.page.cursor: - return decode_cursor_dataclass(params.page.cursor, cls) - assert params.page.size is not None - return cls( - start_index=0, - page_size=params.page.size, - result_count=None, # should be set when results are in - random_sort=( - not params.sort_list - and not params.cardsearch_textsegment_set - ), - ) - - def cardsearch_start_index(self) -> int: - if self.is_first_page() or not self.random_sort: - return self.start_index - return self.start_index - len(self.first_page_pks) - - def first_cursor(self) -> str | None: - if self.random_sort and not self.first_page_pks: - return None - return super().prev_cursor() - - def prev_cursor(self) -> str | None: - if self.random_sort and not self.first_page_pks: - return None - return super().prev_cursor() diff --git a/share/search/index_strategy/trovesearch_irivalues.py b/share/search/index_strategy/trovesearch_irivalues.py deleted file mode 100644 index 7d40a1860..000000000 --- a/share/search/index_strategy/trovesearch_irivalues.py +++ /dev/null @@ -1,99 +0,0 @@ -import typing - -from share.search import messages -from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.util.checksum_iri import ChecksumIri -from . import _trovesearch_util as ts - - -class TrovesearchMentionsIndexStrategy(Elastic8IndexStrategy): - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TrovesearchMentionsIndexStrategy', - hexdigest='...', - ) - - # abstract method from IndexStrategy - @property - def supported_message_types(self): - return { - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - } - - # abstract method from IndexStrategy - @property - def backfill_message_type(self): - return messages.MessageType.BACKFILL_INDEXCARD - - # abstract method from Elastic8IndexStrategy - def index_settings(self): - return {} - - # abstract method from Elastic8IndexStrategy - def index_mappings(self): - return { - 'dynamic': 'false', - 'properties': { - 'iri': ts.IRI_KEYWORD_MAPPING, # include sameAs - 'indexcard_iri': ts.KEYWORD_MAPPING, - 'indexcard_pk': ts.KEYWORD_MAPPING, - 'propertypath_from_focus': ts.KEYWORD_MAPPING, - 'depth_from_focus': ts.KEYWORD_MAPPING, - # flattened properties (dynamic sub-properties with keyword values) - 'iri_by_relative_propertypath': ts.FLATTENED_MAPPING, - 'iri_by_relative_depth': ts.FLATTENED_MAPPING, - # dynamic properties (see dynamic_templates, below) - 'dynamics': { - 'type': 'object', - 'properties': { - 'text_by_relative_propertypath': {'type': 'object', 'dynamic': True}, - 'text_by_relative_depth': {'type': 'object', 'dynamic': True}, - 'date_by_relative_propertypath': {'type': 'object', 'dynamic': True}, - }, - }, - }, - 'dynamic_templates': [ - {'dynamic_text_by_path': { - 'path_match': 'dynamics.text_by_relative_propertypath.*', - 'mapping': ts.TEXT_MAPPING, - }}, - {'dynamic_text_by_depth': { - 'path_match': 'dynamics.text_by_relative_depth.*', - 'mapping': ts.TEXT_MAPPING, - }}, - {'dynamic_date': { - 'path_match': 
'dynamics.date_by_relative_propertypath.*', - 'mapping': { - 'type': 'date', - 'format': 'strict_date_optional_time', - }, - }}, - ], - } - - def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: typing.Iterable[str]): - if messages_chunk.message_type in ( - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - ): - self.es8_client.delete_by_query( - index=list(indexnames), - query={'terms': {'indexcard_pk': messages_chunk.target_ids_chunk}}, - ) - - # abstract method from Elastic8IndexStrategy - def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - for _doc_id, _iri_usage_doc in self._build_iri_usage_docs(_indexcard_rdf): - _index_action = self.build_index_action(_doc_id, _iri_usage_doc) - - def _build_iri_usage_docs(self, indexcard_rdf: trove_db.IndexcardRdf): - _graphwalk = ts.GraphWalk( - rdf.RdfGraph(_indexcard_rdf.as_rdf_tripledict()), - _indexcard_rdf.focus_iri, - ) - # TODO: skip iris already in a static thesaurus - ... - diff --git a/share/search/messages.py b/share/search/messages.py index 7010a010c..a3930b42c 100644 --- a/share/search/messages.py +++ b/share/search/messages.py @@ -18,9 +18,6 @@ class MessageType(enum.Enum): # for indexcard-based indexes: UPDATE_INDEXCARD = 'update-indexcard' BACKFILL_INDEXCARD = 'backfill-indexcard' - # for aggregating identifier usage across index cards: - IDENTIFIER_USAGE = 'identifier-used' - BACKFILL_IDENTIFIER_USAGE = 'backfill-identifier-usage' @classmethod def from_int(cls, message_type_int: int): @@ -41,8 +38,6 @@ class IntMessageType(enum.IntEnum): BACKFILL_SUID = 6 UPDATE_INDEXCARD = 7 BACKFILL_INDEXCARD = 8 - IDENTIFIER_USAGE = 11 - BACKFILL_IDENTIFIER_USAGE = 12 if __debug__: @@ -56,7 +51,6 @@ def _enum_keys(an_enum_class): BACKFILL_MESSAGE_TYPES = { MessageType.BACKFILL_SUID, MessageType.BACKFILL_INDEXCARD, - MessageType.BACKFILL_IDENTIFIER_USAGE, } diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 9c3e0ac27..30d5e11a1 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -109,6 +109,7 @@

prior indexes

{% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} + {% trans "links" %} {% trans "index name" %} {% for index_status in indexes.prior %} @@ -149,11 +150,11 @@

prior indexes

{% endif %} - {{ index_status.specific_indexname }} - {% if index_status.creation_date %} -

({% trans "mappings" %})

+ {% if index_status.creation_date %} +

{% trans "mappings" %}

{% endif %} + {{ index_status.specific_indexname }} {% endfor %} diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index fb9a98ebc..a7a49aaf9 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -12,7 +12,5 @@ def patch_index_strategies(strategies: dict[str, index_strategy.IndexStrategy]): 'all_index_strategies', return_value=strategies, ): - breakpoint() yield - breakpoint() index_strategy.all_index_strategies.cache_clear() diff --git a/tests/share/search/index_strategy/test_elastic8.py b/tests/share/search/index_strategy/test_elastic8.py index 4eeeef385..5de732690 100644 --- a/tests/share/search/index_strategy/test_elastic8.py +++ b/tests/share/search/index_strategy/test_elastic8.py @@ -46,11 +46,9 @@ def mock_es_client(self): yield es8_mockclient @pytest.fixture - def fake_strategy(self, mock_es_client): - strat = FakeElastic8IndexStrategy( - name='fake_es8', - cluster_settings={'URL': 'http://nowhere.example:12345/'}, - ) + def fake_strategy(self, mock_es_client, settings): + settings.ELASTICSEARCH8_URL = 'http://nowhere.example:12345/' + strat = FakeElastic8IndexStrategy(name='fake_es8') strat.assert_strategy_is_current() return strat diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index a7c8f60bb..219674385 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -11,8 +11,6 @@ sharev2_elastic5, sharev2_elastic8, trove_indexcard_flats, - trovesearch_indexcard, - trovesearch_excessive, ) @@ -22,8 +20,6 @@ def expected_strategy_classes(): 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, - 'trovesearch_indexcard': trovesearch_indexcard.TrovesearchIndexcardIndexStrategy, - 'trovesearch_excessive': trovesearch_excessive.TrovesearchExcessiveIndexStrategy, } diff --git a/tests/share/search/index_strategy/test_trovesearch_excessive.py b/tests/share/search/index_strategy/test_trovesearch_excessive.py deleted file mode 100644 index 490be63f4..000000000 --- a/tests/share/search/index_strategy/test_trovesearch_excessive.py +++ /dev/null @@ -1,11 +0,0 @@ -import unittest - -from share.search.index_strategy.trovesearch_excessive import TrovesearchExcessiveIndexStrategy -from . import _common_trovesearch_tests - - -#@unittest.skip('wip') -class TestTrovesearchExcessive(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TrovesearchExcessiveIndexStrategy('test_trovesearch_excessive') diff --git a/tests/share/search/index_strategy/test_trovesearch_indexcard.py b/tests/share/search/index_strategy/test_trovesearch_indexcard.py deleted file mode 100644 index 16d12cd55..000000000 --- a/tests/share/search/index_strategy/test_trovesearch_indexcard.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from share.search.index_strategy.trovesearch_indexcard import TrovesearchIndexcardIndexStrategy -from . 
import _common_trovesearch_tests - - -#@unittest.skip('wip') -class TestTrovesearchIndexcard(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TrovesearchIndexcardIndexStrategy('test_trovesearch_indexcard') - - # override CommonTrovesearchTests - def valuesearch_complex_cases(self): - yield from () # "complex" valuesearches are the ones this indexcard strategy can't handle diff --git a/tests/share/search/index_strategy/test_trovesearch_iri_usage.py b/tests/share/search/index_strategy/test_trovesearch_iri_usage.py deleted file mode 100644 index a1b51119e..000000000 --- a/tests/share/search/index_strategy/test_trovesearch_iri_usage.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -from share.search.index_strategy.trovesearch_excessive import TrovesearchExcessiveIndexStrategy -from . import _common_trovesearch_tests - - -#@unittest.skip('wip') -class TestTrovesearchExcessive(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TrovesearchExcessiveIndexStrategy('test_trovesearch_excessive') - From 350dc992664099ea52c5f607ab10a4cbc1885a73 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 23 Oct 2024 17:16:32 -0400 Subject: [PATCH 03/14] add trovesearch_denorm index strategy --- share/models/feature_flag.py | 1 + share/search/index_strategy/__init__.py | 4 + .../index_strategy/_trovesearch_util.py | 98 +- .../index_strategy/trove_indexcard_flats.py | 30 +- .../index_strategy/trovesearch_denorm.py | 975 ++++++++++++++++++ .../index_strategy/test_strategy_selection.py | 2 + .../index_strategy/test_trovesearch_denorm.py | 9 + trove/exceptions.py | 4 + trove/trovesearch/search_params.py | 12 + trove/vocab/namespaces.py | 2 + trove/vocab/trove.py | 2 +- 11 files changed, 1079 insertions(+), 60 deletions(-) create mode 100644 share/search/index_strategy/trovesearch_denorm.py create mode 100644 tests/share/search/index_strategy/test_trovesearch_denorm.py diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index df0903122..518baec67 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -31,6 +31,7 @@ class FeatureFlag(models.Model): IGNORE_SHAREV2_INGEST = 'ignore_sharev2_ingest' SUGGEST_CREATOR_FACET = 'suggest_creator_facet' FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' + TROVESEARCH_DENORMILY = 'trovesearch_denormily' # name _should_ be one of the constants above, but that is not enforced by `choices` name = models.TextField(unique=True) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index a9330b71d..297702475 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -10,6 +10,7 @@ from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy +from .trovesearch_denorm import TrovesearchDenormIndexStrategy from ._base import IndexStrategy @@ -37,6 +38,7 @@ def _iter_all_index_strategies(): if settings.ELASTICSEARCH8_URL: yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8') yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats') + yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') def get_index_strategy(strategyname: str) -> IndexStrategy: @@ -81,6 +83,8 @@ def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificI def 
get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: if params.index_strategy_name: # specific strategy requested _name = params.index_strategy_name + elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY): + _name = 'trovesearch_denorm' else: _name = 'trove_indexcard_flats' return get_specific_index(_name, for_search=True) diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py index 2e97cea82..f051b537c 100644 --- a/share/search/index_strategy/_trovesearch_util.py +++ b/share/search/index_strategy/_trovesearch_util.py @@ -22,9 +22,11 @@ FOAF, OSFMAP, OWL, + RDF, RDFS, SKOS, TROVE, + XSD, ) from trove.vocab.osfmap import is_date_property @@ -92,16 +94,42 @@ def latest_rdf_for_indexcard_pks(indexcard_pks): ) +def iri_synonyms(iri: str, rdfdoc: rdf.RdfGraph) -> set[str]: + # note: extremely limited inference -- assumes objects of owl:sameAs are not used as subjects + _synonyms = ( + _synonym + for _synonym in rdfdoc.q(iri, OWL.sameAs) + if is_worthwhile_iri(_synonym) + ) + return {iri, *_synonyms} + + +def iris_synonyms(iris: typing.Iterable[str], rdfdoc: rdf.RdfGraph) -> set[str]: + return { + _synonym + for _iri in iris + for _synonym in iri_synonyms(_iri, rdfdoc) + } + + def propertypath_as_keyword(path: Propertypath) -> str: - return json.dumps(path if is_globpath(path) else [ + assert not is_globpath(path) + return json.dumps([ get_sufficiently_unique_iri(_iri) for _iri in path ]) def propertypath_as_field_name(path: Propertypath) -> str: - _path_keyword = propertypath_as_keyword(path) - return base64.urlsafe_b64encode(_path_keyword.encode()).decode() + return b64(propertypath_as_keyword(path)) + + +def b64(value: str) -> str: + return base64.urlsafe_b64encode(value.encode()).decode() + + +def b64_reverse(b64_str: str) -> str: + return base64.urlsafe_b64decode(b64_str.encode()).decode() def suffuniq_iris(iris: typing.Iterable[str]) -> list[str]: @@ -112,22 +140,28 @@ def suffuniq_iris(iris: typing.Iterable[str]) -> list[str]: }) +def _dict_of_sets(): + return defaultdict(set) + + @dataclasses.dataclass class GraphWalk: rdfdoc: rdf.RdfGraph focus_iri: str - recursive: bool = True + already_visiting: set[str] = dataclasses.field(default_factory=set) iri_values: dict[Propertypath, set[str]] = dataclasses.field( - default_factory=lambda: defaultdict(set), + default_factory=_dict_of_sets, ) text_values: dict[Propertypath, set[rdf.Literal]] = dataclasses.field( - default_factory=lambda: defaultdict(set), + default_factory=_dict_of_sets, ) date_values: dict[Propertypath, set[datetime.date]] = dataclasses.field( - default_factory=lambda: defaultdict(set), + default_factory=_dict_of_sets, + ) + integer_values: dict[Propertypath, set[int]] = dataclasses.field( + default_factory=_dict_of_sets, ) paths_walked: set[Propertypath] = dataclasses.field(default_factory=set) - _visiting: set[str] = dataclasses.field(default_factory=set) def __post_init__(self): for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri): @@ -144,33 +178,33 @@ def __post_init__(self): else: self.date_values[_walk_path].add(_parsed_date) elif isinstance(_walk_obj, rdf.Literal): - self.text_values[_walk_path].add(_walk_obj.unicode_value) + if XSD.integer in _walk_obj.datatype_iris: + self.integer_values[_walk_path].add(_walk_obj) + if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris): + self.text_values[_walk_path].add(_walk_obj.unicode_value) - def shortwalk(self, from_iri: str) -> 
GraphWalk: + def shortwalk_from(self, from_iri: str) -> GraphWalk: return GraphWalk( self.rdfdoc, - self.focus_iri, - recursive=False, + from_iri, + already_visiting={self.focus_iri}, ) def _walk_from_subject( self, - iri_or_blanknode: str | rdf.Blanknode, + iri: str, path_so_far: tuple[str, ...] = (), ) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]: '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object ''' - with self._visit(iri_or_blanknode): - _twoples = ( - iri_or_blanknode - if isinstance(iri_or_blanknode, frozenset) - else self.rdfdoc.tripledict.get(iri_or_blanknode, {}) - ) + if iri in self.already_visiting: + return + with self._visit(iri): + _twoples = self.rdfdoc.tripledict.get(iri, {}) for _next_steps, _obj in walk_twoples(_twoples): _path = (*path_so_far, *_next_steps) yield (_path, _obj) - if self.recursive and isinstance(_obj, str) and (_obj not in self._visiting): - # step further for iri or blanknode + if isinstance(_obj, str): # step further for iri yield from self._walk_from_subject(_obj, path_so_far=_path) @functools.cached_property @@ -181,28 +215,12 @@ def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]: _paths_by_iri[_iri].add(_path) return _paths_by_iri - def iri_synonyms(self, iri: str) -> set[str]: - # note: extremely limited inference -- assumes objects of owl:sameAs are not used as subjects - _synonyms = ( - _synonym - for _synonym in self.rdfdoc.q(iri, OWL.sameAs) - if is_worthwhile_iri(_synonym) - ) - return {iri, *_synonyms} - - def iris_synonyms(self, iris: typing.Iterable[str]) -> set[str]: - return { - _synonym - for _iri in iris - for _synonym in self.iri_synonyms(_iri) - } - @contextlib.contextmanager def _visit(self, focus_obj): - assert focus_obj not in self._visiting - self._visiting.add(focus_obj) + assert focus_obj not in self.already_visiting + self.already_visiting.add(focus_obj) yield - self._visiting.discard(focus_obj) + self.already_visiting.discard(focus_obj) def walk_twoples( diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index a88946b45..37a01439f 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -39,28 +39,20 @@ ) from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword from trove.vocab.osfmap import is_date_property -from trove.vocab.namespaces import TROVE, FOAF, RDF, RDFS, DCTERMS, OWL, SKOS, OSFMAP - - -logger = logging.getLogger(__name__) - - -TITLE_PROPERTIES = (DCTERMS.title,) -NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName) -LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel) -NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES) - - -SKIPPABLE_PROPERTIES = ( - OSFMAP.contains, +from trove.vocab.namespaces import TROVE, RDF, OWL +from ._trovesearch_util import ( + TITLE_PROPERTIES, + NAME_PROPERTIES, + LABEL_PROPERTIES, + NAMELIKE_PROPERTIES, + VALUESEARCH_MAX, + CARDSEARCH_MAX, + KEYWORD_LENGTH_MAX, + SKIPPABLE_PROPERTIES, ) -VALUESEARCH_MAX = 234 -CARDSEARCH_MAX = 9997 - -KEYWORD_LENGTH_MAX = 8191 # skip keyword terms that might exceed lucene's internal limit -# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html) +logger = logging.getLogger(__name__) class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy): diff --git a/share/search/index_strategy/trovesearch_denorm.py 
b/share/search/index_strategy/trovesearch_denorm.py new file mode 100644 index 000000000..87a3514b9 --- /dev/null +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -0,0 +1,975 @@ +from __future__ import annotations +from collections import abc, defaultdict +import dataclasses +import functools +import json +import logging +import re +from typing import ( + ClassVar, + Iterable, + Iterator, + Literal, +) + +from django.conf import settings +import elasticsearch8 +from primitive_metadata import primitive_rdf as rdf + +from share.search import exceptions +from share.search import messages +from share.search.index_strategy.elastic8 import Elastic8IndexStrategy +from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass +from share.util.checksum_iri import ChecksumIri +from trove import models as trove_db +from trove.trovesearch.search_params import ( + CardsearchParams, + ValuesearchParams, + SearchFilter, + Textsegment, + PageParam, + is_globpath, +) +from trove.trovesearch.search_response import ( + CardsearchResponse, + ValuesearchResponse, + TextMatchEvidence, + CardsearchResult, + ValuesearchResult, + PropertypathUsage, +) +from trove.vocab.osfmap import is_date_property +from trove.vocab.namespaces import TROVE, OWL, RDF +from . import _trovesearch_util as ts + + +logger = logging.getLogger(__name__) + + +class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): + CURRENT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TrovesearchDenormIndexStrategy', + hexdigest='fa8fe6459f658877f84620412dcab5e2e70d0c949d8977354c586dca99ff2f28', + ) + + # abstract method from IndexStrategy + @property + def supported_message_types(self): + return { + messages.MessageType.UPDATE_INDEXCARD, + messages.MessageType.BACKFILL_INDEXCARD, + } + + # abstract method from IndexStrategy + @property + def backfill_message_type(self): + return messages.MessageType.BACKFILL_INDEXCARD + + # abstract method from Elastic8IndexStrategy + def index_settings(self): + return {} + + # abstract method from Elastic8IndexStrategy + def index_mappings(self): + return { + 'dynamic': 'false', + 'dynamic_templates': self._dynamic_templates(), + 'properties': { + 'card': {'properties': self._card_mappings()}, + 'iri_value': {'properties': self._iri_value_mappings()}, + }, + } + + def _dynamic_templates(self): + return [ + {'dynamic_text_by_propertypath': { + 'path_match': '*.text_by_propertypath.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_text_by_depth': { + 'path_match': '*.text_by_depth.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_date_by_propertypath': { + 'path_match': '*.date_by_propertypath.*', + 'mapping': { + 'type': 'date', + 'format': 'strict_date_optional_time', + }, + }}, + {'dynamic_int_by_propertypath': { + 'path_match': '*.int_by_propertypath.*', + 'mapping': {'type': 'long'}, + }}, + ] + + def _card_mappings(self): + return { + # simple keyword properties + 'card_iri': ts.KEYWORD_MAPPING, + 'card_pk': ts.KEYWORD_MAPPING, + 'suid': { + 'type': 'object', + 'properties': { + 'source_config_label': ts.KEYWORD_MAPPING, + 'source_record_identifier': ts.KEYWORD_MAPPING, + }, + }, + **self._paths_and_values_mappings(), + } + + def _iri_value_mappings(self): + return { + 'value_iri': ts.KEYWORD_MAPPING, + 'value_name': ts.KEYWORD_MAPPING, + 'value_title': ts.KEYWORD_MAPPING, + 'value_label': ts.KEYWORD_MAPPING, + 'at_card_propertypaths': ts.KEYWORD_MAPPING, + **self._paths_and_values_mappings(), + } + + def 
_paths_and_values_mappings(self):
+ return {
+ 'focus_iri': ts.IRI_KEYWORD_MAPPING,
+ 'propertypaths_present': ts.KEYWORD_MAPPING,
+ # flattened properties (dynamic sub-properties with keyword values)
+ 'iri_by_propertypath': ts.FLATTENED_MAPPING,
+ 'iri_by_depth': ts.FLATTENED_MAPPING,
+ # dynamic properties (see `_dynamic_templates`)
+ 'text_by_propertypath': {'type': 'object', 'dynamic': True},
+ 'text_by_depth': {'type': 'object', 'dynamic': True},
+ 'date_by_propertypath': {'type': 'object', 'dynamic': True},
+ 'int_by_propertypath': {'type': 'object', 'dynamic': True},
+ }
+
+ # override method from Elastic8IndexStrategy
+ def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Iterable[str]):
+ # delete all per-value docs (to account for missing values)
+ self.es8_client.delete_by_query(
+ index=list(indexnames),
+ query={'bool': {'must': [
+ {'terms': {'card.card_pk': messages_chunk.target_ids_chunk}},
+ {'exists': {'field': 'iri_value.value_iri'}},
+ ]}},
+ )
+ # (possible optimization: instead, hold onto doc_ids and (in `after_chunk`?)
+ # delete_by_query excluding those)
+
+ # abstract method from Elastic8IndexStrategy
+ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
+ _indexcard_rdf_qs = (
+ ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk)
+ .select_related('indexcard__source_record_suid__source_config')
+ )
+ _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk)
+ for _indexcard_rdf in _indexcard_rdf_qs:
+ _docbuilder = self._SourcedocBuilder(_indexcard_rdf)
+ if not _docbuilder.should_skip(): # if skipped, will be deleted
+ _indexcard_pk = _indexcard_rdf.indexcard_id
+ for _doc_id, _doc in _docbuilder.build_docs():
+ _index_action = self.build_index_action(
+ doc_id=_doc_id,
+ doc_source=_doc,
+ )
+ yield _indexcard_pk, _index_action
+ _remaining_indexcard_pks.discard(_indexcard_pk)
+ # delete any that were skipped for any reason
+ for _indexcard_pk in _remaining_indexcard_pks:
+ yield _indexcard_pk, self.build_delete_action(_indexcard_pk)
+
+ ###
+ # implement abstract IndexStrategy.SpecificIndex
+
+ class SpecificIndex(Elastic8IndexStrategy.SpecificIndex):
+
+ # abstract method from IndexStrategy.SpecificIndex
+ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict:
+ return self.index_strategy.es8_client.search(
+ index=self.indexname,
+ body={
+ **(request_body or {}),
+ 'track_total_hits': True,
+ },
+ params=(request_queryparams or {}),
+ )
+
+ # abstract method from IndexStrategy.SpecificIndex
+ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
+ _querybuilder = _CardsearchQueryBuilder(cardsearch_params)
+ _search_kwargs = _querybuilder.build()
+ if settings.DEBUG:
+ logger.info(json.dumps(_search_kwargs, indent=2))
+ try:
+ _es8_response = self.index_strategy.es8_client.search(
+ index=self.indexname,
+ source=False, # no need to get _source, identifiers are enough
+ docvalue_fields=['card.card_iri'],
+ highlight={ # TODO: only one field gets highlighted?
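+ # note: with `require_field_match` false, snippets may come from any
+ # `card.text_by_propertypath.*` field, not only fields the query
+ # matched on; each such field name base64-encodes a json property-path,
+ # decoded back into a property path in `_gather_textmatch_evidence`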
+ 'require_field_match': False, + 'fields': {'card.text_by_propertypath.*': {}}, + }, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self.index_strategy._cardsearch_response(cardsearch_params, _es8_response, _querybuilder.cursor) + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: + _path = valuesearch_params.valuesearch_propertypath + _cursor = _SimpleCursor.from_page_param(valuesearch_params.page) + _query = ( + _build_date_valuesearch(valuesearch_params, _cursor) + if is_date_property(_path[-1]) + else _build_iri_valuesearch(valuesearch_params, _cursor) + ) + if settings.DEBUG: + logger.info(json.dumps(_query, indent=2)) + try: + _es8_response = self.index_strategy.es8_client.search( + **_query, + index=self.indexname, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self.index_strategy._valuesearch_response(valuesearch_params, _es8_response, _cursor) + + ### + # building sourcedocs + + @dataclasses.dataclass + class _SourcedocBuilder: + '''build elasticsearch sourcedocs for an rdf document + ''' + indexcard_rdf: trove_db.IndexcardRdf + indexcard: trove_db.Indexcard = dataclasses.field(init=False) + rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) + focus_iri: str = dataclasses.field(init=False) + + def __post_init__(self) -> None: + self.indexcard = self.indexcard_rdf.indexcard + self.rdfdoc = rdf.RdfGraph(self.indexcard_rdf.as_rdf_tripledict()) + self.focus_iri = self.indexcard_rdf.focus_iri + + def should_skip(self) -> bool: + _suid = self.indexcard.source_record_suid + return ( + # skip cards that belong to an obsolete suid with a later duplicate + _suid.has_forecompat_replacement() + # ...or that are without some value for name/title/label + or not any(self.rdfdoc.q(self.focus_iri, ts.NAMELIKE_PROPERTIES)) + ) + + def build_docs(self) -> Iterator[tuple[str, dict]]: + # index once without `iri_value` + yield self._doc_id(), {'card': self._card_subdoc} + for _iri in self._fullwalk.paths_by_iri: + yield self._doc_id(_iri), { + 'card': self._card_subdoc, + 'iri_value': self._iri_value_subdoc(_iri), + } + + def _doc_id(self, value_iri=None) -> str: + _card_pk = str(self.indexcard.pk) + return ( + _card_pk + if value_iri is None + else f'{_card_pk}-{ts.b64(value_iri)}' + ) + + @functools.cached_property + def _fullwalk(self) -> ts.GraphWalk: + return ts.GraphWalk(self.rdfdoc, self.focus_iri) + + @functools.cached_property + def _card_subdoc(self) -> dict: + return { + 'card_iri': self.indexcard.get_iri(), + 'card_pk': str(self.indexcard.pk), + 'suid': { + 'source_record_identifier': self.indexcard.source_record_suid.identifier, + 'source_config_label': self.indexcard.source_record_suid.source_config.label, + }, + **self._paths_and_values(self._fullwalk), + } + + def _iri_value_subdoc(self, iri: str) -> dict: + _shortwalk = self._fullwalk.shortwalk_from(iri) + return { + 'value_iri': iri, + 'value_iris': self._exact_and_suffuniq_iris(iri), + 'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)), + 'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)), + 'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)), + 'at_card_propertypaths': [ + ts.propertypath_as_keyword(_path) + for _path in 
self._fullwalk.paths_by_iri[iri] + ], + **self._paths_and_values(_shortwalk), + } + + def _paths_and_values(self, walk: ts.GraphWalk): + return { + 'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri), + 'propertypaths_present': self._propertypaths_present(walk), + 'iri_by_propertypath': self._iris_by_propertypath(walk), + 'iri_by_depth': self._iris_by_depth(walk), + 'text_by_propertypath': self._texts_by_propertypath(walk), + 'text_by_depth': self._texts_by_depth(walk), + 'date_by_propertypath': self._dates_by_propertypath(walk), + 'int_by_propertypath': self._ints_by_propertypath(walk), + } + + def _propertypaths_present(self, walk: ts.GraphWalk): + return [ + ts.propertypath_as_keyword(_path) + for _path in walk.paths_walked + ] + + def _iris_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) + for _path, _iris in walk.iri_values.items() + } + + def _iris_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _iris in walk.iri_values.items(): + _by_depth[len(_path)].update(_iris) + return { + _depth_field_name(_depth): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) + for _depth, _iris in _by_depth.items() + } + + def _texts_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): list(_value_set) + for _path, _value_set in walk.text_values.items() + } + + def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]): + for _property in properties: + yield from walk.text_values.get((_property,), []) + + def _texts_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _value_set in walk.text_values.items(): + _by_depth[len(_path)].update(_value_set) + return { + _depth_field_name(_depth): list(_value_set) + for _depth, _value_set in _by_depth.items() + } + + def _dates_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): [ + _date.isoformat() + for _date in _value_set + ] + for _path, _value_set in walk.date_values.items() + } + + def _ints_by_propertypath(self, walk: ts.GraphWalk): + return { + ts.propertypath_as_field_name(_path): list(_value_set) + for _path, _value_set in walk.integer_values.items() + } + + def _exact_and_suffuniq_iris(self, iri: str): + _synonyms = ts.iri_synonyms(iri, self.rdfdoc) + return { + 'exact': list(_synonyms), + 'suffuniq': ts.suffuniq_iris(_synonyms), + } + + ### + # normalizing search responses + + def _valuesearch_response( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: _SimpleCursor, + ) -> ValuesearchResponse: + _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') + if _iri_aggs: + _buckets = _iri_aggs['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly inefficient pagination (part two) + _page_end_index = cursor.start_index + cursor.page_size + _bucket_page = _buckets[cursor.start_index:_page_end_index] # discard prior pages + cursor.result_count = ( + -1 # "many more" + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count + ) + return ValuesearchResponse( + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + next_page_cursor=cursor.next_cursor(), + prev_page_cursor=cursor.prev_cursor(), + first_page_cursor=cursor.first_cursor(), + ) + else: # assume date + _year_buckets = ( + es8_response['aggregations'] 
+ ['agg_valuesearch_dates'] + ['buckets'] + ) + return ValuesearchResponse( + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + ) + + def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=iri_bucket['key'], + # TODO: get type and text somehow + value_type=_bucketlist(iri_bucket.get('agg_type_iri', [])), + name_text=_bucketlist(iri_bucket.get('agg_value_name', [])), + title_text=_bucketlist(iri_bucket.get('agg_value_title', [])), + label_text=_bucketlist(iri_bucket.get('agg_value_label', [])), + match_count=iri_bucket['doc_count'], + ) + + def _valuesearch_date_result(self, date_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) + + def _cardsearch_response( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: '_CardsearchCursor', + ) -> CardsearchResponse: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.result_count = -1 # "too many" + else: # exact (and small) count + cursor.result_count = _es8_total['value'] + if cursor.random_sort and not cursor.is_first_page(): + # account for the filtered-out first page + assert cursor.result_count is not None + cursor.result_count += len(cursor.first_page_pks) + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['fields']['card.card_iri'][0] + _results.append(CardsearchResult( + card_iri=_card_iri, + card_pk=_es8_hit['_id'], + text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), + )) + if cursor.is_first_page() and cursor.first_page_pks: + # revisiting first page; reproduce original random order + _ordering_by_id = { + _id: _i + for (_i, _id) in enumerate(cursor.first_page_pks) + } + _results.sort(key=lambda _r: _ordering_by_id[_r.card_pk]) + else: + _should_start_reproducible_randomness = ( + cursor.random_sort + and cursor.is_first_page() + and not cursor.first_page_pks + and not cursor.has_many_more() + and any( + not _filter.is_type_filter() # look for a non-default filter + for _filter in cardsearch_params.cardsearch_filter_set + ) + ) + if _should_start_reproducible_randomness: + cursor.first_page_pks = tuple(_result.card_pk for _result in _results) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths + ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['agg_related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchResponse( + total_result_count=( + TROVE['ten-thousands-and-more'] + if cursor.has_many_more() + else cursor.result_count + ), + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + next_page_cursor=cursor.next_cursor(), + prev_page_cursor=cursor.prev_cursor(), + first_page_cursor=cursor.first_cursor(), + ) + + def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: + for _field, _snippets in es8_hit.get('highlight', {}).items(): + (_, _, _encoded_path) = 
_field.rpartition('.') + _property_path = _parse_path_field_name(_encoded_path) + for _snippet in _snippets: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=rdf.literal(_snippet), + card_iri=card_iri, + ) + + +### +# building queries + +@dataclasses.dataclass +class _BoolBuilder: + bool_innards: dict[str, list[dict]] = dataclasses.field( + default_factory=lambda: { + 'filter': [], + 'must_not': [], + 'must': [], + 'should': [], + }, + ) + + def as_query(self): + return {'bool': self.bool_innards} + + def add_boolpart(self, key: str, query: dict) -> None: + self.bool_innards[key].append(query) + + def add_boolparts(self, boolparts: Iterator[tuple[str, dict]]): + for _key, _query in boolparts: + self.add_boolpart(_key, _query) + + +@dataclasses.dataclass +class _QueryHelper: + base_field: Literal['card', 'iri_value'] + textsegment_set: frozenset[Textsegment] + filter_set: frozenset[SearchFilter] + relevance_matters: bool + + def boolparts(self) -> Iterator[tuple[str, dict]]: + yield from self.iri_boolparts() + yield from self.text_boolparts() + + def iri_boolparts(self) -> Iterator[tuple[str, dict]]: + # iri-keyword filters + for _searchfilter in self.filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + yield 'must_not', self._iri_filter(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + yield 'filter', self._iri_filter(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + yield 'filter', self._presence_query(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + yield 'must_not', self._presence_query(_searchfilter) + elif _searchfilter.operator.is_date_operator(): + yield 'filter', self._date_filter(_searchfilter) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + + def text_boolparts(self) -> Iterator[tuple[str, dict]]: + # text-based queries + for _textsegment in self.textsegment_set: + if _textsegment.is_negated: + yield 'must_not', self._exact_text_query(_textsegment) + elif not _textsegment.is_fuzzy: + yield 'must', self._exact_text_query(_textsegment) + else: + yield 'must', self._fuzzy_text_must_query(_textsegment) + if self.relevance_matters: + yield 'should', self._fuzzy_text_should_query(_textsegment) + + def _presence_query(self, search_filter) -> dict: + return _any_query([ + self._path_presence_query(_path) + for _path in search_filter.propertypath_set + ]) + + def _path_presence_query(self, path: ts.Propertypath): + _field = f'{self.base_field}.propertypaths_present' + return {'term': {_field: ts.propertypath_as_keyword(path)}} + + def _iri_filter(self, search_filter) -> dict: + _iris = ts.suffuniq_iris(search_filter.value_set) + return _any_query([ + self._path_iri_query(_path, _iris) + for _path in search_filter.propertypath_set + ]) + + def _path_iri_query(self, path, suffuniq_iris): + if path == (OWL.sameAs,): + _field = f'{self.base_field}.focus_iri.suffuniq' + elif is_globpath(path): + _field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}' + else: + _field = f'{self.base_field}.iri_by_propertypath.{_path_field_name(path)}' + return {'terms': {_field: suffuniq_iris}} + + def _date_filter(self, search_filter): + return _any_query([ + self._date_filter_for_path(_path, search_filter.operator, search_filter.value_set) + for _path in search_filter.propertypath_set + ]) + + def _date_filter_for_path(self, path, filter_operator, value_set): + _field = 
f'{self.base_field}.date_by_propertypath.{_path_field_name(path)}'
+ if filter_operator == SearchFilter.FilterOperator.BEFORE:
+ _value = min(value_set) # rely on string-comparable isoformat
+ return {'range': {_field: {'lt': _daterange_value(_value)}}}
+ elif filter_operator == SearchFilter.FilterOperator.AFTER:
+ _value = max(value_set) # rely on string-comparable isoformat
+ return {'range': {_field: {'gt': _daterange_value(_value)}}}
+ elif filter_operator == SearchFilter.FilterOperator.AT_DATE:
+ return _any_query([
+ {'range': {_field: {'gte': _filtervalue, 'lte': _filtervalue}}}
+ for _filtervalue in map(_daterange_value, value_set)
+ ])
+ else:
+ raise ValueError(f'invalid date filter operator (got {filter_operator})')
+
+ def _text_field_name(self, propertypath: ts.Propertypath):
+ return (
+ f'{self.base_field}.text_by_depth.{_depth_field_name(len(propertypath))}'
+ if is_globpath(propertypath)
+ else f'{self.base_field}.text_by_propertypath.{ts.propertypath_as_field_name(propertypath)}'
+ )
+
+ def _exact_text_query(self, textsegment: Textsegment) -> dict:
+ # TODO: textsegment.is_openended (prefix query)
+ return _any_query([
+ {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}}
+ for _path in textsegment.propertypath_set
+ ])
+
+ def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict:
+ # TODO: textsegment.is_openended (prefix query)
+ return _any_query([
+ {'match': {
+ self._text_field_name(_path): {
+ 'query': textsegment.text,
+ 'fuzziness': 'AUTO',
+ # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`)
+ },
+ }}
+ for _path in textsegment.propertypath_set
+ ])
+
+ def _fuzzy_text_should_query(self, textsegment: Textsegment):
+ _slop = len(textsegment.text.split())
+ return _any_query([
+ {'match_phrase': {
+ self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop},
+ }}
+ for _path in textsegment.propertypath_set
+ ])
+
+
+@dataclasses.dataclass
+class _CardsearchQueryBuilder:
+ params: CardsearchParams
+
+ def build(self):
+ return {
+ 'query': self._cardsearch_query(),
+ 'aggs': self._cardsearch_aggs(),
+ 'sort': list(self._cardsearch_sorts()) or None,
+ 'from_': self.cursor.cardsearch_start_index(),
+ 'size': self.cursor.page_size,
+ }
+
+ @functools.cached_property
+ def cursor(self):
+ return _CardsearchCursor.from_cardsearch_params(self.params)
+
+ def _cardsearch_query(self) -> dict:
+ _bool = _BoolBuilder()
+ _bool.add_boolparts(
+ _QueryHelper(
+ base_field='card',
+ textsegment_set=self.params.cardsearch_textsegment_set,
+ filter_set=self.params.cardsearch_filter_set,
+ relevance_matters=(not self.params.sort_list),
+ ).boolparts(),
+ )
+ # exclude iri_value docs (possible optimization: separate indexes)
+ _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}})
+ return (
+ self._randomly_ordered_query(_bool)
+ if self.cursor.random_sort
+ else _bool.as_query()
+ )
+
+ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict:
+ if not self.cursor.first_page_pks:
+ # independent random sample
+ return {
+ 'function_score': {
+ 'query': _bool.as_query(),
+ 'boost_mode': 'replace',
+ 'random_score': {}, # default random_score is fast and unpredictable
+ },
+ }
+ _firstpage_filter = {'terms': {'card.card_pk': self.cursor.first_page_pks}}
+ if self.cursor.is_first_page():
+ # returning to a first page previously visited
+ _bool.add_boolpart('filter', _firstpage_filter)
+ return _bool.as_query()
+ # get a subsequent page using reproducible randomness
+ _bool.add_boolpart('must_not', _firstpage_filter)
+ return {
+ 'function_score': {
+ 'query': _bool.as_query(),
+ 'boost_mode': 'replace',
+ 'random_score': {
+ 'seed': ''.join(self.cursor.first_page_pks),
+ 'field': 'card.card_pk',
+ },
+ },
+ }
+
+ def _cardsearch_aggs(self):
+ _aggs = {}
+ if self.params.related_property_paths:
+ _aggs['agg_related_propertypath_usage'] = {'terms': {
+ 'field': 'card.propertypaths_present',
+ 'include': [
+ ts.propertypath_as_keyword(_path)
+ for _path in self.params.related_property_paths
+ ],
+ 'size': len(self.params.related_property_paths),
+ }}
+ return _aggs
+
+ def _cardsearch_sorts(self):
+ for _sortparam in self.params.sort_list:
+ _path = (_sortparam.property_iri,)
+ _field = f'card.date_by_propertypath.{_path_field_name(_path)}'
+ _order = 'desc' if _sortparam.descending else 'asc'
+ yield {_field: _order}
+
+
+def _build_iri_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> dict:
+ _path = params.valuesearch_propertypath
+ _bool = _BoolBuilder()
+ _bool.add_boolpart('filter', {'term': {
+ 'iri_value.at_card_propertypaths': ts.propertypath_as_keyword(_path),
+ }})
+ _bool.add_boolparts(
+ _QueryHelper(
+ base_field='card',
+ textsegment_set=params.cardsearch_textsegment_set,
+ filter_set=params.cardsearch_filter_set,
+ relevance_matters=False,
+ ).boolparts(),
+ )
+ _bool.add_boolparts(
+ _QueryHelper(
+ base_field='iri_value',
+ textsegment_set=params.valuesearch_textsegment_set,
+ filter_set=params.valuesearch_filter_set,
+ relevance_matters=False,
+ ).boolparts()
+ )
+ return {
+ 'query': _bool.as_query(),
+ 'size': 0, # ignore hits; just want the aggs
+ 'aggs': {
+ 'agg_valuesearch_iris': {
+ 'terms': {
+ 'field': 'iri_value.value_iri',
+ # WARNING: terribly inefficient pagination (part one)
+ 'size': cursor.start_index + cursor.page_size + 1,
+ },
+ 'aggs': {
+ 'agg_type_iri': {'terms': {
+ 'field': f'iri_value.iri_by_propertypath.{_path_field_name((RDF.type,))}',
+ }},
+ 'agg_value_name': {'terms': {'field': 'iri_value.value_name'}},
+ 'agg_value_title': {'terms': {'field': 'iri_value.value_title'}},
+ 'agg_value_label': {'terms': {'field': 'iri_value.value_label'}},
+ },
+ },
+ },
+ }
+
+
+def _build_date_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> dict:
+ assert not params.valuesearch_textsegment_set
+ assert not params.valuesearch_filter_set
+ _bool = _BoolBuilder()
+ _bool.add_boolparts(
+ _QueryHelper(
+ base_field='card',
+ textsegment_set=params.cardsearch_textsegment_set,
+ filter_set=params.cardsearch_filter_set,
+ relevance_matters=False,
+ ).boolparts(),
+ )
+ # exclude iri_value docs (possible optimization: separate indexes)
+ _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}})
+ _field = f'card.date_by_propertypath.{_path_field_name(params.valuesearch_propertypath)}'
+ return {
+ 'query': _bool.as_query(),
+ 'size': 0, # ignore hits; just want the aggs
+ 'aggs': {'agg_valuesearch_dates': {
+ 'date_histogram': {
+ 'field': _field,
+ 'calendar_interval': 'year',
+ 'format': 'yyyy',
+ 'order': {'_key': 'desc'},
+ 'min_doc_count': 1,
+ },
+ }}
+ }
+
+
+###
+# assorted helper functions
+
+def _bucketlist(agg_result: dict) -> list[str]:
+ return [
+ _bucket['key']
+ for _bucket in agg_result['buckets']
+ ]
+
+
+def _daterange_value(datevalue: str):
+ _cleanvalue = datevalue.strip()
+ if re.fullmatch(r'\d{4,}', _cleanvalue):
+ return f'{_cleanvalue}||/y'
+ if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue):
+ return f'{_cleanvalue}||/M'
+ if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue):
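+ # note: the `||/y`, `||/M`, `||/d` suffixes are elasticsearch date-math
+ # rounding, so range bounds compare against the whole year/month/day
+ # rather than a single instant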
+ return f'{_cleanvalue}||/d' + raise ValueError(f'bad date value "{datevalue}"') + + +def _depth_field_name(depth: int) -> str: + return f'depth{depth}' + + +def _path_field_name(path: ts.Propertypath) -> str: + return ts.b64(ts.propertypath_as_keyword(path)) + + +def _parse_path_field_name(path_field_name: str) -> ts.Propertypath: + # inverse of propertypath_as_field_name + _list = json.loads(ts.b64_reverse(path_field_name)) + assert isinstance(_list, list) + assert all(isinstance(_item, str) for _item in _list) + return tuple(_list) + + +def _any_query(queries: abc.Collection[dict]): + if len(queries) == 1: + (_query,) = queries + return _query + return {'bool': {'should': list(queries), 'minimum_should_match': 1}} + + +@dataclasses.dataclass +class _SimpleCursor: + start_index: int + page_size: int + result_count: int | None # use -1 to indicate "many more" + + MAX_INDEX: ClassVar[int] = ts.VALUESEARCH_MAX + + @classmethod + def from_page_param(cls, page: PageParam) -> '_SimpleCursor': + if page.cursor: + return decode_cursor_dataclass(page.cursor, cls) + assert page.size is not None + return cls( + start_index=0, + page_size=page.size, + result_count=None, # should be set when results are in + ) + + def next_cursor(self) -> str | None: + if not self.result_count: + return None + _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) + return ( + encode_cursor_dataclass(_next) + if _next.is_valid_cursor() + else None + ) + + def prev_cursor(self) -> str | None: + _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) + return ( + encode_cursor_dataclass(_prev) + if _prev.is_valid_cursor() + else None + ) + + def first_cursor(self) -> str | None: + if self.is_first_page(): + return None + return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) + + def is_first_page(self) -> bool: + return self.start_index == 0 + + def has_many_more(self) -> bool: + return self.result_count == -1 + + def max_index(self) -> int: + return ( + self.MAX_INDEX + if self.has_many_more() + else min(self.result_count or 0, self.MAX_INDEX) + ) + + def is_valid_cursor(self) -> bool: + return 0 <= self.start_index < self.max_index() + + +@dataclasses.dataclass +class _CardsearchCursor(_SimpleCursor): + random_sort: bool # how to sort by relevance to nothingness? randomness! + first_page_pks: tuple[str, ...] 
= () + + MAX_INDEX: ClassVar[int] = ts.CARDSEARCH_MAX + + @classmethod + def from_cardsearch_params(cls, params: CardsearchParams) -> '_CardsearchCursor': + if params.page.cursor: + return decode_cursor_dataclass(params.page.cursor, cls) + assert params.page.size is not None + return cls( + start_index=0, + page_size=params.page.size, + result_count=None, # should be set when results are in + random_sort=( + not params.sort_list + and not params.cardsearch_textsegment_set + ), + ) + + def cardsearch_start_index(self) -> int: + if self.is_first_page() or not self.random_sort: + return self.start_index + return self.start_index - len(self.first_page_pks) + + def first_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() + + def prev_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index 219674385..e24fb0a1a 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -11,6 +11,7 @@ sharev2_elastic5, sharev2_elastic8, trove_indexcard_flats, + trovesearch_denorm, ) @@ -20,6 +21,7 @@ def expected_strategy_classes(): 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, + 'trovesearch_denorm': trovesearch_denorm.TrovesearchDenormIndexStrategy, } diff --git a/tests/share/search/index_strategy/test_trovesearch_denorm.py b/tests/share/search/index_strategy/test_trovesearch_denorm.py new file mode 100644 index 000000000..60a0e9771 --- /dev/null +++ b/tests/share/search/index_strategy/test_trovesearch_denorm.py @@ -0,0 +1,9 @@ +from share.search.index_strategy.trovesearch_denorm import TrovesearchDenormIndexStrategy + +from . 
import _common_trovesearch_tests
+
+
+class TestTrovesearchDenorm(_common_trovesearch_tests.CommonTrovesearchTests):
+ # for RealElasticTestCase
+ def get_index_strategy(self):
+ return TrovesearchDenormIndexStrategy('test_trovesearch_denorm')
diff --git a/trove/exceptions.py b/trove/exceptions.py
index 516f6c200..ff69be61d 100644
--- a/trove/exceptions.py
+++ b/trove/exceptions.py
@@ -70,6 +70,10 @@ class InvalidPropertyPath(RequestParsingError):
 pass
 
 
+class InvalidQueryParams(RequestParsingError):
+ pass
+
+
 ###
 # rendering a response
 
diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py
index 9c459375f..6f20e8b5c 100644
--- a/trove/trovesearch/search_params.py
+++ b/trove/trovesearch/search_params.py
@@ -490,6 +490,18 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict:
 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'),
 }
 
+ def __post_init__(self):
+ if is_date_property(self.valuesearch_propertypath[-1]):
+ # date-value limitations
+ if self.valuesearch_textsegment_set:
+ raise trove_exceptions.InvalidQueryParams(
+ 'valueSearchText may not be used with valueSearchPropertyPath leading to a "date" property',
+ )
+ if self.valuesearch_filter_set:
+ raise trove_exceptions.InvalidQueryParams(
+ 'valueSearchFilter may not be used with valueSearchPropertyPath leading to a "date" property',
+ )
+
 def to_querydict(self):
 _querydict = super().to_querydict()
 _querydict['valueSearchPropertyPath'] = propertypath_key(self.valuesearch_propertypath)
diff --git a/trove/vocab/namespaces.py b/trove/vocab/namespaces.py
index 9402fd26c..73c7438b2 100644
--- a/trove/vocab/namespaces.py
+++ b/trove/vocab/namespaces.py
@@ -10,6 +10,7 @@
 DCAT,
 PROV,
 SKOS,
+ XSD,
 DEFAULT_SHORTHAND,
 )
 
@@ -30,6 +31,7 @@
 'SHAREv2',
 'SKOS',
 'TROVE',
+ 'XSD',
 'NAMESPACES_SHORTHAND',
 )
 
diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py
index 9efa03a32..b7106f3dd 100644
--- a/trove/vocab/trove.py
+++ b/trove/vocab/trove.py
@@ -235,7 +235,7 @@ def trove_browse_link(iri: str):
 TROVE.valueSearchFilter,
 TROVE.pageSize,
 TROVE.pageCursor,
- TROVE.sort,
+ # TROVE.sort,
 # TROVE.include,
 },
 RDFS.label: {literal('index-value-search', language='en')},

From bc00775935328378c379d20457c71bfd1a607c2d Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Fri, 25 Oct 2024 09:18:16 -0400
Subject: [PATCH 04/14] tidy, consolidate, and index supplements

---
 .../index_strategy/_trovesearch_util.py | 7 ++---
 .../index_strategy/trove_indexcard_flats.py | 19 ++----------
 .../index_strategy/trovesearch_denorm.py | 31 +++++++++----------
 share/shell_util.py | 8 ++---
 4 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py
index f051b537c..909290d50 100644
--- a/share/search/index_strategy/_trovesearch_util.py
+++ b/share/search/index_strategy/_trovesearch_util.py
@@ -80,7 +80,7 @@ def latest_rdf_for_indexcard_pks(indexcard_pks):
 return (
 trove_db.LatestIndexcardRdf.objects
 .filter(indexcard_id__in=indexcard_pks)
- .filter(Exists(
+ .filter(Exists( # only index items that have an osfmap_json representation
 trove_db.DerivedIndexcard.objects
 .filter(upriver_indexcard_id=OuterRef('indexcard_id'))
 .filter(deriver_identifier__in=(
@@ -91,6 +91,7 @@ def latest_rdf_for_indexcard_pks(indexcard_pks):
 .exclude(indexcard__deleted__isnull=False)
 .select_related('indexcard__source_record_suid__source_config')
 .prefetch_related('indexcard__focus_identifier_set')
+
.prefetch_related('indexcard__supplementary_rdf_set') ) @@ -120,10 +121,6 @@ def propertypath_as_keyword(path: Propertypath) -> str: ]) -def propertypath_as_field_name(path: Propertypath) -> str: - return b64(propertypath_as_keyword(path)) - - def b64(value: str) -> str: return base64.urlsafe_b64encode(value.encode()).decode() diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 37a01439f..802158975 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -10,7 +10,6 @@ from typing import Iterable, ClassVar, Optional, Iterator from django.conf import settings -from django.db.models import Exists, OuterRef import elasticsearch8 from primitive_metadata import primitive_rdf @@ -41,6 +40,7 @@ from trove.vocab.osfmap import is_date_property from trove.vocab.namespaces import TROVE, RDF, OWL from ._trovesearch_util import ( + latest_rdf_for_indexcard_pks, TITLE_PROPERTIES, NAME_PROPERTIES, LABEL_PROPERTIES, @@ -263,22 +263,7 @@ def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]) } def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - _indexcard_rdf_qs = ( - trove_db.LatestIndexcardRdf.objects - .filter(indexcard_id__in=messages_chunk.target_ids_chunk) - .filter(Exists( - trove_db.DerivedIndexcard.objects - .filter(upriver_indexcard_id=OuterRef('indexcard_id')) - .filter(deriver_identifier__in=( - trove_db.ResourceIdentifier.objects - .queryset_for_iri(TROVE['derive/osfmap_json']) - )) - )) - .exclude(indexcard__deleted__isnull=False) - .select_related('indexcard__source_record_suid__source_config') - .prefetch_related('indexcard__focus_identifier_set') - .prefetch_related('indexcard__supplementary_rdf_set') - ) + _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) for _indexcard_rdf in _indexcard_rdf_qs: _suid = _indexcard_rdf.indexcard.source_record_suid diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 87a3514b9..1f78dc8b2 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -158,10 +158,7 @@ def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Itera # abstract method from Elastic8IndexStrategy def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - _indexcard_rdf_qs = ( - ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - .select_related('indexcard__source_record_suid__source_config') - ) + _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk) for _indexcard_rdf in _indexcard_rdf_qs: _docbuilder = self._SourcedocBuilder(_indexcard_rdf) @@ -244,13 +241,13 @@ class _SourcedocBuilder: ''' indexcard_rdf: trove_db.IndexcardRdf indexcard: trove_db.Indexcard = dataclasses.field(init=False) - rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) focus_iri: str = dataclasses.field(init=False) + rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) def __post_init__(self) -> None: self.indexcard = self.indexcard_rdf.indexcard - self.rdfdoc = rdf.RdfGraph(self.indexcard_rdf.as_rdf_tripledict()) self.focus_iri = self.indexcard_rdf.focus_iri + self.rdfdoc = self.indexcard_rdf.as_rdfdoc_with_supplements() def 
should_skip(self) -> bool: _suid = self.indexcard.source_record_suid @@ -329,7 +326,7 @@ def _propertypaths_present(self, walk: ts.GraphWalk): def _iris_by_propertypath(self, walk: ts.GraphWalk): return { - ts.propertypath_as_field_name(_path): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) + _path_field_name(_path): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) for _path, _iris in walk.iri_values.items() } @@ -344,7 +341,7 @@ def _iris_by_depth(self, walk: ts.GraphWalk): def _texts_by_propertypath(self, walk: ts.GraphWalk): return { - ts.propertypath_as_field_name(_path): list(_value_set) + _path_field_name(_path): list(_value_set) for _path, _value_set in walk.text_values.items() } @@ -363,7 +360,7 @@ def _texts_by_depth(self, walk: ts.GraphWalk): def _dates_by_propertypath(self, walk: ts.GraphWalk): return { - ts.propertypath_as_field_name(_path): [ + _path_field_name(_path): [ _date.isoformat() for _date in _value_set ] @@ -372,7 +369,7 @@ def _dates_by_propertypath(self, walk: ts.GraphWalk): def _ints_by_propertypath(self, walk: ts.GraphWalk): return { - ts.propertypath_as_field_name(_path): list(_value_set) + _path_field_name(_path): list(_value_set) for _path, _value_set in walk.integer_values.items() } @@ -429,7 +426,6 @@ def _valuesearch_response( def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: return ValuesearchResult( value_iri=iri_bucket['key'], - # TODO: get type and text somehow value_type=_bucketlist(iri_bucket.get('agg_type_iri', [])), name_text=_bucketlist(iri_bucket.get('agg_value_name', [])), title_text=_bucketlist(iri_bucket.get('agg_value_title', [])), @@ -449,7 +445,7 @@ def _cardsearch_response( self, cardsearch_params: CardsearchParams, es8_response: dict, - cursor: '_CardsearchCursor', + cursor: _CardsearchCursor, ) -> CardsearchResponse: _es8_total = es8_response['hits']['total'] if _es8_total['relation'] != 'eq': @@ -642,7 +638,7 @@ def _text_field_name(self, propertypath: ts.Propertypath): return ( f'{self.base_field}.text_by_depth.{_depth_field_name(len(propertypath))}' if is_globpath(propertypath) - else f'{self.base_field}.text_by_propertypath.{ts.propertypath_as_field_name(propertypath)}' + else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' ) def _exact_text_query(self, textsegment: Textsegment) -> dict: @@ -864,7 +860,7 @@ def _path_field_name(path: ts.Propertypath) -> str: def _parse_path_field_name(path_field_name: str) -> ts.Propertypath: - # inverse of propertypath_as_field_name + # inverse of _path_field_name _list = json.loads(ts.b64_reverse(path_field_name)) assert isinstance(_list, list) assert all(isinstance(_item, str) for _item in _list) @@ -878,6 +874,9 @@ def _any_query(queries: abc.Collection[dict]): return {'bool': {'should': list(queries), 'minimum_should_match': 1}} +### +# cursor implementations + @dataclasses.dataclass class _SimpleCursor: start_index: int @@ -887,7 +886,7 @@ class _SimpleCursor: MAX_INDEX: ClassVar[int] = ts.VALUESEARCH_MAX @classmethod - def from_page_param(cls, page: PageParam) -> '_SimpleCursor': + def from_page_param(cls, page: PageParam) -> _SimpleCursor: if page.cursor: return decode_cursor_dataclass(page.cursor, cls) assert page.size is not None @@ -945,7 +944,7 @@ class _CardsearchCursor(_SimpleCursor): MAX_INDEX: ClassVar[int] = ts.CARDSEARCH_MAX @classmethod - def from_cardsearch_params(cls, params: CardsearchParams) -> '_CardsearchCursor': + def from_cardsearch_params(cls, params: CardsearchParams) -> _CardsearchCursor: if params.page.cursor: 
return decode_cursor_dataclass(params.page.cursor, cls) assert params.page.size is not None diff --git a/share/shell_util.py b/share/shell_util.py index 305d82d4d..223f13304 100644 --- a/share/shell_util.py +++ b/share/shell_util.py @@ -4,13 +4,13 @@ """ from share import tasks -from share.search import IndexMessenger, IndexStrategy +from share.search import IndexMessenger, index_strategy from share.util import IDObfuscator __all__ = ( - 'tasks', - 'IndexMessenger', - 'IndexStrategy', 'IDObfuscator', + 'IndexMessenger', + 'index_strategy', + 'tasks', ) From 2072e2b60cdbf640ecfbd01ce146f9cc348151ee Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 25 Oct 2024 09:19:35 -0400 Subject: [PATCH 05/14] add tests for supplement and sorting --- .../_common_trovesearch_tests.py | 206 ++++++++++++------ .../test_trove_indexcard_flats.py | 3 + 2 files changed, 145 insertions(+), 64 deletions(-) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index b2a2bbec5..b1820f8e5 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -9,7 +9,7 @@ from share.search import messages from trove import models as trove_db from trove.trovesearch.search_params import CardsearchParams, ValuesearchParams -from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF +from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF, DCAT from ._with_real_services import RealElasticTestCase @@ -53,16 +53,23 @@ def test_for_smoke_with_daemon(self): def test_cardsearch(self): self._fill_test_data_for_querying() - for _queryparams, _expected_result_iris in self.cardsearch_cases(): + _cardsearch_cases = itertools.chain( + self.cardsearch_cases(), + self.cardsearch_integer_cases(), + ) + for _queryparams, _expected_result_iris in _cardsearch_cases: _cardsearch_params = CardsearchParams.from_querystring(urlencode(_queryparams)) assert isinstance(_cardsearch_params, CardsearchParams) _cardsearch_response = self.current_index.pls_handle_cardsearch(_cardsearch_params) # assumes all results fit on one page - _actual_result_iris = { + _actual_result_iris: set[str] | list[str] = [ self._indexcard_focus_by_uuid[_result.card_uuid] for _result in _cardsearch_response.search_result_page - } - self.assertEqual(_expected_result_iris, _actual_result_iris) + ] + # test sort order only when expected results are ordered + if isinstance(_expected_result_iris, set): + _actual_result_iris = set(_actual_result_iris) + self.assertEqual(_expected_result_iris, _actual_result_iris, msg=f'?{_queryparams}') def test_valuesearch(self): self._fill_test_data_for_querying() @@ -82,70 +89,105 @@ def test_valuesearch(self): self.assertEqual(_expected_values, _actual_values) def _fill_test_data_for_querying(self): - self._index_indexcards([ - self._create_indexcard(BLARG.a, { - BLARG.a: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, - DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('aaaa')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, - DCTERMS.references: {BLARG.b, BLARG.c}, - DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... 
nothing valued is here.', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.b: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.references: {BLARG.c}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, - }, - }), - self._create_indexcard(BLARG.b, { - BLARG.b: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.b_same}, - DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.references: {BLARG.c}, - DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, + _card_a = self._create_indexcard(BLARG.a, { + BLARG.a: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, + DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('aaaa')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, + DCTERMS.references: {BLARG.b, BLARG.c}, + DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.b: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.references: {BLARG.c}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }) + _card_b = self._create_indexcard(BLARG.b, { + BLARG.b: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.b_same}, + DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.references: {BLARG.c}, + DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }) + _card_c = self._create_indexcard(BLARG.c, { + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, + DCTERMS.creator: {BLARG.someone_else}, + DCTERMS.title: {rdf.literal('cccc')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. 
This place is best shunned and left uninhabited.', language='en')}, + }, + BLARG.someone_else: { + FOAF.name: {rdf.literal('some one else')}, + }, + }) + self._create_supplement(_card_a, BLARG.a, { + BLARG.a: { + DCTERMS.replaces: {BLARG.a_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), }, - }), - self._create_indexcard(BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, - DCTERMS.creator: {BLARG.someone_else}, - DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, + }, + }) + self._create_supplement(_card_b, BLARG.b, { + BLARG.b: { + DCTERMS.replaces: {BLARG.b_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), }, - BLARG.someone_else: { - FOAF.name: {rdf.literal('some one else')}, + }, + }) + self._create_supplement(_card_c, BLARG.c, { + BLARG.c: { + DCTERMS.replaces: {BLARG.c_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), }, - }), - ]) + }, + }) + self._index_indexcards([_card_a, _card_b, _card_c]) - def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: # using data from _fill_test_data_for_querying + yield ( + {}, # no query params + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'sort': 'dateCreated'}, + [BLARG.a, BLARG.b, BLARG.c], # ordered list + ) + yield ( + {'sort': '-dateCreated'}, + [BLARG.c, BLARG.b, BLARG.a], # ordered list + ) yield ( {'cardSearchFilter[creator]': BLARG.someone}, {BLARG.a, BLARG.b}, @@ -190,6 +232,14 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: {'cardSearchFilter[references.references.subject][is-absent]': ''}, {BLARG.c, BLARG.b}, ) + yield ( + {'cardSearchFilter[dcterms:replaces]': BLARG.b_past}, + {BLARG.b}, + ) + yield ( + {'cardSearchFilter[dcterms:replaces][is-absent]': ''}, + set(), + ) yield ( {'cardSearchFilter[subject]': BLARG.subj_ac}, {BLARG.c, BLARG.a}, @@ -300,6 +350,17 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: {BLARG.b}, ) + def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + # cases that depend on integer values getting indexed + yield ( + {'sort': 'dcat:servesDataset.dcat:spatialResolutionInMeters'}, + [BLARG.b, BLARG.a, BLARG.c], # ordered list + ) + yield ( + {'sort': '-dcat:servesDataset.dcat:spatialResolutionInMeters'}, + [BLARG.c, BLARG.a, BLARG.b], # ordered list + ) + def valuesearch_simple_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: yield ( {'valueSearchPropertyPath': 'references'}, @@ -361,3 +422,20 @@ def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDiction ) self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri return _indexcard + + def _create_supplement( + self, + indexcard: trove_db.Indexcard, + focus_iri: str, + rdf_tripledict: rdf.RdfTripleDictionary, + ) -> trove_db.SupplementaryIndexcardRdf: + _supp_suid = factories.SourceUniqueIdentifierFactory() + _supp_raw = factories.RawDatumFactory(suid=_supp_suid) + return trove_db.SupplementaryIndexcardRdf.objects.create( + from_raw_datum=_supp_raw, + indexcard=indexcard, + supplementary_suid=_supp_suid, 
+ focus_iri=focus_iri, + rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + turtle_checksum_iri='sup', # not enforced + ) diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py index 3cf84ec82..2cccf2d41 100644 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ b/tests/share/search/index_strategy/test_trove_indexcard_flats.py @@ -7,3 +7,6 @@ class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): # for RealElasticTestCase def get_index_strategy(self): return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') + + def cardsearch_integer_cases(self): + yield from () # integers not indexed by this strategy From 00dc003d3957dcf6444964345d26ea783a50b590 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 25 Oct 2024 10:58:37 -0400 Subject: [PATCH 06/14] support sorting along paths and on integers --- .../index_strategy/_trovesearch_util.py | 29 ++-- .../index_strategy/trove_indexcard_flats.py | 2 +- .../index_strategy/trovesearch_denorm.py | 39 +++-- .../_common_trovesearch_tests.py | 4 +- trove/exceptions.py | 4 + trove/trovesearch/search_params.py | 154 ++++++++++++------ trove/vocab/trove.py | 12 +- 7 files changed, 164 insertions(+), 80 deletions(-) diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py index 909290d50..6bfb55004 100644 --- a/share/search/index_strategy/_trovesearch_util.py +++ b/share/search/index_strategy/_trovesearch_util.py @@ -15,6 +15,7 @@ from trove import models as trove_db from trove.trovesearch.search_params import ( is_globpath, + Propertypath, ) from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri from trove.vocab.namespaces import ( @@ -34,12 +35,6 @@ _logger = logging.getLogger(__name__) -### -# type aliases - -Propertypath = tuple[str, ...] 
- - ### # constants @@ -167,18 +162,26 @@ def __post_init__(self): self.iri_values[_walk_path].add(_walk_obj) elif isinstance(_walk_obj, datetime.date): self.date_values[_walk_path].add(_walk_obj) - elif is_date_property(_walk_path[-1]): - try: - _parsed_date = datetime.date.fromisoformat(_walk_obj.unicode_value) - except ValueError: - _logger.debug('skipping malformatted date "%s"', _walk_obj.unicode_value) - else: - self.date_values[_walk_path].add(_parsed_date) + elif isinstance(_walk_obj, int): + self.integer_values[_walk_path].add(_walk_obj) elif isinstance(_walk_obj, rdf.Literal): if XSD.integer in _walk_obj.datatype_iris: self.integer_values[_walk_path].add(_walk_obj) if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris): self.text_values[_walk_path].add(_walk_obj.unicode_value) + # try for date in a date property, regardless of the above + if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)): + _date_str = ( + _walk_obj.unicode_value + if isinstance(_walk_obj, rdf.Literal) + else _walk_obj + ) + try: + _parsed_date = datetime.date.fromisoformat(_date_str) + except ValueError: + _logger.debug('skipping malformatted date "%s"', _date_str) + else: + self.date_values[_walk_path].add(_parsed_date) def shortwalk_from(self, from_iri: str) -> GraphWalk: return GraphWalk( diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 802158975..51aaa9a76 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -665,7 +665,7 @@ def _cardsearch_sort(self, sort_list: tuple[SortParam]): 'path': 'nested_date', 'filter': {'term': { 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - [_sortparam.property_iri], + _sortparam.propertypath, suffuniq=True, ), }}, diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 1f78dc8b2..253b84148 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -24,19 +24,21 @@ from trove import models as trove_db from trove.trovesearch.search_params import ( CardsearchParams, - ValuesearchParams, + PageParam, + Propertypath, SearchFilter, Textsegment, - PageParam, + ValueType, + ValuesearchParams, is_globpath, ) from trove.trovesearch.search_response import ( CardsearchResponse, - ValuesearchResponse, - TextMatchEvidence, CardsearchResult, - ValuesearchResult, PropertypathUsage, + TextMatchEvidence, + ValuesearchResponse, + ValuesearchResult, ) from trove.vocab.osfmap import is_date_property from trove.vocab.namespaces import TROVE, OWL, RDF @@ -202,7 +204,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear index=self.indexname, source=False, # no need to get _source, identifiers are enough docvalue_fields=['card.card_iri'], - highlight={ # TODO: only one field gets highlighted? 
+ highlight={ 'require_field_match': False, 'fields': {'card.text_by_propertypath.*': {}}, }, @@ -592,7 +594,7 @@ def _presence_query(self, search_filter) -> dict: for _path in search_filter.propertypath_set ]) - def _path_presence_query(self, path: ts.Propertypath): + def _path_presence_query(self, path: Propertypath): _field = f'{self.base_field}.propertypaths_present' return {'term': {_field: ts.propertypath_as_keyword(path)}} @@ -634,7 +636,7 @@ def _date_filter_for_path(self, path, filter_operator, value_set): else: raise ValueError(f'invalid date filter operator (got {filter_operator})') - def _text_field_name(self, propertypath: ts.Propertypath): + def _text_field_name(self, propertypath: Propertypath): return ( f'{self.base_field}.text_by_depth.{_depth_field_name(len(propertypath))}' if is_globpath(propertypath) @@ -749,10 +751,19 @@ def _cardsearch_aggs(self): def _cardsearch_sorts(self): for _sortparam in self.params.sort_list: - _path = (_sortparam.property_iri,) - _field = f'card.date_by_propertypath.{_path_field_name(_path)}' - _order = 'desc' if _sortparam.descending else 'asc' - yield {_field: _order} + _fieldkey = _path_field_name(_sortparam.propertypath) + if _sortparam.value_type == ValueType.DATE: + _field = f'card.date_by_propertypath.{_fieldkey}' + _unmapped_type = 'date' + elif _sortparam.value_type == ValueType.INTEGER: + _field = f'card.int_by_propertypath.{_fieldkey}' + _unmapped_type = 'long' + else: + raise ValueError(f'unsupported sort value type: {_sortparam}') + yield {_field: { + 'order': 'desc' if _sortparam.descending else 'asc', + 'unmapped_type': _unmapped_type, + }} def _build_iri_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> dict: @@ -855,11 +866,11 @@ def _depth_field_name(depth: int) -> str: return f'depth{depth}' -def _path_field_name(path: ts.Propertypath) -> str: +def _path_field_name(path: Propertypath) -> str: return ts.b64(ts.propertypath_as_keyword(path)) -def _parse_path_field_name(path_field_name: str) -> ts.Propertypath: +def _parse_path_field_name(path_field_name: str) -> Propertypath: # inverse of _path_field_name _list = json.loads(ts.b64_reverse(path_field_name)) assert isinstance(_list, list) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index b1820f8e5..d912b961c 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -353,11 +353,11 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: # cases that depend on integer values getting indexed yield ( - {'sort': 'dcat:servesDataset.dcat:spatialResolutionInMeters'}, + {'sort[integer-value]': 'dcat:servesDataset.dcat:spatialResolutionInMeters'}, [BLARG.b, BLARG.a, BLARG.c], # ordered list ) yield ( - {'sort': '-dcat:servesDataset.dcat:spatialResolutionInMeters'}, + {'sort[integer-value]': '-dcat:servesDataset.dcat:spatialResolutionInMeters'}, [BLARG.c, BLARG.a, BLARG.b], # ordered list ) diff --git a/trove/exceptions.py b/trove/exceptions.py index ff69be61d..b25e3eb20 100644 --- a/trove/exceptions.py +++ b/trove/exceptions.py @@ -74,6 +74,10 @@ class InvalidQueryParams(RequestParsingError): pass +class InvalidSort(RequestParsingError): + pass + + ### # rendering a response diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 
6f20e8b5c..29f49d0d8 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -1,3 +1,4 @@ +from __future__ import annotations import collections import dataclasses import enum @@ -29,6 +30,11 @@ logger = logging.getLogger(__name__) +### +# type aliases +Propertypath = tuple[str, ...] +PropertypathSet = frozenset[Propertypath] + ### # constants for use in query param parsing @@ -49,9 +55,28 @@ # special path-step that matches any property GLOB_PATHSTEP = '*' -ONE_GLOB_PROPERTYPATH = (GLOB_PATHSTEP,) -DEFAULT_PROPERTYPATH_SET = frozenset([ONE_GLOB_PROPERTYPATH]) +ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) +DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) + + +class ValueType(enum.Enum): + # note: enum values are iris + IRI = TROVE['value-type/iri'] + DATE = TROVE['value-type/date'] + INTEGER = TROVE['value-type/integer'] + + @classmethod + def from_shortname(cls, shortname): + _iri = trove_shorthand().expand_iri(shortname) + return cls(_iri) + + @classmethod + def shortnames(cls): + for _value_type in cls: + yield _value_type.to_shortname() + def to_shortname(self) -> str: + return trove_shorthand().compact_iri(self.value) ### # dataclasses for parsed search-api query parameters @@ -60,15 +85,15 @@ @dataclasses.dataclass(frozen=True) class BaseTroveParams: iri_shorthand: primitive_rdf.IriShorthand = dataclasses.field(repr=False) - include: frozenset[tuple[str, ...]] + include: PropertypathSet accept_mediatype: str | None @classmethod - def from_querystring(cls, querystring: str) -> 'BaseTroveParams': # TODO py3.11: typing.Self + def from_querystring(cls, querystring: str) -> BaseTroveParams: # TODO py3.11: typing.Self return cls.from_queryparams(queryparams_from_querystring(querystring)) @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> 'BaseTroveParams': + def from_queryparams(cls, queryparams: QueryparamDict) -> BaseTroveParams: return cls(**cls.parse_queryparams(queryparams)) @classmethod @@ -115,7 +140,7 @@ class Textsegment: is_fuzzy: bool = True is_negated: bool = False is_openended: bool = False - propertypath_set: frozenset[tuple[str, ...]] = DEFAULT_PROPERTYPATH_SET + propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET def __post_init__(self): if self.is_negated and self.is_fuzzy: @@ -282,7 +307,7 @@ def is_valueless_operator(self): operator: FilterOperator value_set: frozenset[str] - propertypath_set: frozenset[tuple[str, ...]] = DEFAULT_PROPERTYPATH_SET + propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET @classmethod def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): @@ -373,40 +398,68 @@ def as_queryparam(self, queryparam_family: str): @dataclasses.dataclass(frozen=True) class SortParam: - property_iri: str - descending: bool = False + value_type: ValueType + propertypath: Propertypath + descending: bool @classmethod - def sortlist_as_queryparam_value(cls, sort_params): - return join_queryparam_value( - _sort.as_queryparam_value() - for _sort in sort_params - ) + def from_sort_queryparams(cls, queryparams: QueryparamDict) -> tuple[SortParam, ...]: + return tuple(filter(None, ( + cls._from_sort_queryparam(_param_name, _param_value) + for (_param_name, _param_value) + in queryparams.get('sort', ()) + ))) @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> tuple['SortParam', ...]: - _paramvalue = _get_single_value(queryparams, QueryparamName('sort')) - if not _paramvalue or _paramvalue == '-relevance': - 
return () - return tuple(cls._from_sort_param_str(_paramvalue)) + def _from_sort_queryparam( + cls, + param_name: QueryparamName, + param_value: str, + ) -> SortParam | None: + if not param_value or param_value == '-relevance': + return None + _value_type = ValueType.DATE # default + if param_name.bracketed_names: + try: # "sort[]" + (_value_type_str,) = param_name.bracketed_names + if _value_type_str: + _value_type = ValueType.from_shortname(_value_type_str) + if _value_type not in (ValueType.DATE, ValueType.INTEGER): + raise ValueError + except ValueError: + raise trove_exceptions.InvalidQueryParamName(str(param_name), ( + 'valid sort param names: sort,' + f' sort[{ValueType.DATE.to_shortname()}],' + f' sort[{ValueType.INTEGER.to_shortname()}],' + )) + _descending = param_value.startswith(DESCENDING_SORT_PREFIX) + _rawpath = param_value.lstrip(DESCENDING_SORT_PREFIX) + _path = _parse_propertypath(_rawpath, allow_globs=False) + return cls( + value_type=_value_type, + propertypath=_path, + descending=_descending, + ) - @classmethod - def _from_sort_param_str(cls, param_value: str) -> typing.Iterable['SortParam']: - for _sort in split_queryparam_value(param_value): - _sort_property = _sort.lstrip(DESCENDING_SORT_PREFIX) - _property_iri = osfmap_shorthand().expand_iri(_sort_property) - if not is_date_property(_property_iri): - raise trove_exceptions.InvalidQueryParamValue('sort', _sort_property, "may not sort on non-date properties") - yield cls( - property_iri=_property_iri, - descending=param_value.startswith(DESCENDING_SORT_PREFIX), + def __post_init__(self): + if ( + self.value_type == ValueType.DATE + and not is_date_path(self.propertypath) + ): + raise trove_exceptions.InvalidSort( + '='.join(self.as_queryparam()), + 'may not sort by date on a path leading to a non-date property', ) - def as_queryparam_value(self): - _key = propertypath_key((self.property_iri,)) - if self.descending: - return f'-{_key}' - return _key + def as_queryparam(self) -> tuple[str, str]: + _name = ( + 'sort' + if (self.value_type == ValueType.DATE) + else f'sort[{self.value_type.to_shortname()}]' + ) + _pathkey = propertypath_key(self.propertypath) + _value = (f'-{_pathkey}' if self.descending else _pathkey) + return (_name, _value) @dataclasses.dataclass(frozen=True) @@ -415,7 +468,7 @@ class PageParam: size: int | None = None # size is None iff cursor is not None @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> 'PageParam': + def from_page_queryparams(cls, queryparams: QueryparamDict) -> PageParam: _cursor = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) if _cursor: return cls(cursor=_cursor) @@ -433,7 +486,7 @@ class CardsearchParams(BaseTroveParams): index_strategy_name: str | None sort_list: tuple[SortParam] page: PageParam - related_property_paths: tuple[tuple[str, ...]] + related_property_paths: tuple[Propertypath, ...] 
unnamed_iri_values: frozenset[str] @classmethod @@ -444,8 +497,8 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, 'index_strategy_name': _get_single_value(queryparams, QueryparamName('indexStrategy')), - 'sort_list': SortParam.from_queryparams(queryparams), - 'page': PageParam.from_queryparams(queryparams), + 'sort_list': SortParam.from_sort_queryparams(queryparams), + 'page': PageParam.from_page_queryparams(queryparams), 'include': None, # TODO 'related_property_paths': _get_related_property_paths(_filter_set), 'unnamed_iri_values': frozenset(), # TODO: frozenset(_get_unnamed_iri_values(_filter_set)), @@ -455,8 +508,9 @@ def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('cardSearchText', self.cardsearch_textsegment_set): _querydict[_qp_name] = _qp_value - if self.sort_list: - _querydict['sort'] = SortParam.sortlist_as_queryparam_value(self.sort_list) + for _sort in self.sort_list: + _qp_name, _qp_value = _sort.as_queryparam() + _querydict.appendlist(_qp_name, _qp_value) if self.page.cursor: _querydict['page[cursor]'] = self.page.cursor elif self.page.size != DEFAULT_PAGE_SIZE: @@ -473,7 +527,7 @@ def to_querydict(self) -> QueryDict: class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch - valuesearch_propertypath: tuple[str, ...] + valuesearch_propertypath: Propertypath valuesearch_textsegment_set: frozenset[Textsegment] valuesearch_filter_set: frozenset[SearchFilter] @@ -526,14 +580,18 @@ def valuesearch_type_iris(self): ### # helper functions -def is_globpath(path: tuple[str, ...]) -> bool: +def is_globpath(path: Propertypath) -> bool: return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) -def make_globpath(length: int) -> tuple[str, ...]: +def make_globpath(length: int) -> Propertypath: return ONE_GLOB_PROPERTYPATH * length +def is_date_path(path: Propertypath) -> bool: + return bool(path) and is_date_property(path[-1]) + + def propertypathstep_key(pathstep: str) -> str: if pathstep == GLOB_PATHSTEP: return pathstep @@ -541,14 +599,14 @@ def propertypathstep_key(pathstep: str) -> str: return urllib.parse.quote(osfmap_shorthand().compact_iri(pathstep)) -def propertypath_key(property_path: tuple[str, ...]) -> str: +def propertypath_key(property_path: Propertypath) -> str: return PROPERTYPATH_DELIMITER.join( propertypathstep_key(_pathstep) for _pathstep in property_path ) -def propertypath_set_key(propertypath_set: frozenset[tuple[str, ...]]) -> str: +def propertypath_set_key(propertypath_set: PropertypathSet) -> str: return join_queryparam_value( propertypath_key(_propertypath) for _propertypath in propertypath_set @@ -585,7 +643,7 @@ def _get_single_value( return _singlevalue -def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> frozenset[tuple[str, ...]]: +def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> PropertypathSet: # comma-delimited set of dot-delimited paths return frozenset( _parse_propertypath(_path, allow_globs=allow_globs) @@ -593,7 +651,7 @@ def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> fr ) -def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str, ...]: +def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> 
Propertypath:
     _path = tuple(
         osfmap_shorthand().expand_iri(_pathstep)
         for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER)
     )
@@ -609,7 +667,7 @@ def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str,
     return _path
 
 
-def _get_related_property_paths(filter_set) -> tuple[tuple[str, ...], ...]:
+def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]:
     # hard-coded for osf.io search pages, static list per type
     # TODO: replace with some dynamism, maybe a 'significant_terms' aggregation
     _type_iris = set()
diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py
index b7106f3dd..e188e85bb 100644
--- a/trove/vocab/trove.py
+++ b/trove/vocab/trove.py
@@ -799,9 +799,17 @@ def trove_browse_link(iri: str):
         RDF.type: {TROVE.FilterOperator},
         JSONAPI_MEMBERNAME: {literal('after', language='en')},
     },
-    TROVE['at-date']: {
-        RDF.type: {TROVE.FilterOperator},
-        JSONAPI_MEMBERNAME: {literal('at-date', language='en')},
+    TROVE['value-type/iri']: {
+        RDF.type: {TROVE.ValueType},
+        JSONAPI_MEMBERNAME: {literal('iri-value', language='en')},
+    },
+    TROVE['value-type/date']: {
+        RDF.type: {TROVE.ValueType},
+        JSONAPI_MEMBERNAME: {literal('date-value', language='en')},
+    },
+    TROVE['value-type/integer']: {
+        RDF.type: {TROVE.ValueType},
+        JSONAPI_MEMBERNAME: {literal('integer-value', language='en')},
     },
 
     # other:

From 6561e0a163f03e570f74659da965fa83a5dbba41 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Fri, 25 Oct 2024 17:02:03 -0400
Subject: [PATCH 07/14] update sort docs

---
 trove/vocab/trove.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py
index e188e85bb..0e8bd9da2 100644
--- a/trove/vocab/trove.py
+++ b/trove/vocab/trove.py
@@ -656,17 +656,27 @@ def trove_browse_link(iri: str):
         RDFS.comment: {literal('how to order search results', language='en')},
         TROVE.jsonSchema: {literal_json({'type': 'string'})},
         DCTERMS.description: {_literal_markdown(f'''**sort** is
-a query param to control ordering of search results
+a query param to control ordering of search results based on values of a specific type at a specific path.
 
-accepts a short-hand iri for a date property:
+to sort by date values, use `sort` (or `sort[date-value]`) with a **property-path** that ends with
+one of the following supported date properties:
 {", ".join(f"`{osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in DATE_PROPERTIES)}
 
-prefix with `-` to sort descending (latest first), otherwise sorts ascending (earliest first)
+to sort by integer values, use `sort[integer-value]` with a **property-path** to the integers of interest.
 
-if missing (or if `sort=-relevance`), results are sorted by some notion of
+by default, sorts "ascending" (beginning with earliest date or smallest integer) --
+prefix the value with `-` to sort "descending" (beginning with latest date or largest integer).
+
+if missing (or with value `-relevance`), results are sorted by some notion of
 relevance to the request's search-text or (if no search-text) by random.
 
-may not be used with `page[cursor]`
+may not be used with `page[cursor]`.
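+
+for example (these exact query params are exercised by this series' tests in `_common_trovesearch_tests.py`):
+
+- `sort=dateCreated` (earliest first)
+- `sort=-dateCreated` (latest first)
+- `sort[integer-value]=dcat:servesDataset.dcat:spatialResolutionInMeters` (smallest first)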
''', language='en')}, }, From 727d6c03941860bfb0aee998c5c4835fd98d5790 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 25 Oct 2024 17:29:59 -0400 Subject: [PATCH 08/14] fix: card.pk => card.card_pk --- share/search/index_strategy/trovesearch_denorm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 253b84148..7478e27f2 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -151,7 +151,7 @@ def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Itera self.es8_client.delete_by_query( index=list(indexnames), query={'bool': {'must': [ - {'terms': {'card.pk': messages_chunk.target_ids_chunk}}, + {'terms': {'card.card_pk': messages_chunk.target_ids_chunk}}, {'exists': {'field': 'iri_value.value_iri'}}, ]}}, ) @@ -718,7 +718,7 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'random_score': {}, # default random_score is fast and unpredictable }, } - _firstpage_filter = {'terms': {'card.pk': self.cursor.first_page_pks}} + _firstpage_filter = {'terms': {'card.card_pk': self.cursor.first_page_pks}} if self.cursor.is_first_page(): # returning to a first page previously visited _bool.add_boolpart('filter', _firstpage_filter) @@ -731,7 +731,7 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'boost_mode': 'replace', 'random_score': { 'seed': ''.join(self.cursor.first_page_pks), - 'field': 'card.pk', + 'field': 'card.card_pk', }, }, } From b91966d88e487aebb9c041b481a747d341a1d7c4 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 5 Nov 2024 11:14:26 -0500 Subject: [PATCH 09/14] wip --- share/search/index_strategy/_cursor.py | 132 ++++++++++++++++++ share/search/index_strategy/_util.py | 14 -- .../index_strategy/trove_indexcard_flats.py | 83 ----------- .../index_strategy/trovesearch_denorm.py | 106 +------------- 4 files changed, 136 insertions(+), 199 deletions(-) create mode 100644 share/search/index_strategy/_cursor.py diff --git a/share/search/index_strategy/_cursor.py b/share/search/index_strategy/_cursor.py new file mode 100644 index 000000000..852df2aee --- /dev/null +++ b/share/search/index_strategy/_cursor.py @@ -0,0 +1,132 @@ +from __future__ import annotations +import base64 +import dataclasses +import json +import typing + +from ._trovesearch_util import ( + VALUESEARCH_MAX, + CARDSEARCH_MAX, +) + +if typing.TYPE_CHECKING: + from trove.trovesearch.search_params import ( + CardsearchParams, + PageParam, + ) + +__all__ = ('OffsetCursor', 'CardsearchCursor') + + +_SomeDataclass = typing.TypeVar('_SomeDataclass') + + +@dataclasses.dataclass +class PageCursor: + page_size: int + + def as_queryparam_value(self) -> str: + _as_json = json.dumps(dataclasses.astuple(self)) + _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) + return _cursor_bytes.decode() + + @classmethod + def from_queryparam_value(cls, cursor_value: str): + _as_list = json.loads(base64.urlsafe_b64decode(cursor_value)) + return cls(*_as_list) + + +@dataclasses.dataclass +class OffsetCursor(PageCursor): + start_index: int + result_count: int | None # use -1 to indicate "many more" + + MAX_INDEX: typing.ClassVar[int] = VALUESEARCH_MAX + + @classmethod + def from_page_param(cls, page: PageParam) -> OffsetCursor: + if page.cursor: + return cls.from_value(page.cursor) + assert page.size is not None + return cls( + start_index=0, + page_size=page.size, + 
result_count=None, # should be set when results are in + ) + + def next_cursor(self) -> str | None: + if not self.result_count: + return None + _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) + return ( + encode_cursor_dataclass(_next) + if _next.is_valid_cursor() + else None + ) + + def prev_cursor(self) -> str | None: + _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) + return ( + encode_cursor_dataclass(_prev) + if _prev.is_valid_cursor() + else None + ) + + def first_cursor(self) -> str | None: + if self.is_first_page(): + return None + return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) + + def is_first_page(self) -> bool: + return self.start_index == 0 + + def has_many_more(self) -> bool: + return self.result_count == -1 + + def max_index(self) -> int: + return ( + self.MAX_INDEX + if self.has_many_more() + else min(self.result_count or 0, self.MAX_INDEX) + ) + + def is_valid_cursor(self) -> bool: + return 0 <= self.start_index < self.max_index() + + +@dataclasses.dataclass +class CardsearchCursor(OffsetCursor): + random_sort: bool # how to sort by relevance to nothingness? randomness! + first_page_pks: tuple[str, ...] = () + + MAX_INDEX: typing.ClassVar[int] = CARDSEARCH_MAX + + @classmethod + def from_cardsearch_params(cls, params: CardsearchParams) -> CardsearchCursor: + if params.page.cursor: + return decode_cursor_dataclass(params.page.cursor, cls) + assert params.page.size is not None + return cls( + start_index=0, + page_size=params.page.size, + result_count=None, # should be set when results are in + random_sort=( + not params.sort_list + and not params.cardsearch_textsegment_set + ), + ) + + def cardsearch_start_index(self) -> int: + if self.is_first_page() or not self.random_sort: + return self.start_index + return self.start_index - len(self.first_page_pks) + + def first_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() + + def prev_cursor(self) -> str | None: + if self.random_sort and not self.first_page_pks: + return None + return super().prev_cursor() diff --git a/share/search/index_strategy/_util.py b/share/search/index_strategy/_util.py index 5b3586006..e6f0cafca 100644 --- a/share/search/index_strategy/_util.py +++ b/share/search/index_strategy/_util.py @@ -13,17 +13,3 @@ def timestamp_to_readable_datetime(timestamp_in_milliseconds): .fromtimestamp(seconds, tz=datetime.timezone.utc) .isoformat(timespec='minutes') ) - - -def encode_cursor_dataclass(dataclass_instance) -> str: - _as_json = json.dumps(dataclasses.astuple(dataclass_instance)) - _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) - return _cursor_bytes.decode() - - -_SomeDataclass = typing.TypeVar('_SomeDataclass') - - -def decode_cursor_dataclass(cursor: str, dataclass_class: type[_SomeDataclass]) -> _SomeDataclass: - _as_list = json.loads(base64.urlsafe_b64decode(cursor)) - return dataclass_class(*_as_list) diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 51aaa9a76..f0adf8a59 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -912,89 +912,6 @@ def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} -@dataclasses.dataclass -class _SimpleCursor: - start_index: int - page_size: 
int - result_count: int | None # use -1 to indicate "many more" - - MAX_INDEX: ClassVar[int] = VALUESEARCH_MAX - - @classmethod - def from_page_param(cls, page: PageParam) -> '_SimpleCursor': - if page.cursor: - return decode_cursor_dataclass(page.cursor, cls) - return cls( - start_index=0, - page_size=page.size, - result_count=None, # should be set when results are in - ) - - def next_cursor(self) -> str | None: - if not self.result_count: - return None - _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) - return ( - encode_cursor_dataclass(_next) - if _next.is_valid_cursor() - else None - ) - - def prev_cursor(self) -> str | None: - _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) - return ( - encode_cursor_dataclass(_prev) - if _prev.is_valid_cursor() - else None - ) - - def first_cursor(self) -> str | None: - if self.is_first_page(): - return None - return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) - - def is_first_page(self) -> bool: - return self.start_index == 0 - - def has_many_more(self) -> bool: - return self.result_count == -1 - - def max_index(self) -> int: - return ( - self.MAX_INDEX - if self.has_many_more() - else min(self.result_count, self.MAX_INDEX) - ) - - def is_valid_cursor(self) -> bool: - return 0 <= self.start_index < self.max_index() - - -@dataclasses.dataclass -class _CardsearchCursor(_SimpleCursor): - random_sort: bool # how to sort by relevance to nothingness? randomness! - first_page_uuids: tuple[str, ...] = () - - MAX_INDEX: ClassVar[int] = CARDSEARCH_MAX - - @classmethod - def from_params(cls, params: CardsearchParams) -> '_CardsearchCursor': - if params.page.cursor: - return decode_cursor_dataclass(params.page.cursor, cls) - return cls( - start_index=0, - page_size=params.page.size, - result_count=None, # should be set when results are in - random_sort=( - not params.sort_list - and not params.cardsearch_textsegment_set - ), - ) - - def cardsearch_start_index(self) -> int: - if self.is_first_page() or not self.random_sort: - return self.start_index - return self.start_index - len(self.first_page_uuids) class _PredicatePathWalker: diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 7478e27f2..670e51119 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -217,7 +217,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear # abstract method from IndexStrategy.SpecificIndex def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: _path = valuesearch_params.valuesearch_propertypath - _cursor = _SimpleCursor.from_page_param(valuesearch_params.page) + _cursor = OffsetCursor.from_page_param(valuesearch_params.page) _query = ( _build_date_valuesearch(valuesearch_params, _cursor) if is_date_property(_path[-1]) @@ -389,7 +389,7 @@ def _valuesearch_response( self, valuesearch_params: ValuesearchParams, es8_response: dict, - cursor: _SimpleCursor, + cursor: OffsetCursor, ) -> ValuesearchResponse: _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') if _iri_aggs: @@ -766,7 +766,7 @@ def _cardsearch_sorts(self): }} -def _build_iri_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> dict: +def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> dict: _path = params.valuesearch_propertypath _bool = _BoolBuilder() 
_bool.add_boolpart('filter', {'term': { @@ -811,7 +811,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> } -def _build_date_valuesearch(params: ValuesearchParams, cursor: _SimpleCursor) -> dict: +def _build_date_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> dict: assert not params.valuesearch_textsegment_set assert not params.valuesearch_filter_set _bool = _BoolBuilder() @@ -885,101 +885,3 @@ def _any_query(queries: abc.Collection[dict]): return {'bool': {'should': list(queries), 'minimum_should_match': 1}} -### -# cursor implementations - -@dataclasses.dataclass -class _SimpleCursor: - start_index: int - page_size: int - result_count: int | None # use -1 to indicate "many more" - - MAX_INDEX: ClassVar[int] = ts.VALUESEARCH_MAX - - @classmethod - def from_page_param(cls, page: PageParam) -> _SimpleCursor: - if page.cursor: - return decode_cursor_dataclass(page.cursor, cls) - assert page.size is not None - return cls( - start_index=0, - page_size=page.size, - result_count=None, # should be set when results are in - ) - - def next_cursor(self) -> str | None: - if not self.result_count: - return None - _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) - return ( - encode_cursor_dataclass(_next) - if _next.is_valid_cursor() - else None - ) - - def prev_cursor(self) -> str | None: - _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) - return ( - encode_cursor_dataclass(_prev) - if _prev.is_valid_cursor() - else None - ) - - def first_cursor(self) -> str | None: - if self.is_first_page(): - return None - return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) - - def is_first_page(self) -> bool: - return self.start_index == 0 - - def has_many_more(self) -> bool: - return self.result_count == -1 - - def max_index(self) -> int: - return ( - self.MAX_INDEX - if self.has_many_more() - else min(self.result_count or 0, self.MAX_INDEX) - ) - - def is_valid_cursor(self) -> bool: - return 0 <= self.start_index < self.max_index() - - -@dataclasses.dataclass -class _CardsearchCursor(_SimpleCursor): - random_sort: bool # how to sort by relevance to nothingness? randomness! - first_page_pks: tuple[str, ...] 
= () - - MAX_INDEX: ClassVar[int] = ts.CARDSEARCH_MAX - - @classmethod - def from_cardsearch_params(cls, params: CardsearchParams) -> _CardsearchCursor: - if params.page.cursor: - return decode_cursor_dataclass(params.page.cursor, cls) - assert params.page.size is not None - return cls( - start_index=0, - page_size=params.page.size, - result_count=None, # should be set when results are in - random_sort=( - not params.sort_list - and not params.cardsearch_textsegment_set - ), - ) - - def cardsearch_start_index(self) -> int: - if self.is_first_page() or not self.random_sort: - return self.start_index - return self.start_index - len(self.first_page_pks) - - def first_cursor(self) -> str | None: - if self.random_sort and not self.first_page_pks: - return None - return super().prev_cursor() - - def prev_cursor(self) -> str | None: - if self.random_sort and not self.first_page_pks: - return None - return super().prev_cursor() From 7873eecfbb53ed9d6ca94101d2882f805eb4be03 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 7 Nov 2024 10:06:21 -0500 Subject: [PATCH 10/14] pks=ids=uuids --- share/search/index_strategy/_util.py | 4 - .../index_strategy/trove_indexcard_flats.py | 16 ++-- .../index_strategy/trovesearch_denorm.py | 16 ++-- .../trovesearch}/_cursor.py | 90 +++++++++---------- trove/trovesearch/search_response.py | 5 +- 5 files changed, 61 insertions(+), 70 deletions(-) rename {share/search/index_strategy => trove/trovesearch}/_cursor.py (57%) diff --git a/share/search/index_strategy/_util.py b/share/search/index_strategy/_util.py index e6f0cafca..1908536a4 100644 --- a/share/search/index_strategy/_util.py +++ b/share/search/index_strategy/_util.py @@ -1,8 +1,4 @@ -import base64 -import dataclasses import datetime -import json -import typing def timestamp_to_readable_datetime(timestamp_in_milliseconds): diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index f0adf8a59..10d7e55d6 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -386,7 +386,7 @@ def _cardsearch_query( if not cardsearch_cursor or not cardsearch_cursor.random_sort: # no need for randomness return {'bool': _bool_query} - if not cardsearch_cursor.first_page_uuids: + if not cardsearch_cursor.first_page_ids: # independent random sample return { 'function_score': { @@ -395,7 +395,7 @@ def _cardsearch_query( 'random_score': {}, # default random_score is fast and unpredictable }, } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_uuids}} + _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} if cardsearch_cursor.is_first_page(): # returning to a first page previously visited _bool_query['filter'].append(_firstpage_uuid_query) @@ -407,7 +407,7 @@ def _cardsearch_query( 'query': {'bool': _bool_query}, 'boost_mode': 'replace', 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_uuids), + 'seed': ''.join(cardsearch_cursor.first_page_ids), 'field': 'indexcard_uuid', }, }, @@ -687,7 +687,7 @@ def _cardsearch_response( cursor.result_count = _es8_total['value'] if cursor.random_sort and not cursor.is_first_page(): # account for the filtered-out first page - cursor.result_count += len(cursor.first_page_uuids) + cursor.result_count += len(cursor.first_page_ids) _results = [] for _es8_hit in es8_response['hits']['hits']: _card_iri = _es8_hit['_id'] @@ -695,25 +695,25 @@ def _cardsearch_response( 
card_iri=_card_iri, text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), )) - if cursor.is_first_page() and cursor.first_page_uuids: + if cursor.is_first_page() and cursor.first_page_ids: # revisiting first page; reproduce original random order _uuid_index = { _uuid: _i - for (_i, _uuid) in enumerate(cursor.first_page_uuids) + for (_i, _uuid) in enumerate(cursor.first_page_ids) } _results.sort(key=lambda _r: _uuid_index[_r.card_uuid]) else: _should_start_reproducible_randomness = ( cursor.random_sort and cursor.is_first_page() - and not cursor.first_page_uuids + and not cursor.first_page_ids and any( not _filter.is_type_filter() # look for a non-default filter for _filter in cardsearch_params.cardsearch_filter_set ) ) if _should_start_reproducible_randomness: - cursor.first_page_uuids = tuple( + cursor.first_page_ids = tuple( _result.card_uuid for _result in _results ) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 670e51119..2c1e3cf8c 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -457,7 +457,7 @@ def _cardsearch_response( if cursor.random_sort and not cursor.is_first_page(): # account for the filtered-out first page assert cursor.result_count is not None - cursor.result_count += len(cursor.first_page_pks) + cursor.result_count += len(cursor.first_page_ids) _results = [] for _es8_hit in es8_response['hits']['hits']: _card_iri = _es8_hit['fields']['card.card_iri'][0] @@ -466,18 +466,18 @@ def _cardsearch_response( card_pk=_es8_hit['_id'], text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), )) - if cursor.is_first_page() and cursor.first_page_pks: + if cursor.is_first_page() and cursor.first_page_ids: # revisiting first page; reproduce original random order _ordering_by_id = { _id: _i - for (_i, _id) in enumerate(cursor.first_page_pks) + for (_i, _id) in enumerate(cursor.first_page_ids) } _results.sort(key=lambda _r: _ordering_by_id[_r.card_pk]) else: _should_start_reproducible_randomness = ( cursor.random_sort and cursor.is_first_page() - and not cursor.first_page_pks + and not cursor.first_page_ids and not cursor.has_many_more() and any( not _filter.is_type_filter() # look for a non-default filter @@ -485,7 +485,7 @@ def _cardsearch_response( ) ) if _should_start_reproducible_randomness: - cursor.first_page_pks = tuple(_result.card_pk for _result in _results) + cursor.first_page_ids = tuple(_result.card_pk for _result in _results) _relatedproperty_list: list[PropertypathUsage] = [] if cardsearch_params.related_property_paths: _relatedproperty_list.extend( @@ -709,7 +709,7 @@ def _cardsearch_query(self) -> dict: ) def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: - if not self.cursor.first_page_pks: + if not self.cursor.first_page_ids: # independent random sample return { 'function_score': { @@ -718,7 +718,7 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'random_score': {}, # default random_score is fast and unpredictable }, } - _firstpage_filter = {'terms': {'card.card_pk': self.cursor.first_page_pks}} + _firstpage_filter = {'terms': {'card.card_pk': self.cursor.first_page_ids}} if self.cursor.is_first_page(): # returning to a first page previously visited _bool.add_boolpart('filter', _firstpage_filter) @@ -730,7 +730,7 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'query': _bool.as_query(), 'boost_mode': 'replace', 'random_score': { - 'seed': 
''.join(self.cursor.first_page_pks), + 'seed': ''.join(self.cursor.first_page_ids), 'field': 'card.card_pk', }, }, diff --git a/share/search/index_strategy/_cursor.py b/trove/trovesearch/_cursor.py similarity index 57% rename from share/search/index_strategy/_cursor.py rename to trove/trovesearch/_cursor.py index 852df2aee..8deedbe94 100644 --- a/share/search/index_strategy/_cursor.py +++ b/trove/trovesearch/_cursor.py @@ -15,15 +15,16 @@ PageParam, ) -__all__ = ('OffsetCursor', 'CardsearchCursor') +__all__ = ('BasicCursor', 'OffsetCursor', 'CardsearchCursor') +_MANY_MORE = -1 # special count value _SomeDataclass = typing.TypeVar('_SomeDataclass') @dataclasses.dataclass -class PageCursor: - page_size: int +class BasicCursor: + sample_size: int def as_queryparam_value(self) -> str: _as_json = json.dumps(dataclasses.astuple(self)) @@ -35,47 +36,42 @@ def from_queryparam_value(cls, cursor_value: str): _as_list = json.loads(base64.urlsafe_b64decode(cursor_value)) return cls(*_as_list) + @classmethod + def from_page_param(cls, page: PageParam) -> BasicCursor: + if page.cursor: + return cls.from_queryparam_value(page.cursor) + assert page.size is not None + return cls(sample_size=page.size) + + def next_cursor(self) -> typing.Self | None: + return None + + def prev_cursor(self) -> typing.Self | None: + return None + + def first_cursor(self) -> typing.Self | None: + return None + @dataclasses.dataclass -class OffsetCursor(PageCursor): - start_index: int - result_count: int | None # use -1 to indicate "many more" +class OffsetCursor(BasicCursor): + start_index: int = 0 + result_count: int = _MANY_MORE MAX_INDEX: typing.ClassVar[int] = VALUESEARCH_MAX - @classmethod - def from_page_param(cls, page: PageParam) -> OffsetCursor: - if page.cursor: - return cls.from_value(page.cursor) - assert page.size is not None - return cls( - start_index=0, - page_size=page.size, - result_count=None, # should be set when results are in - ) + def next_cursor(self) -> typing.Self | None: + _next = dataclasses.replace(self, start_index=(self.start_index + self.sample_size)) + return (_next if _next.is_valid_cursor() else None) - def next_cursor(self) -> str | None: - if not self.result_count: - return None - _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size)) - return ( - encode_cursor_dataclass(_next) - if _next.is_valid_cursor() - else None - ) + def prev_cursor(self) -> typing.Self | None: + _prev = dataclasses.replace(self, start_index=(self.start_index - self.sample_size)) + return (_prev if _prev.is_valid_cursor() else None) - def prev_cursor(self) -> str | None: - _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size)) - return ( - encode_cursor_dataclass(_prev) - if _prev.is_valid_cursor() - else None - ) - - def first_cursor(self) -> str | None: + def first_cursor(self) -> typing.Self | None: if self.is_first_page(): return None - return encode_cursor_dataclass(dataclasses.replace(self, start_index=0)) + return dataclasses.replace(self, start_index=0) def is_first_page(self) -> bool: return self.start_index == 0 @@ -96,37 +92,37 @@ def is_valid_cursor(self) -> bool: @dataclasses.dataclass class CardsearchCursor(OffsetCursor): - random_sort: bool # how to sort by relevance to nothingness? randomness! - first_page_pks: tuple[str, ...] = () + random_sort: bool = True # how to sort by relevance to nothingness? randomness! + first_page_ids: tuple[str, ...] 
= ()
 
     MAX_INDEX: typing.ClassVar[int] = CARDSEARCH_MAX
 
     @classmethod
     def from_cardsearch_params(cls, params: CardsearchParams) -> CardsearchCursor:
         if params.page.cursor:
-            return decode_cursor_dataclass(params.page.cursor, cls)
+            return cls.from_queryparam_value(params.page.cursor)
         assert params.page.size is not None
         return cls(
-            start_index=0,
-            page_size=params.page.size,
-            result_count=None,  # should be set when results are in
+            sample_size=params.page.size,
             random_sort=(
                 not params.sort_list
                 and not params.cardsearch_textsegment_set
             ),
+            start_index=0,
         )
 
     def cardsearch_start_index(self) -> int:
         if self.is_first_page() or not self.random_sort:
             return self.start_index
-        return self.start_index - len(self.first_page_pks)
+        return self.start_index - len(self.first_page_ids)
 
-    def first_cursor(self) -> str | None:
-        if self.random_sort and not self.first_page_pks:
+    def first_cursor(self) -> typing.Self | None:
+        if self.random_sort and not self.first_page_ids:
             return None
-        return super().prev_cursor()
+        return super().first_cursor()
 
-    def prev_cursor(self) -> str | None:
-        if self.random_sort and not self.first_page_pks:
+    def prev_cursor(self) -> typing.Self | None:
+        if self.random_sort and not self.first_page_ids:
             return None
         return super().prev_cursor()
+
diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py
index ee09f5c0d..8382822c4 100644
--- a/trove/trovesearch/search_response.py
+++ b/trove/trovesearch/search_response.py
@@ -5,6 +5,7 @@
 
 from trove.vocab.namespaces import TROVE
 from trove.vocab.trove import trove_indexcard_namespace
+from trove.trovesearch._cursor import BasicCursor
 
 
 BoundedCount = Union[
@@ -42,9 +43,7 @@ class CardsearchResponse:
     total_result_count: BoundedCount
     search_result_page: Iterable[CardsearchResult]
-    next_page_cursor: Optional[str]
-    prev_page_cursor: Optional[str]
-    first_page_cursor: Optional[str]
+    cursor: BasicCursor | None
     related_propertypath_results: Iterable['PropertypathUsage']
 

From aeff22abaf8e32140498e34079fda9ece3b2581f Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Thu, 7 Nov 2024 10:08:35 -0500
Subject: [PATCH 11/14] page_cursor

---
 .../index_strategy/trove_indexcard_flats.py   |  45 +++---
 .../index_strategy/trovesearch_denorm.py      |  46 ++++---
 trove/exceptions.py                           |   4 +
 trove/trovesearch/_cursor.py                  | 128 ------------------
 trove/trovesearch/page_cursor.py              | 123 +++++++++++++++++
 trove/trovesearch/search_response.py          |  30 +++-
 6 files changed, 203 insertions(+), 173 deletions(-)
 delete mode 100644 trove/trovesearch/_cursor.py
 create mode 100644 trove/trovesearch/page_cursor.py

diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py
index 10d7e55d6..574e03480 100644
--- a/share/search/index_strategy/trove_indexcard_flats.py
+++ b/share/search/index_strategy/trove_indexcard_flats.py
@@ -7,7 +7,7 @@ import logging
 import re
 import uuid
-from typing import Iterable, ClassVar, Optional, Iterator
+from typing import Iterable, Optional, Iterator
 
 from django.conf import settings
 import elasticsearch8
@@ -16,16 +16,18 @@ from share.search import exceptions
 from share.search import messages
 from share.search.index_strategy.elastic8 import Elastic8IndexStrategy
-from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass
 from share.util.checksum_iri import ChecksumIri
 from trove import models as trove_db
+from trove.trovesearch.page_cursor import (
+    OffsetCursor,
+    CardsearchCursor,
+)
 from 
trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, SearchFilter, Textsegment, SortParam, - PageParam, GLOB_PATHSTEP, ) from trove.trovesearch.search_response import ( @@ -45,8 +47,6 @@ NAME_PROPERTIES, LABEL_PROPERTIES, NAMELIKE_PROPERTIES, - VALUESEARCH_MAX, - CARDSEARCH_MAX, KEYWORD_LENGTH_MAX, SKIPPABLE_PROPERTIES, ) @@ -294,18 +294,23 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query ) def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse: - _cursor = _CardsearchCursor.from_params(cardsearch_params) + _cursor = CardsearchCursor.from_params(cardsearch_params) _sort = self._cardsearch_sort(cardsearch_params.sort_list) _query = self._cardsearch_query( cardsearch_params.cardsearch_filter_set, cardsearch_params.cardsearch_textsegment_set, cardsearch_cursor=_cursor, ) + _from_offset = ( + _cursor.start_offset + if _cursor.is_first_page() + else _cursor.start_offset - len(_cursor.first_page_ids) + ) _search_kwargs = dict( query=_query, aggs=self._cardsearch_aggs(cardsearch_params), sort=_sort, - from_=_cursor.cardsearch_start_index(), + from_=_from_offset, size=_cursor.page_size, source=False, # no need to get _source; _id is enough ) @@ -321,7 +326,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear return self._cardsearch_response(cardsearch_params, _es8_response, _cursor) def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: - _cursor = _SimpleCursor.from_page_param(valuesearch_params.page) + _cursor = OffsetCursor.from_page_param(valuesearch_params.page) _is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1]) _search_kwargs = dict( query=self._cardsearch_query( @@ -356,7 +361,7 @@ def _cardsearch_query( self, filter_set, textsegment_set, *, additional_filters=None, - cardsearch_cursor: Optional['_CardsearchCursor'] = None, + cardsearch_cursor: Optional[CardsearchCursor] = None, ) -> dict: _bool_query = { 'filter': additional_filters or [], @@ -426,7 +431,7 @@ def _cardsearch_aggs(self, cardsearch_params): }} return _aggs - def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: '_SimpleCursor'): + def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): _nested_iri_bool = { 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( valuesearch_params.valuesearch_propertypath, @@ -439,7 +444,7 @@ def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: ' _nested_terms_agg = { 'field': 'nested_iri.iri_value', # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_index + cursor.page_size + 1, + 'size': cursor.start_offset + cursor.page_size + 1, } _iris = list(valuesearch_params.valuesearch_iris()) if _iris: @@ -520,15 +525,15 @@ def _valuesearch_response( self, valuesearch_params: ValuesearchParams, es8_response: dict, - cursor: '_SimpleCursor', + cursor: OffsetCursor, ): _iri_aggs = es8_response['aggregations'].get('in_nested_iri') if _iri_aggs: _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] _bucket_count = len(_buckets) # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_index + cursor.page_size - _bucket_page = _buckets[cursor.start_index:_page_end_index] # discard prior pages + _page_end_index = cursor.start_offset + cursor.page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard 
prior pages cursor.result_count = ( -1 # "many more" if (_bucket_count > _page_end_index) # agg includes one more, if there @@ -539,9 +544,7 @@ def _valuesearch_response( self._valuesearch_iri_result(_iri_bucket) for _iri_bucket in _bucket_page ], - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - first_page_cursor=cursor.first_cursor(), + cursor=cursor, ) else: # assume date _year_buckets = ( @@ -678,7 +681,7 @@ def _cardsearch_response( self, cardsearch_params: CardsearchParams, es8_response: dict, - cursor: '_CardsearchCursor', + cursor: CardsearchCursor, ) -> CardsearchResponse: _es8_total = es8_response['hits']['total'] if _es8_total['relation'] != 'eq': @@ -738,9 +741,7 @@ def _cardsearch_response( ), search_result_page=_results, related_propertypath_results=_relatedproperty_list, - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - first_page_cursor=cursor.first_cursor(), + cursor=cursor, ) def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: @@ -912,8 +913,6 @@ def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} - - class _PredicatePathWalker: WalkYield = tuple[tuple[str, ...], primitive_rdf.RdfObject] _visiting: set[str | frozenset] diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2c1e3cf8c..75ded2ad3 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -6,7 +6,6 @@ import logging import re from typing import ( - ClassVar, Iterable, Iterator, Literal, @@ -19,12 +18,15 @@ from share.search import exceptions from share.search import messages from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass from share.util.checksum_iri import ChecksumIri from trove import models as trove_db +from trove.trovesearch.page_cursor import ( + PageCursor, + OffsetCursor, + ReproduciblyRandomSampleCursor, +) from trove.trovesearch.search_params import ( CardsearchParams, - PageParam, Propertypath, SearchFilter, Textsegment, @@ -395,9 +397,9 @@ def _valuesearch_response( if _iri_aggs: _buckets = _iri_aggs['buckets'] _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_index + cursor.page_size - _bucket_page = _buckets[cursor.start_index:_page_end_index] # discard prior pages + # WARNING: terribly hacky pagination (part two) + _page_end_index = cursor.start_offset + cursor.page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages cursor.result_count = ( -1 # "many more" if (_bucket_count > _page_end_index) # agg includes one more, if there @@ -408,9 +410,7 @@ def _valuesearch_response( self._valuesearch_iri_result(_iri_bucket) for _iri_bucket in _bucket_page ], - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - first_page_cursor=cursor.first_cursor(), + cursor=cursor, ) else: # assume date _year_buckets = ( @@ -447,7 +447,7 @@ def _cardsearch_response( self, cardsearch_params: CardsearchParams, es8_response: dict, - cursor: _CardsearchCursor, + cursor: ReproduciblyRandomSampleCursor, ) -> CardsearchResponse: _es8_total = es8_response['hits']['total'] if _es8_total['relation'] != 'eq': @@ -507,9 +507,7 @@ def 
_cardsearch_response( ), search_result_page=_results, related_propertypath_results=_relatedproperty_list, - next_page_cursor=cursor.next_cursor(), - prev_page_cursor=cursor.prev_cursor(), - first_page_cursor=cursor.first_cursor(), + cursor=cursor, ) def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: @@ -688,7 +686,21 @@ def build(self): @functools.cached_property def cursor(self): - return _CardsearchCursor.from_cardsearch_params(self.params) + if self.params.page.cursor: + return PageCursor.from_queryparam_value(self.params.page.cursor) + assert self.params.page.size is not None + _has_sort = bool(self.params.sort_list or self.params.cardsearch_textsegment_set) + if _has_sort: + return OffsetCursor(page_size=self.params.page.size) + # how to sort by relevance to nothingness? randomness! + return ReproduciblyRandomSampleCursor(sample_size=self.params.page.size) + + def _cardsearch_start_offset(self): + return ( + self.cursor.start_offset + if self.cursor.is_first_page() + else self.cursor.start_offset - len(self.cursor.first_page_ids) + ) def _cardsearch_query(self) -> dict: _bool = _BoolBuilder() @@ -795,8 +807,8 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d 'agg_valuesearch_iris': { 'terms': { 'field': 'iri_value.value_iri', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_index + cursor.page_size + 1, + # WARNING: terribly hacky pagination (part one) + 'size': cursor.start_offset + cursor.page_size + 1, }, 'aggs': { 'agg_type_iri': {'terms': { @@ -883,5 +895,3 @@ def _any_query(queries: abc.Collection[dict]): (_query,) = queries return _query return {'bool': {'should': list(queries), 'minimum_should_match': 1}} - - diff --git a/trove/exceptions.py b/trove/exceptions.py index b25e3eb20..7935c0511 100644 --- a/trove/exceptions.py +++ b/trove/exceptions.py @@ -58,6 +58,10 @@ class InvalidSearchText(InvalidQueryParamValue): pass +class InvalidPageCursorValue(InvalidQueryParamValue): + pass + + class MissingRequiredQueryParam(RequestParsingError): pass diff --git a/trove/trovesearch/_cursor.py b/trove/trovesearch/_cursor.py deleted file mode 100644 index 8deedbe94..000000000 --- a/trove/trovesearch/_cursor.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations -import base64 -import dataclasses -import json -import typing - -from ._trovesearch_util import ( - VALUESEARCH_MAX, - CARDSEARCH_MAX, -) - -if typing.TYPE_CHECKING: - from trove.trovesearch.search_params import ( - CardsearchParams, - PageParam, - ) - -__all__ = ('BasicCursor', 'OffsetCursor', 'CardsearchCursor') - -_MANY_MORE = -1 # special count value - -_SomeDataclass = typing.TypeVar('_SomeDataclass') - - -@dataclasses.dataclass -class BasicCursor: - sample_size: int - - def as_queryparam_value(self) -> str: - _as_json = json.dumps(dataclasses.astuple(self)) - _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) - return _cursor_bytes.decode() - - @classmethod - def from_queryparam_value(cls, cursor_value: str): - _as_list = json.loads(base64.urlsafe_b64decode(cursor_value)) - return cls(*_as_list) - - @classmethod - def from_page_param(cls, page: PageParam) -> BasicCursor: - if page.cursor: - return cls.from_queryparam_value(page.cursor) - assert page.size is not None - return cls(sample_size=page.size) - - def next_cursor(self) -> typing.Self | None: - return None - - def prev_cursor(self) -> typing.Self | None: - return None - - def first_cursor(self) -> typing.Self | None: - return None - - 
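
the opaque page[cursor] queryparam value that BasicCursor (above) produces -- and that the page_cursor.py replacement below keeps producing the same way -- is just the dataclass field values as a JSON array, urlsafe-base64-encoded; a minimal standalone sketch of that round trip (_DemoCursor, _encode, and _decode are illustrative names, not from this changeset):

    import base64
    import dataclasses
    import json

    @dataclasses.dataclass
    class _DemoCursor:
        page_size: int

    def _encode(cursor: _DemoCursor) -> str:
        # JSON-encode the field values, then urlsafe-base64 the JSON
        _as_json = json.dumps(dataclasses.astuple(cursor))
        return base64.urlsafe_b64encode(_as_json.encode()).decode()

    def _decode(cursor_value: str) -> _DemoCursor:
        # reverse both steps and rebuild the dataclass from its field tuple
        return _DemoCursor(*json.loads(base64.urlsafe_b64decode(cursor_value)))

    assert _encode(_DemoCursor(page_size=13)) == 'WzEzXQ=='
    assert _decode('WzEzXQ==') == _DemoCursor(page_size=13)
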
-@dataclasses.dataclass -class OffsetCursor(BasicCursor): - start_index: int = 0 - result_count: int = _MANY_MORE - - MAX_INDEX: typing.ClassVar[int] = VALUESEARCH_MAX - - def next_cursor(self) -> typing.Self | None: - _next = dataclasses.replace(self, start_index=(self.start_index + self.sample_size)) - return (_next if _next.is_valid_cursor() else None) - - def prev_cursor(self) -> typing.Self | None: - _prev = dataclasses.replace(self, start_index=(self.start_index - self.sample_size)) - return (_prev if _prev.is_valid_cursor() else None) - - def first_cursor(self) -> typing.Self | None: - if self.is_first_page(): - return None - return dataclasses.replace(self, start_index=0) - - def is_first_page(self) -> bool: - return self.start_index == 0 - - def has_many_more(self) -> bool: - return self.result_count == -1 - - def max_index(self) -> int: - return ( - self.MAX_INDEX - if self.has_many_more() - else min(self.result_count or 0, self.MAX_INDEX) - ) - - def is_valid_cursor(self) -> bool: - return 0 <= self.start_index < self.max_index() - - -@dataclasses.dataclass -class CardsearchCursor(OffsetCursor): - random_sort: bool = True # how to sort by relevance to nothingness? randomness! - first_page_ids: tuple[str, ...] = () - - MAX_INDEX: typing.ClassVar[int] = CARDSEARCH_MAX - - @classmethod - def from_cardsearch_params(cls, params: CardsearchParams) -> CardsearchCursor: - if params.page.cursor: - return cls.from_queryparam_value(params.page.cursor) - assert params.page.size is not None - return cls( - sample_size=params.page.size, - random_sort=( - not params.sort_list - and not params.cardsearch_textsegment_set - ), - start_index=0, - ) - - def cardsearch_start_index(self) -> int: - if self.is_first_page() or not self.random_sort: - return self.start_index - return self.start_index - len(self.first_page_ids) - - def first_cursor(self) -> typing.Self | None: - if self.random_sort and not self.first_page_ids: - return None - return super().first_cursor() - - def prev_cursor(self) -> typing.Self | None: - if self.random_sort and not self.first_page_ids: - return None - return super().prev_cursor() - diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py new file mode 100644 index 000000000..0975f0e52 --- /dev/null +++ b/trove/trovesearch/page_cursor.py @@ -0,0 +1,123 @@ +from __future__ import annotations +import base64 +import dataclasses +import enum +import json +import math +import typing + +from trove.exceptions import InvalidPageCursorValue +from ._trovesearch_util import ( + VALUESEARCH_MAX, + CARDSEARCH_MAX, +) + +if typing.TYPE_CHECKING: + from number import Number + + +__all__ = ('PageCursor', 'OffsetCursor', 'ReproduciblyRandomSampleCursor') + + +_MANY_MORE = math.inf + + +@dataclasses.dataclass +class PageCursor: + page_size: Number + total_count: Number = _MANY_MORE + + @staticmethod + def from_queryparam_value(cursor_value: str) -> PageCursor: + try: + (_type_key, _args) = json.loads(base64.urlsafe_b64decode(cursor_value)) + _cls = _PageCursorTypes[_type_key].value + assert issubclass(_cls, PageCursor) + return _cls(*_args) + except Exception: + raise InvalidPageCursorValue(cursor_value) + + def as_queryparam_value(self) -> str: + _as_json = json.dumps(dataclasses.astuple(self)) + _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) + return _cursor_bytes.decode() + + def is_valid(self) -> bool: + return self.page_size > 0 and (0 <= self.total_count <= _MANY_MORE) + + def has_many_more(self) -> bool: + return self.total_count >= _MANY_MORE + 
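
the OffsetCursor subclass further below pages by sliding start_offset forward one page_size at a time; sketched standalone (a hypothetical _DemoOffsetCursor, not the class defined in this file):

    import dataclasses

    @dataclasses.dataclass
    class _DemoOffsetCursor:
        page_size: int
        start_offset: int = 0

        def next_cursor(self) -> '_DemoOffsetCursor':
            # each page begins page_size past the previous one
            return dataclasses.replace(self, start_offset=self.start_offset + self.page_size)

    _cursor = _DemoOffsetCursor(page_size=13)
    _offsets = [_cursor.start_offset]
    for _ in range(2):
        _cursor = _cursor.next_cursor()
        _offsets.append(_cursor.start_offset)
    assert _offsets == [0, 13, 26]  # usable as successive 'from_' offsets in es8 search kwargs
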
+ def next_cursor(self) -> typing.Self | None: + return None + + def prev_cursor(self) -> typing.Self | None: + return None + + def first_cursor(self) -> typing.Self | None: + return None + + +@dataclasses.dataclass +class OffsetCursor(PageCursor): + # page_size: Number (from PageCursor) + # total_count: Number (from PageCursor) + start_offset: Number = 0 + + MAX_INDEX: typing.ClassVar[Number] = VALUESEARCH_MAX + + def is_valid(self) -> bool: + return ( + super().is_valid() + and 0 <= self.start_offset < self.max_index() + ) + + def is_first_page(self) -> bool: + return self.start_offset == 0 + + def max_index(self) -> Number: + return ( + self.MAX_INDEX + if self.has_many_more() + else min(self.total_count or 0, self.MAX_INDEX) + ) + + def next_cursor(self): + return dataclasses.replace(self, start_offset=(self.start_offset + self.page_size)) + + def prev_cursor(self): + return dataclasses.replace(self, start_offset=(self.start_offset - self.page_size)) + + def first_cursor(self): + return dataclasses.replace(self, start_offset=0) + + +@dataclasses.dataclass +class ReproduciblyRandomSampleCursor(OffsetCursor): + # page_size: Number (from PageCursor) + # total_count: Number (from PageCursor) + # start_offset: Number (from OffsetCursor) + first_page_ids: typing.Iterable[str] = () + + MAX_INDEX: typing.ClassVar[Number] = CARDSEARCH_MAX + + def next_cursor(self): + return ( + super().next_cursor() + if self.first_page_ids + else None + ) + + def prev_cursor(self): + return ( + super().prev_cursor() + if self.first_page_ids + else None + ) + + +class _PageCursorTypes(enum.Enum): + '''registry of cursor types into which cursor values can be deserialized''' + PC = PageCursor + OC = OffsetCursor + RRSC = ReproduciblyRandomSampleCursor diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py index 8382822c4..3d1f162c1 100644 --- a/trove/trovesearch/search_response.py +++ b/trove/trovesearch/search_response.py @@ -43,7 +43,7 @@ def __post_init__(self): class CardsearchResponse: total_result_count: BoundedCount search_result_page: Iterable[CardsearchResult] - cursor: BasicCursor | None + cursor: Cursor | None related_propertypath_results: Iterable['PropertypathUsage'] @@ -74,6 +74,28 @@ def __post_init__(self): class ValuesearchResponse: search_result_page: Iterable[ValuesearchResult] total_result_count: Optional[int] = None - next_page_cursor: Optional[str] = None - prev_page_cursor: Optional[str] = None - first_page_cursor: Optional[str] = None + cursor: Cursor | None = None + + @functools.cached_property + def next_page_cursor(self) -> str: + if self.cursor is not None: + _next = self.cursor.next_cursor() + if _next.is_valid(): + return _next + return '' + + @functools.cached_property + def prev_page_cursor(self) -> str: + if self.cursor is not None: + _prev = self.cursor.prev_cursor() + if _prev.is_valid(): + return _prev + return '' + + @functools.cached_property + def first_page_cursor(self) -> str: + if self.cursor is not None: + _first = self.cursor.first_cursor() + if _first.is_valid(): + return _first + return '' From 53787876aec61a83b1b8563c8c33cb476e983251 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 11 Nov 2024 09:43:51 -0500 Subject: [PATCH 12/14] consistents cursors and walks --- .../index_strategy/_trovesearch_util.py | 5 +- .../index_strategy/trove_indexcard_flats.py | 153 +++++--------- .../index_strategy/trovesearch_denorm.py | 190 +++++++++--------- tests/trove/trovesearch/test_page_cursor.py | 25 +++ .../{ => 
trovesearch}/test_search_params.py | 0 trove/trovesearch/page_cursor.py | 66 +++--- trove/trovesearch/search_params.py | 48 ++--- trove/trovesearch/search_response.py | 135 ++++++++----- trove/trovesearch/trovesearch_gathering.py | 27 ++- 9 files changed, 323 insertions(+), 326 deletions(-) create mode 100644 tests/trove/trovesearch/test_page_cursor.py rename tests/trove/{ => trovesearch}/test_search_params.py (100%) diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py index 6bfb55004..53daa11af 100644 --- a/share/search/index_strategy/_trovesearch_util.py +++ b/share/search/index_strategy/_trovesearch_util.py @@ -48,9 +48,6 @@ LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel) NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES) -VALUESEARCH_MAX = 234 -CARDSEARCH_MAX = 9997 - KEYWORD_LENGTH_MAX = 8191 # skip keyword terms that might exceed lucene's internal limit # (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html) KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX} @@ -168,7 +165,7 @@ def __post_init__(self): if XSD.integer in _walk_obj.datatype_iris: self.integer_values[_walk_path].add(_walk_obj) if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris): - self.text_values[_walk_path].add(_walk_obj.unicode_value) + self.text_values[_walk_path].add(_walk_obj) # try for date in a date property, regardless of the above if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)): _date_str = ( diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 574e03480..b9bfbd33b 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -1,13 +1,12 @@ import base64 from collections import defaultdict -import contextlib import dataclasses import datetime import json import logging import re import uuid -from typing import Iterable, Optional, Iterator +from typing import Iterable, Iterator, Any from django.conf import settings import elasticsearch8 @@ -18,9 +17,11 @@ from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db -from trove.trovesearch.cursor import ( +from trove.trovesearch.page_cursor import ( + MANY_MORE, OffsetCursor, - CardsearchCursor, + PageCursor, + ReproduciblyRandomSampleCursor, ) from trove.trovesearch.search_params import ( CardsearchParams, @@ -40,15 +41,15 @@ ) from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword from trove.vocab.osfmap import is_date_property -from trove.vocab.namespaces import TROVE, RDF, OWL +from trove.vocab.namespaces import RDF, OWL from ._trovesearch_util import ( latest_rdf_for_indexcard_pks, + GraphWalk, TITLE_PROPERTIES, NAME_PROPERTIES, LABEL_PROPERTIES, NAMELIKE_PROPERTIES, KEYWORD_LENGTH_MAX, - SKIPPABLE_PROPERTIES, ) @@ -165,22 +166,16 @@ def _build_sourcedoc(self, indexcard_rdf): _nested_iris = defaultdict(set) _nested_dates = defaultdict(set) _nested_texts = defaultdict(set) - _pathset = set() - for _walk_path, _walk_obj in _PredicatePathWalker(_rdfdoc.tripledict).walk_from_subject(indexcard_rdf.focus_iri): - _pathset.add(_walk_path) - if isinstance(_walk_obj, str): - _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _walk_obj, _rdfdoc)].add(_walk_obj) - elif 
isinstance(_walk_obj, datetime.date): - _nested_dates[_walk_path].add(datetime.date.isoformat(_walk_obj)) - elif is_date_property(_walk_path[-1]): - try: - datetime.date.fromisoformat(_walk_obj.unicode_value) - except ValueError: - logger.debug('skipping malformatted date "%s" in %s', _walk_obj.unicode_value, indexcard_rdf) - else: - _nested_dates[_walk_path].add(_walk_obj.unicode_value) - elif isinstance(_walk_obj, primitive_rdf.Literal): - _nested_texts[(_walk_path, tuple(_walk_obj.datatype_iris))].add(_walk_obj.unicode_value) + _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri) + for _walk_path, _walk_iris in _walk.iri_values.items(): + for _iri_obj in _walk_iris: + _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj) + for _walk_path, _walk_dates in _walk.date_values.items(): + for _date_obj in _walk_dates: + _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj)) + for _walk_path, _walk_texts in _walk.text_values.items(): + for _text_obj in _walk_texts: + _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value) _focus_iris = {indexcard_rdf.focus_iri} _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)} for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): @@ -196,11 +191,11 @@ def _build_sourcedoc(self, indexcard_rdf): 'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris), 'iri_paths_present': [ iri_path_as_keyword(_path) - for _path in _pathset + for _path in _walk.paths_walked ], 'iri_paths_present_suffuniq': [ iri_path_as_keyword(_path, suffuniq=True) - for _path in _pathset + for _path in _walk.paths_walked ], 'nested_iri': list(filter(bool, ( self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc) @@ -294,7 +289,7 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query ) def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse: - _cursor = CardsearchCursor.from_params(cardsearch_params) + _cursor = self._cardsearch_cursor(cardsearch_params) _sort = self._cardsearch_sort(cardsearch_params.sort_list) _query = self._cardsearch_query( cardsearch_params.cardsearch_filter_set, @@ -303,7 +298,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear ) _from_offset = ( _cursor.start_offset - if _cursor.is_first_page() + if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) else _cursor.start_offset - len(_cursor.first_page_ids) ) _search_kwargs = dict( @@ -326,7 +321,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear return self._cardsearch_response(cardsearch_params, _es8_response, _cursor) def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: - _cursor = OffsetCursor.from_page_param(valuesearch_params.page) + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) _is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1]) _search_kwargs = dict( query=self._cardsearch_query( @@ -357,11 +352,21 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value ### # query implementation + def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: + _request_cursor = cardsearch_params.page_cursor + if ( + _request_cursor.is_basic() + and not cardsearch_params.sort_list + and not cardsearch_params.cardsearch_textsegment_set + ): + return 
ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return OffsetCursor.from_cursor(_request_cursor) + def _cardsearch_query( self, filter_set, textsegment_set, *, additional_filters=None, - cardsearch_cursor: Optional[CardsearchCursor] = None, + cardsearch_cursor: PageCursor | None = None, ) -> dict: _bool_query = { 'filter': additional_filters or [], @@ -383,12 +388,12 @@ def _cardsearch_query( else: raise ValueError(f'unknown filter operator {_searchfilter.operator}') _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=bool(cardsearch_cursor and not cardsearch_cursor.random_sort), + relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), ) for _textsegment in textsegment_set: for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): _bool_query[_boolkey].extend(_textqueries) - if not cardsearch_cursor or not cardsearch_cursor.random_sort: + if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): # no need for randomness return {'bool': _bool_query} if not cardsearch_cursor.first_page_ids: @@ -432,7 +437,7 @@ def _cardsearch_aggs(self, cardsearch_params): return _aggs def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool = { + _nested_iri_bool: dict[str, Any] = { 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( valuesearch_params.valuesearch_propertypath, suffuniq=True, @@ -534,17 +539,17 @@ def _valuesearch_response( # WARNING: terribly inefficient pagination (part two) _page_end_index = cursor.start_offset + cursor.page_size _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.result_count = ( - -1 # "many more" + cursor.total_count = ( + MANY_MORE if (_bucket_count > _page_end_index) # agg includes one more, if there else _bucket_count ) return ValuesearchResponse( + cursor=cursor, search_result_page=[ self._valuesearch_iri_result(_iri_bucket) for _iri_bucket in _bucket_page ], - cursor=cursor, ) else: # assume date _year_buckets = ( @@ -552,6 +557,7 @@ def _valuesearch_response( ['value_at_propertypath']['count_by_year']['buckets'] ) return ValuesearchResponse( + cursor=PageCursor(len(_year_buckets)), search_result_page=[ self._valuesearch_date_result(_year_bucket) for _year_bucket in _year_buckets @@ -681,16 +687,16 @@ def _cardsearch_response( self, cardsearch_params: CardsearchParams, es8_response: dict, - cursor: CardsearchCursor, + cursor: OffsetCursor, ) -> CardsearchResponse: _es8_total = es8_response['hits']['total'] if _es8_total['relation'] != 'eq': - cursor.result_count = -1 # "too many" + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) else: # exact (and small) count - cursor.result_count = _es8_total['value'] - if cursor.random_sort and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.result_count += len(cursor.first_page_ids) + cursor.total_count = _es8_total['value'] _results = [] for _es8_hit in es8_response['hits']['hits']: _card_iri = _es8_hit['_id'] @@ -698,29 +704,7 @@ def _cardsearch_response( card_iri=_card_iri, text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), )) - if cursor.is_first_page() and cursor.first_page_ids: - # revisiting first page; reproduce original random order - _uuid_index = { - 
_uuid: _i - for (_i, _uuid) in enumerate(cursor.first_page_ids) - } - _results.sort(key=lambda _r: _uuid_index[_r.card_uuid]) - else: - _should_start_reproducible_randomness = ( - cursor.random_sort - and cursor.is_first_page() - and not cursor.first_page_ids - and any( - not _filter.is_type_filter() # look for a non-default filter - for _filter in cardsearch_params.cardsearch_filter_set - ) - ) - if _should_start_reproducible_randomness: - cursor.first_page_ids = tuple( - _result.card_uuid - for _result in _results - ) - _relatedproperty_list = [] + _relatedproperty_list: list[PropertypathUsage] = [] if cardsearch_params.related_property_paths: _relatedproperty_list.extend( PropertypathUsage(property_path=_path, usage_count=0) @@ -734,14 +718,10 @@ def _cardsearch_response( _path = tuple(json.loads(_bucket['key'])) _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] return CardsearchResponse( - total_result_count=( - TROVE['ten-thousands-and-more'] - if cursor.has_many_more() - else cursor.result_count - ), + cursor=cursor, search_result_page=_results, related_propertypath_results=_relatedproperty_list, - cursor=cursor, + cardsearch_params=cardsearch_params, ) def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: @@ -913,41 +893,6 @@ def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} -class _PredicatePathWalker: - WalkYield = tuple[tuple[str, ...], primitive_rdf.RdfObject] - _visiting: set[str | frozenset] - - def __init__(self, tripledict: primitive_rdf.RdfTripleDictionary): - self.tripledict = tripledict - self._visiting = set() - - def walk_from_subject(self, iri_or_blanknode, last_path: tuple[str, ...] 
= ()) -> Iterable[WalkYield]: - '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object - ''' - with self._visit(iri_or_blanknode): - _twopledict = ( - primitive_rdf.twopledict_from_twopleset(iri_or_blanknode) - if isinstance(iri_or_blanknode, frozenset) - else self.tripledict.get(iri_or_blanknode, {}) - ) - for _predicate_iri, _obj_set in _twopledict.items(): - if _predicate_iri not in SKIPPABLE_PROPERTIES: - _path = (*last_path, _predicate_iri) - for _obj in _obj_set: - if not isinstance(_obj, frozenset): # omit the blanknode as a value - yield (_path, _obj) - if isinstance(_obj, (str, frozenset)) and (_obj not in self._visiting): - # step further for iri or blanknode - yield from self.walk_from_subject(_obj, last_path=_path) - - @contextlib.contextmanager - def _visit(self, focus_obj): - assert focus_obj not in self._visiting - self._visiting.add(focus_obj) - yield - self._visiting.discard(focus_obj) - - @dataclasses.dataclass(frozen=True) class _NestedIriKey: '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 75ded2ad3..bfc696ff5 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -20,9 +20,11 @@ from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db +from trove import exceptions as trove_exceptions from trove.trovesearch.page_cursor import ( - PageCursor, + MANY_MORE, OffsetCursor, + PageCursor, ReproduciblyRandomSampleCursor, ) from trove.trovesearch.search_params import ( @@ -43,7 +45,7 @@ ValuesearchResult, ) from trove.vocab.osfmap import is_date_property -from trove.vocab.namespaces import TROVE, OWL, RDF +from trove.vocab.namespaces import OWL, RDF from . 
import _trovesearch_util as ts @@ -214,15 +216,20 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear ) except elasticsearch8.TransportError as error: raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._cardsearch_response(cardsearch_params, _es8_response, _querybuilder.cursor) + return self.index_strategy._cardsearch_response( + cardsearch_params, + _es8_response, + _querybuilder.response_cursor, + ) # abstract method from IndexStrategy.SpecificIndex def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: _path = valuesearch_params.valuesearch_propertypath - _cursor = OffsetCursor.from_page_param(valuesearch_params.page) + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = is_date_property(_path[-1]) _query = ( - _build_date_valuesearch(valuesearch_params, _cursor) - if is_date_property(_path[-1]) + _build_date_valuesearch(valuesearch_params) + if _is_date_search else _build_iri_valuesearch(valuesearch_params, _cursor) ) if settings.DEBUG: @@ -234,7 +241,11 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value ) except elasticsearch8.TransportError as error: raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._valuesearch_response(valuesearch_params, _es8_response, _cursor) + return ( + self.index_strategy._valuesearch_dates_response(valuesearch_params, _es8_response) + if _is_date_search + else self.index_strategy._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + ) ### # building sourcedocs @@ -345,18 +356,19 @@ def _iris_by_depth(self, walk: ts.GraphWalk): def _texts_by_propertypath(self, walk: ts.GraphWalk): return { - _path_field_name(_path): list(_value_set) - for _path, _value_set in walk.text_values.items() + _path_field_name(_path): [_text.unicode_value for _text in _text_set] + for _path, _text_set in walk.text_values.items() } def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]): for _property in properties: - yield from walk.text_values.get((_property,), []) + for _text_literal in walk.text_values.get((_property,), []): + yield _text_literal.unicode_value def _texts_by_depth(self, walk: ts.GraphWalk): _by_depth: dict[int, set[str]] = defaultdict(set) - for _path, _value_set in walk.text_values.items(): - _by_depth[len(_path)].update(_value_set) + for _path, _text_set in walk.text_values.items(): + _by_depth[len(_path)].update(_text.unicode_value for _text in _text_set) return { _depth_field_name(_depth): list(_value_set) for _depth, _value_set in _by_depth.items() @@ -387,43 +399,48 @@ def _exact_and_suffuniq_iris(self, iri: str): ### # normalizing search responses - def _valuesearch_response( + def _valuesearch_iris_response( self, valuesearch_params: ValuesearchParams, es8_response: dict, cursor: OffsetCursor, ) -> ValuesearchResponse: _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') - if _iri_aggs: - _buckets = _iri_aggs['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly hacky pagination (part two) - _page_end_index = cursor.start_offset + cursor.page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.result_count = ( - -1 # "many more" - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchResponse( - search_result_page=[ - 
self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - cursor=cursor, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations'] - ['agg_valuesearch_dates'] - ['buckets'] - ) - return ValuesearchResponse( - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - ) + _buckets = _iri_aggs['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly hacky pagination (part two) + _page_end_index = cursor.start_offset + cursor.page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages + cursor.total_count = ( + MANY_MORE + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count + ) + return ValuesearchResponse( + cursor=cursor, + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + ) + + def _valuesearch_dates_response( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + ) -> ValuesearchResponse: + _year_buckets = ( + es8_response['aggregations'] + ['agg_valuesearch_dates'] + ['buckets'] + ) + return ValuesearchResponse( + cursor=PageCursor(len(_year_buckets)), + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + ) def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: return ValuesearchResult( @@ -447,17 +464,16 @@ def _cardsearch_response( self, cardsearch_params: CardsearchParams, es8_response: dict, - cursor: ReproduciblyRandomSampleCursor, + cursor: OffsetCursor, ) -> CardsearchResponse: _es8_total = es8_response['hits']['total'] if _es8_total['relation'] != 'eq': - cursor.result_count = -1 # "too many" + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) else: # exact (and small) count - cursor.result_count = _es8_total['value'] - if cursor.random_sort and not cursor.is_first_page(): - # account for the filtered-out first page - assert cursor.result_count is not None - cursor.result_count += len(cursor.first_page_ids) + cursor.total_count = _es8_total['value'] _results = [] for _es8_hit in es8_response['hits']['hits']: _card_iri = _es8_hit['fields']['card.card_iri'][0] @@ -466,26 +482,6 @@ def _cardsearch_response( card_pk=_es8_hit['_id'], text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), )) - if cursor.is_first_page() and cursor.first_page_ids: - # revisiting first page; reproduce original random order - _ordering_by_id = { - _id: _i - for (_i, _id) in enumerate(cursor.first_page_ids) - } - _results.sort(key=lambda _r: _ordering_by_id[_r.card_pk]) - else: - _should_start_reproducible_randomness = ( - cursor.random_sort - and cursor.is_first_page() - and not cursor.first_page_ids - and not cursor.has_many_more() - and any( - not _filter.is_type_filter() # look for a non-default filter - for _filter in cardsearch_params.cardsearch_filter_set - ) - ) - if _should_start_reproducible_randomness: - cursor.first_page_ids = tuple(_result.card_pk for _result in _results) _relatedproperty_list: list[PropertypathUsage] = [] if cardsearch_params.related_property_paths: _relatedproperty_list.extend( @@ -500,14 +496,10 @@ def _cardsearch_response( _path = tuple(json.loads(_bucket['key'])) _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] return 
CardsearchResponse( - total_result_count=( - TROVE['ten-thousands-and-more'] - if cursor.has_many_more() - else cursor.result_count - ), + cursor=cursor, search_result_page=_results, related_propertypath_results=_relatedproperty_list, - cursor=cursor, + cardsearch_params=cardsearch_params, ) def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: @@ -680,27 +672,28 @@ def build(self): 'query': self._cardsearch_query(), 'aggs': self._cardsearch_aggs(), 'sort': list(self._cardsearch_sorts()) or None, - 'from_': self.cursor.cardsearch_start_index(), - 'size': self.cursor.page_size, + 'from_': self._cardsearch_start_offset(), + 'size': self.response_cursor.page_size, } @functools.cached_property - def cursor(self): - if self.params.page.cursor: - return PageCursor.from_queryparam_value(self.params.page.cursor) - assert self.params.page.size is not None - _has_sort = bool(self.params.sort_list or self.params.cardsearch_textsegment_set) - if _has_sort: - return OffsetCursor(page_size=self.params.page.size) - # how to sort by relevance to nothingness? randomness! - return ReproduciblyRandomSampleCursor(sample_size=self.params.page.size) + def response_cursor(self) -> OffsetCursor: + _request_cursor = self.params.page_cursor + if ( + _request_cursor.is_basic() + and not self.params.sort_list + and not self.params.cardsearch_textsegment_set + ): + return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return OffsetCursor.from_cursor(_request_cursor) def _cardsearch_start_offset(self): - return ( - self.cursor.start_offset - if self.cursor.is_first_page() - else self.cursor.start_offset - len(self.cursor.first_page_ids) - ) + if ( + self.response_cursor.is_first_page() + or not isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) + ): + return self.response_cursor.start_offset + return self.response_cursor.start_offset - len(self.response_cursor.first_page_ids) def _cardsearch_query(self) -> dict: _bool = _BoolBuilder() @@ -716,12 +709,13 @@ def _cardsearch_query(self) -> dict: _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}}) return ( self._randomly_ordered_query(_bool) - if self.cursor.random_sort + if isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) else _bool.as_query() ) def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: - if not self.cursor.first_page_ids: + assert isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) + if not self.response_cursor.first_page_ids: # independent random sample return { 'function_score': { @@ -730,8 +724,8 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'random_score': {}, # default random_score is fast and unpredictable }, } - _firstpage_filter = {'terms': {'card.card_pk': self.cursor.first_page_ids}} - if self.cursor.is_first_page(): + _firstpage_filter = {'terms': {'card.card_pk': self.response_cursor.first_page_ids}} + if self.response_cursor.is_first_page(): # returning to a first page previously visited _bool.add_boolpart('filter', _firstpage_filter) return _bool.as_query() @@ -742,7 +736,7 @@ def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: 'query': _bool.as_query(), 'boost_mode': 'replace', 'random_score': { - 'seed': ''.join(self.cursor.first_page_ids), + 'seed': ''.join(self.response_cursor.first_page_ids), 'field': 'card.card_pk', }, }, @@ -823,7 +817,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d } -def _build_date_valuesearch(params: ValuesearchParams, cursor: 
OffsetCursor) -> dict: +def _build_date_valuesearch(params: ValuesearchParams) -> dict: assert not params.valuesearch_textsegment_set assert not params.valuesearch_filter_set _bool = _BoolBuilder() diff --git a/tests/trove/trovesearch/test_page_cursor.py b/tests/trove/trovesearch/test_page_cursor.py new file mode 100644 index 000000000..5b9027c9a --- /dev/null +++ b/tests/trove/trovesearch/test_page_cursor.py @@ -0,0 +1,25 @@ +from unittest import TestCase + + +from trove.trovesearch.page_cursor import ( + PageCursor, + OffsetCursor, + ReproduciblyRandomSampleCursor, +) + + +class TestPageCursor(TestCase): + def test_queryparam_round_trip(self): + for _original_cursor in ( + PageCursor(page_size=7), + OffsetCursor(page_size=11), + OffsetCursor(page_size=11, start_offset=22), + ReproduciblyRandomSampleCursor(page_size=13), + ReproduciblyRandomSampleCursor(page_size=3, first_page_ids=['a', 'b', 'c']), + ): + _qp_value = _original_cursor.as_queryparam_value() + self.assertIsInstance(_qp_value, str) + self.assertNotEqual(_qp_value, '') + _cursor_from_qp = PageCursor.from_queryparam_value(_qp_value) + self.assertIsInstance(_cursor_from_qp, type(_original_cursor)) + self.assertEqual(_cursor_from_qp, _original_cursor) diff --git a/tests/trove/test_search_params.py b/tests/trove/trovesearch/test_search_params.py similarity index 100% rename from tests/trove/test_search_params.py rename to tests/trove/trovesearch/test_search_params.py diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py index 0975f0e52..61c7c03b4 100644 --- a/trove/trovesearch/page_cursor.py +++ b/trove/trovesearch/page_cursor.py @@ -3,50 +3,55 @@ import dataclasses import enum import json -import math import typing from trove.exceptions import InvalidPageCursorValue -from ._trovesearch_util import ( - VALUESEARCH_MAX, - CARDSEARCH_MAX, -) - -if typing.TYPE_CHECKING: - from number import Number __all__ = ('PageCursor', 'OffsetCursor', 'ReproduciblyRandomSampleCursor') -_MANY_MORE = math.inf +MANY_MORE = -1 @dataclasses.dataclass class PageCursor: - page_size: Number - total_count: Number = _MANY_MORE + page_size: int + total_count: int = MANY_MORE - @staticmethod - def from_queryparam_value(cursor_value: str) -> PageCursor: + @classmethod + def from_queryparam_value(cls, cursor_value: str) -> typing.Self: try: (_type_key, _args) = json.loads(base64.urlsafe_b64decode(cursor_value)) _cls = _PageCursorTypes[_type_key].value - assert issubclass(_cls, PageCursor) + assert issubclass(_cls, cls) return _cls(*_args) except Exception: raise InvalidPageCursorValue(cursor_value) + @classmethod + def from_cursor(cls, other_cursor: PageCursor) -> typing.Self: + if isinstance(other_cursor, cls): + return dataclasses.replace(other_cursor) # simple copy + return cls(*dataclasses.astuple(other_cursor)) + def as_queryparam_value(self) -> str: - _as_json = json.dumps(dataclasses.astuple(self)) + _cls_key = _PageCursorTypes(type(self)).name + _as_json = json.dumps([_cls_key, *dataclasses.astuple(self)]) _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) return _cursor_bytes.decode() + def is_basic(self) -> bool: + return type(self) is PageCursor + def is_valid(self) -> bool: - return self.page_size > 0 and (0 <= self.total_count <= _MANY_MORE) + return self.page_size > 0 and ( + self.total_count == MANY_MORE + or self.total_count >= 0 + ) def has_many_more(self) -> bool: - return self.total_count >= _MANY_MORE + return self.total_count == MANY_MORE def next_cursor(self) -> typing.Self | None: return None @@ -60,28 
+65,19 @@ def first_cursor(self) -> typing.Self | None: @dataclasses.dataclass class OffsetCursor(PageCursor): - # page_size: Number (from PageCursor) - # total_count: Number (from PageCursor) - start_offset: Number = 0 - - MAX_INDEX: typing.ClassVar[Number] = VALUESEARCH_MAX + # page_size: int (from PageCursor) + # total_count: int (from PageCursor) + start_offset: int = 0 def is_valid(self) -> bool: return ( super().is_valid() - and 0 <= self.start_offset < self.max_index() + and 0 <= self.start_offset ) def is_first_page(self) -> bool: return self.start_offset == 0 - def max_index(self) -> Number: - return ( - self.MAX_INDEX - if self.has_many_more() - else min(self.total_count or 0, self.MAX_INDEX) - ) - def next_cursor(self): return dataclasses.replace(self, start_offset=(self.start_offset + self.page_size)) @@ -94,12 +90,10 @@ def first_cursor(self): @dataclasses.dataclass class ReproduciblyRandomSampleCursor(OffsetCursor): - # page_size: Number (from PageCursor) - # total_count: Number (from PageCursor) - # start_offset: Number (from OffsetCursor) - first_page_ids: typing.Iterable[str] = () - - MAX_INDEX: typing.ClassVar[Number] = CARDSEARCH_MAX + # page_size: int (from PageCursor) + # total_count: int (from PageCursor) + # start_offset: int (from OffsetCursor) + first_page_ids: list[str] = dataclasses.field(default_factory=list) def next_cursor(self): return ( diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 29f49d0d8..8c7234222 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -11,6 +11,7 @@ from primitive_metadata import primitive_rdf from trove import exceptions as trove_exceptions +from trove.trovesearch.page_cursor import PageCursor from trove.util.queryparams import ( QueryparamDict, QueryparamName, @@ -50,6 +51,10 @@ DEFAULT_PAGE_SIZE = 13 MAX_PAGE_SIZE = 101 +# limits on paging +VALUESEARCH_MAX = 234 +CARDSEARCH_MAX = 9997 + # between each step in a property path "foo.bar.baz" PROPERTYPATH_DELIMITER = '.' @@ -462,30 +467,13 @@ def as_queryparam(self) -> tuple[str, str]: return (_name, _value) -@dataclasses.dataclass(frozen=True) -class PageParam: - cursor: str | None # intentionally opaque; for IndexStrategy to generate/interpret - size: int | None = None # size is None iff cursor is not None - - @classmethod - def from_page_queryparams(cls, queryparams: QueryparamDict) -> PageParam: - _cursor = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) - if _cursor: - return cls(cursor=_cursor) - _size = int( # TODO: 400 response on non-int value - _get_single_value(queryparams, QueryparamName('page', ('size',))) - or DEFAULT_PAGE_SIZE - ) - return cls(size=min(_size, MAX_PAGE_SIZE), cursor=None) - - @dataclasses.dataclass(frozen=True) class CardsearchParams(BaseTroveParams): cardsearch_textsegment_set: frozenset[Textsegment] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None sort_list: tuple[SortParam] - page: PageParam + page_cursor: PageCursor related_property_paths: tuple[Propertypath, ...] 
unnamed_iri_values: frozenset[str]
@@ -498,7 +486,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict:
             'cardsearch_filter_set': _filter_set,
             'index_strategy_name': _get_single_value(queryparams, QueryparamName('indexStrategy')),
             'sort_list': SortParam.from_sort_queryparams(queryparams),
-            'page': PageParam.from_page_queryparams(queryparams),
+            'page_cursor': _get_page_cursor(queryparams),
             'include': None,  # TODO
             'related_property_paths': _get_related_property_paths(_filter_set),
             'unnamed_iri_values': frozenset(),  # TODO: frozenset(_get_unnamed_iri_values(_filter_set)),
@@ -511,10 +499,10 @@ def to_querydict(self) -> QueryDict:
         for _sort in self.sort_list:
             _qp_name, _qp_value = _sort.as_queryparam()
             _querydict.appendlist(_qp_name, _qp_value)
-        if self.page.cursor:
-            _querydict['page[cursor]'] = self.page.cursor
-        elif self.page.size != DEFAULT_PAGE_SIZE:
-            _querydict['page[size]'] = self.page.size
+        if not self.page_cursor.is_basic():
+            _querydict['page[cursor]'] = self.page_cursor.as_queryparam_value()
+        elif self.page_cursor.page_size != DEFAULT_PAGE_SIZE:
+            _querydict['page[size]'] = self.page_cursor.page_size
         for _filter in self.cardsearch_filter_set:
             _qp_name, _qp_value = _filter.as_queryparam('cardSearchFilter')
             _querydict.appendlist(_qp_name, _qp_value)
@@ -683,3 +671,17 @@ def _get_unnamed_iri_values(filter_set) -> typing.Iterable[str]:
         for _iri in _filter.value_set:
             if _iri not in OSFMAP_THESAURUS:
                 yield _iri
+
+
+def _get_page_cursor(queryparams: QueryparamDict) -> PageCursor:
+    _cursor_value = _get_single_value(queryparams, QueryparamName('page', ('cursor',)))
+    if _cursor_value:
+        return PageCursor.from_queryparam_value(_cursor_value)
+    try:
+        _size = int(  # TODO: 400 response on non-int value
+            _get_single_value(queryparams, QueryparamName('page', ('size',)))
+            or DEFAULT_PAGE_SIZE
+        )
+    except ValueError:
+        raise trove_exceptions.InvalidQueryParamValue('page[size]')
+    return PageCursor(page_size=min(_size, MAX_PAGE_SIZE))
diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py
index 3d1f162c1..d60a6d998 100644
--- a/trove/trovesearch/search_response.py
+++ b/trove/trovesearch/search_response.py
@@ -3,9 +3,17 @@
 
 from primitive_metadata import primitive_rdf
 
+from trove.trovesearch.page_cursor import (
+    PageCursor,
+    ReproduciblyRandomSampleCursor,
+)
+from trove.trovesearch.search_params import (
+    VALUESEARCH_MAX,
+    CARDSEARCH_MAX,
+    CardsearchParams,
+)
 from trove.vocab.namespaces import TROVE
 from trove.vocab.trove import trove_indexcard_namespace
-from trove.trovesearch._cursor import BasicCursor
 
 
 BoundedCount = Union[
@@ -25,26 +33,21 @@ class TextMatchEvidence:
 
 @dataclasses.dataclass
 class CardsearchResult:
-    text_match_evidence: Iterable[TextMatchEvidence]
+    text_match_evidence: list[TextMatchEvidence]
     card_iri: str
-    card_uuid: str = ''
-    card_pk: str = ''  # TODO: use or remove
-
-    def __post_init__(self):
-        if not self.card_uuid:
-            # card iri has the uuid at the end
-            self.card_uuid = primitive_rdf.iri_minus_namespace(
-                self.card_iri,
-                namespace=trove_indexcard_namespace(),
-            )
-
+    card_pk: str = ''
+
+    @property
+    def card_uuid(self):
+        # card iri has the uuid at the end
+        return primitive_rdf.iri_minus_namespace(
+            self.card_iri,
+            namespace=trove_indexcard_namespace(),
+        )
 
-@dataclasses.dataclass
-class CardsearchResponse:
-    total_result_count: BoundedCount
-    search_result_page: Iterable[CardsearchResult]
-    cursor: Cursor | None
-    related_propertypath_results: Iterable['PropertypathUsage']
+    @property
+    def card_id(self):
+        return 
self.card_pk or self.card_uuid @dataclasses.dataclass @@ -70,32 +73,72 @@ def __post_init__(self): ) +### +# paged responses + +@dataclasses.dataclass +class PagedResponse: + cursor: PageCursor + + @property + def max_offset(self) -> int: + raise NotImplementedError + + @property + def total_result_count(self) -> BoundedCount: + return ( + TROVE['ten-thousands-and-more'] + if (self.cursor is None) or self.cursor.has_many_more() + else self.cursor.total_count + ) + + @dataclasses.dataclass -class ValuesearchResponse: +class CardsearchResponse(PagedResponse): + search_result_page: list[CardsearchResult] + related_propertypath_results: list['PropertypathUsage'] + cardsearch_params: CardsearchParams + + max_offset = CARDSEARCH_MAX + + def __post_init__(self): + _cursor = self.cursor + if ( + isinstance(_cursor, ReproduciblyRandomSampleCursor) + and _cursor.is_first_page() + ): + if _cursor.first_page_ids: + # revisiting first page; reproduce original random order + _ordering_by_id = { + _id: _i + for (_i, _id) in enumerate(_cursor.first_page_ids) + } + self.search_result_page.sort(key=lambda _r: _ordering_by_id[_r.card_id]) + else: + _should_start_reproducible_randomness = ( + not _cursor.has_many_more() + and any( + not _filter.is_type_filter() # look for a non-default filter + for _filter in self.cardsearch_params.cardsearch_filter_set + ) + ) + if _should_start_reproducible_randomness: + _cursor.first_page_ids = [_result.card_id for _result in self.search_result_page] + + +@dataclasses.dataclass +class ValuesearchResponse(PagedResponse): search_result_page: Iterable[ValuesearchResult] - total_result_count: Optional[int] = None - cursor: Cursor | None = None - - @functools.cached_property - def next_page_cursor(self) -> str: - if self.cursor is not None: - _next = self.cursor.next_cursor() - if _next.is_valid(): - return _next - return '' - - @functools.cached_property - def prev_page_cursor(self) -> str: - if self.cursor is not None: - _prev = self.cursor.prev_cursor() - if _prev.is_valid(): - return _prev - return '' - - @functools.cached_property - def first_page_cursor(self) -> str: - if self.cursor is not None: - _first = self.cursor.first_cursor() - if _first.is_valid(): - return _first - return '' + + max_offset = VALUESEARCH_MAX + + +### +# local helpers + +def _cursor_value(cursor: PageCursor | None) -> str: + return ( + cursor.as_queryparam_value() + if cursor is not None and cursor.is_valid() + else '' + ) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index e027ef7a9..f91969b50 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -15,10 +15,10 @@ from trove import models as trove_db from trove import exceptions as trove_exceptions from trove.derive.osfmap_json import _RdfOsfmapJsonldRenderer +from trove.trovesearch.page_cursor import PageCursor from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, - PageParam, propertypath_key, propertypath_set_key, ) @@ -362,27 +362,24 @@ def _related_property_result(property_path: tuple[str, ...], count: int): def _search_page_links(search_focus, search_params, search_response): _search_iri_split = urllib.parse.urlsplit(next(iter(search_focus.iris))) - def _iri_with_page_param(page_param: PageParam): + def _iri_with_cursor(page_cursor: PageCursor): return urllib.parse.urlunsplit(( _search_iri_split.scheme, _search_iri_split.netloc, _search_iri_split.path, - dataclasses.replace(search_params, 
From 28ece73b15c6355511e74a771ed61b3f6c7871b8 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Mon, 11 Nov 2024 13:02:47 -0500
Subject: [PATCH 13/14] trovesearch pagination tests

---
 .../index_strategy/trovesearch_denorm.py      |  1 -
 .../_common_trovesearch_tests.py              | 43 ++++++++++++++++++-
 trove/trovesearch/page_cursor.py              | 21 ++++++---
 trove/trovesearch/search_params.py            |  8 +---
 trove/trovesearch/search_response.py          | 26 ++---------
 5 files changed, 61 insertions(+), 38 deletions(-)

diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py
index bfc696ff5..35320b5c1 100644
--- a/share/search/index_strategy/trovesearch_denorm.py
+++ b/share/search/index_strategy/trovesearch_denorm.py
@@ -20,7 +20,6 @@
 from share.search.index_strategy.elastic8 import Elastic8IndexStrategy
 from share.util.checksum_iri import ChecksumIri
 from trove import models as trove_db
-from trove import exceptions as trove_exceptions
 from trove.trovesearch.page_cursor import (
     MANY_MORE,
     OffsetCursor,
diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py
index d912b961c..f3eff4813 100644
--- a/tests/share/search/index_strategy/_common_trovesearch_tests.py
+++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py
@@ -1,6 +1,7 @@
 from typing import Iterable, Iterator
-from datetime import date
+from datetime import date, timedelta
 import itertools
+import math
 from urllib.parse import urlencode
 
 from primitive_metadata import primitive_rdf as rdf
@@ -71,6 +72,46 @@ def test_cardsearch(self):
             _actual_result_iris = set(_actual_result_iris)
             self.assertEqual(_expected_result_iris, _actual_result_iris, msg=f'?{_queryparams}')
 
+    def test_cardsearch_pagination(self):
+        _cards: list[trove_db.Indexcard] = []
+        _expected_iris = set()
+        _page_size = 7
+        _total_count = 55
+        _start_date = date(1999, 12, 31)
+        for _i in range(_total_count):
+            _card_iri = BLARG[f'i{_i}']
+            _expected_iris.add(_card_iri)
+            _cards.append(self._create_indexcard(_card_iri, {
+                _card_iri: {
+                    RDF.type: {BLARG.Thing},
+                    DCTERMS.title: {rdf.literal(f'card #{_i}')},
+                    DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))},
+                },
+            }))
+        self._index_indexcards(_cards)
+        # gather all pages' results:
+        _querystring: str = f'page[size]={_page_size}'
+        _result_iris: set[str] = set()
+        _page_count = 0
+        while True:
+            _cardsearch_response = self.current_index.pls_handle_cardsearch(
+                CardsearchParams.from_querystring(_querystring),
+            )
+            _page_iris = {
+                self._indexcard_focus_by_uuid[_result.card_uuid]
+                for _result in _cardsearch_response.search_result_page
+            }
+            self.assertFalse(_result_iris.intersection(_page_iris))
+            self.assertLessEqual(len(_page_iris), _page_size)
+            _result_iris.update(_page_iris)
+            _page_count += 1
+            _next_cursor = _cardsearch_response.cursor.next_cursor()
+            if _next_cursor is None:
+                break
+            _querystring = urlencode({'page[cursor]': _next_cursor.as_queryparam_value()})
+        self.assertEqual(_page_count, math.ceil(_total_count / _page_size))
+        self.assertEqual(_result_iris, _expected_iris)
+
     def test_valuesearch(self):
         self._fill_test_data_for_querying()
         _valuesearch_cases = itertools.chain(
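
(for context on the `(_type_key, *_args)` change below: a cursor queryparam
value is urlsafe-base64-encoded json -- a sketch of the decode side, with a
hypothetical helper name, and assuming as_queryparam_value is the symmetric
encode)

    import base64
    import json

    def _peek_cursor_value(cursor_value: str):
        # mirrors PageCursor.from_queryparam_value, minus the validation:
        # the first item names the cursor type; the rest are the
        # dataclass fields, in order
        (_type_key, *_args) = json.loads(base64.urlsafe_b64decode(cursor_value))
        return (_type_key, _args)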
diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py
index 61c7c03b4..0428b78d5 100644
--- a/trove/trovesearch/page_cursor.py
+++ b/trove/trovesearch/page_cursor.py
@@ -12,6 +12,7 @@
 
 
 MANY_MORE = -1
+MAX_OFFSET = 9997
 
 
 @dataclasses.dataclass
@@ -22,7 +23,7 @@ class PageCursor:
     @classmethod
     def from_queryparam_value(cls, cursor_value: str) -> typing.Self:
         try:
-            (_type_key, _args) = json.loads(base64.urlsafe_b64decode(cursor_value))
+            (_type_key, *_args) = json.loads(base64.urlsafe_b64decode(cursor_value))
             _cls = _PageCursorTypes[_type_key].value
             assert issubclass(_cls, cls)
             return _cls(*_args)
@@ -46,8 +47,7 @@ def is_basic(self) -> bool:
 
     def is_valid(self) -> bool:
         return self.page_size > 0 and (
-            self.total_count == MANY_MORE
-            or self.total_count >= 0
+            self.total_count == MANY_MORE or self.total_count >= 0
         )
 
     def has_many_more(self) -> bool:
@@ -72,20 +72,27 @@ class OffsetCursor(PageCursor):
     def is_valid(self) -> bool:
         return (
             super().is_valid()
-            and 0 <= self.start_offset
+            and 0 <= self.start_offset <= MAX_OFFSET
+            and (
+                self.total_count == MANY_MORE
+                or self.start_offset < self.total_count
+            )
         )
 
     def is_first_page(self) -> bool:
         return self.start_offset == 0
 
     def next_cursor(self):
-        return dataclasses.replace(self, start_offset=(self.start_offset + self.page_size))
+        _next = dataclasses.replace(self, start_offset=(self.start_offset + self.page_size))
+        return (_next if _next.is_valid() else None)
 
     def prev_cursor(self):
-        return dataclasses.replace(self, start_offset=(self.start_offset - self.page_size))
+        _prev = dataclasses.replace(self, start_offset=(self.start_offset - self.page_size))
+        return (_prev if _prev.is_valid() else None)
 
     def first_cursor(self):
-        return dataclasses.replace(self, start_offset=0)
+        _first = dataclasses.replace(self, start_offset=0)
+        return (_first if _first.is_valid() else None)
 
 
 @dataclasses.dataclass
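
(an aside on the MAX_OFFSET guard above: deep offset-paging just runs out of
pages instead of erroring -- a sketch, using the values defined in
page_cursor.py as patched)

    from trove.trovesearch.page_cursor import MANY_MORE, OffsetCursor

    # near the offset ceiling (MAX_OFFSET = 9997), the next page would
    # overshoot, so next_cursor() gives None and no 'next' link is rendered
    _deep = OffsetCursor(page_size=13, total_count=MANY_MORE, start_offset=9990)
    assert _deep.is_valid()
    assert _deep.next_cursor() is None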
diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py
index 8c7234222..67469e80f 100644
--- a/trove/trovesearch/search_params.py
+++ b/trove/trovesearch/search_params.py
@@ -51,10 +51,6 @@
 DEFAULT_PAGE_SIZE = 13
 MAX_PAGE_SIZE = 101
 
-# limits on paging
-VALUESEARCH_MAX = 234
-CARDSEARCH_MAX = 9997
-
 # between each step in a property path "foo.bar.baz"
 PROPERTYPATH_DELIMITER = '.'
@@ -94,11 +90,11 @@ class BaseTroveParams:
     accept_mediatype: str | None
 
     @classmethod
-    def from_querystring(cls, querystring: str) -> BaseTroveParams:  # TODO py3.11: typing.Self
+    def from_querystring(cls, querystring: str) -> typing.Self:
         return cls.from_queryparams(queryparams_from_querystring(querystring))
 
     @classmethod
-    def from_queryparams(cls, queryparams: QueryparamDict) -> BaseTroveParams:
+    def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self:
         return cls(**cls.parse_queryparams(queryparams))
 
     @classmethod
diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py
index d60a6d998..19bbdfe6c 100644
--- a/trove/trovesearch/search_response.py
+++ b/trove/trovesearch/search_response.py
@@ -7,11 +7,7 @@
     PageCursor,
     ReproduciblyRandomSampleCursor,
 )
-from trove.trovesearch.search_params import (
-    VALUESEARCH_MAX,
-    CARDSEARCH_MAX,
-    CardsearchParams,
-)
+from trove.trovesearch.search_params import CardsearchParams
 from trove.vocab.namespaces import TROVE
 from trove.vocab.trove import trove_indexcard_namespace
 
@@ -80,10 +76,6 @@ def __post_init__(self):
 class PagedResponse:
     cursor: PageCursor
 
-    @property
-    def max_offset(self) -> int:
-        raise NotImplementedError
-
     @property
     def total_result_count(self) -> BoundedCount:
         return (
@@ -99,8 +91,6 @@ class CardsearchResponse(PagedResponse):
     related_propertypath_results: list['PropertypathUsage']
     cardsearch_params: CardsearchParams
 
-    max_offset = CARDSEARCH_MAX
-
     def __post_init__(self):
         _cursor = self.cursor
         if (
@@ -114,24 +104,14 @@ def __post_init__(self):
                     for (_i, _id) in enumerate(_cursor.first_page_ids)
                 }
                 self.search_result_page.sort(key=lambda _r: _ordering_by_id[_r.card_id])
-            else:
-                _should_start_reproducible_randomness = (
-                    not _cursor.has_many_more()
-                    and any(
-                        not _filter.is_type_filter()  # look for a non-default filter
-                        for _filter in self.cardsearch_params.cardsearch_filter_set
-                    )
-                )
-                if _should_start_reproducible_randomness:
-                    _cursor.first_page_ids = [_result.card_id for _result in self.search_result_page]
+            elif not _cursor.has_many_more():
+                _cursor.first_page_ids = [_result.card_id for _result in self.search_result_page]
 
 
 @dataclasses.dataclass
 class ValuesearchResponse(PagedResponse):
     search_result_page: Iterable[ValuesearchResult]
 
-    max_offset = VALUESEARCH_MAX
-
 
 ###
 # local helpers

From 46ccc83f4009bfa3f0d6e7da621bf8c26b8a42e5 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Mon, 11 Nov 2024 13:22:49 -0500
Subject: [PATCH 14/14] restore tests._testutil

---
 tests/_testutil.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 tests/_testutil.py

diff --git a/tests/_testutil.py b/tests/_testutil.py
new file mode 100644
index 000000000..40bbc2f9f
--- /dev/null
+++ b/tests/_testutil.py
@@ -0,0 +1,12 @@
+from unittest import mock
+
+
+def patch_feature_flag(*flag_names, up=True):
+    from share.models.feature_flag import FeatureFlag
+    _old_isup = FeatureFlag.objects.flag_is_up
+
+    def _patched_isup(flag_name):
+        if flag_name in flag_names:
+            return up
+        return _old_isup(flag_name)
+    return mock.patch.object(FeatureFlag.objects, 'flag_is_up', new=_patched_isup)
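
(how tests._testutil.patch_feature_flag is meant to be used -- a sketch,
with a made-up flag name; mock.patch.object works as a context manager
or decorator)

    from tests._testutil import patch_feature_flag

    def test_with_flag_up(self):
        with patch_feature_flag('SOME_FLAG_NAME'):  # hypothetical flag name
            ...  # FeatureFlag.objects.flag_is_up('SOME_FLAG_NAME') returns True

    def test_with_flag_down(self):
        with patch_feature_flag('SOME_FLAG_NAME', up=False):
            ...  # returns False for this flag; others hit the real lookup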