diff --git a/api/search/views.py b/api/search/views.py
index a8fc19cb9..12075a82d 100644
--- a/api/search/views.py
+++ b/api/search/views.py
@@ -8,7 +8,7 @@
 from api import authentication
 from share.search import exceptions
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 
 class Sharev2ElasticSearchView(views.APIView):
@@ -32,7 +32,7 @@ def _handle_request(self, request):
         if 'scroll' in queryparams:
             return http.HttpResponseForbidden(reason='Scroll is not supported.')
         try:
-            specific_index = IndexStrategy.get_for_sharev2_search(requested_index_strategy)
+            specific_index = index_strategy.get_index_for_sharev2_search(requested_index_strategy)
         except exceptions.IndexStrategyError as error:
             raise http.Http404(str(error))
         try:
diff --git a/api/views/feeds.py b/api/views/feeds.py
index 4934b2c28..417d479fa 100644
--- a/api/views/feeds.py
+++ b/api/views/feeds.py
@@ -10,7 +10,7 @@
 import pendulum
 import sentry_sdk
 
-from share.search import IndexStrategy
+from share.search import index_strategy
 from share.search.exceptions import IndexStrategyError
 from share.util.xml import strip_illegal_xml_chars
@@ -34,6 +34,8 @@ class MetadataRecordsRSS(Feed):
     description = 'Updates to the SHARE open dataset'
     author_name = 'SHARE'
 
+    _search_index: index_strategy.IndexStrategy.SpecificIndex
+
     def title(self, obj):
         query = json.dumps(obj.get('query', 'All'))
         return prepare_string('SHARE: Atom feed for query: {}'.format(query))
@@ -41,7 +43,7 @@ def title(self, obj):
     def get_object(self, request):
         self._order = request.GET.get('order')
         elastic_query = request.GET.get('elasticQuery')
-        self._index_strategy = IndexStrategy.get_for_sharev2_search(request.GET.get('indexStrategy'))
+        self._search_index = index_strategy.get_index_for_sharev2_search(request.GET.get('indexStrategy'))
 
         if self._order not in {'date_modified', 'date_updated', 'date_created', 'date_published'}:
             self._order = 'date_modified'
@@ -62,7 +64,7 @@ def get_object(self, request):
     def items(self, obj):
         try:
-            json_response = self._index_strategy.pls_handle_search__sharev2_backcompat(
+            json_response = self._search_index.pls_handle_search__sharev2_backcompat(
                 request_body=obj,
             )
         except IndexStrategyError:
diff --git a/project/settings.py b/project/settings.py
index d091e9e7c..0dafab53f 100644
--- a/project/settings.py
+++ b/project/settings.py
@@ -314,52 +314,15 @@ def split(string, delim):
     'TIMEOUT': int(os.environ.get('ELASTICSEARCH_TIMEOUT', '45')),
     'CHUNK_SIZE': int(os.environ.get('ELASTICSEARCH_CHUNK_SIZE', 2000)),
     'MAX_RETRIES': int(os.environ.get('ELASTICSEARCH_MAX_RETRIES', 7)),
-    'INDEX_STRATEGIES': {},  # populated below based on environment
 }
 ELASTICSEARCH5_URL = (
     os.environ.get('ELASTICSEARCH5_URL')
-    or os.environ.get('ELASTICSEARCH_URL')
+    or os.environ.get('ELASTICSEARCH_URL')  # backcompat
 )
-if ELASTICSEARCH5_URL:
-    ELASTICSEARCH['INDEX_STRATEGIES']['sharev2_elastic5'] = {
-        'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic5.Sharev2Elastic5IndexStrategy',
-        'CLUSTER_SETTINGS': {
-            'URL': ELASTICSEARCH5_URL,
-        },
-    }
 ELASTICSEARCH8_URL = os.environ.get('ELASTICSEARCH8_URL')
-if ELASTICSEARCH8_URL:
-    ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
-    ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
-    ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')
-    ELASTICSEARCH8_CLUSTER_SETTINGS = {
-        'URL': ELASTICSEARCH8_URL,
-        'AUTH': (
-            (ELASTICSEARCH8_USERNAME, ELASTICSEARCH8_SECRET)
-            if ELASTICSEARCH8_SECRET is not None
-            else None
-        ),
-        'CERT_PATH': ELASTICSEARCH8_CERT_PATH,
-    }
-    ELASTICSEARCH['INDEX_STRATEGIES'].update({
-        'sharev2_elastic8': {
-            'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy',
-            'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
-        },
-        'trove_indexcard_flats': {
-            'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy',
-            'CLUSTER_SETTINGS': ELASTICSEARCH8_CLUSTER_SETTINGS,
-        },
-    })
-DEFAULT_INDEX_STRATEGY_FOR_LEGACY_SEARCH = (
-    'sharev2_elastic5'
-    if ELASTICSEARCH5_URL
-    else (
-        'sharev2_elastic8'
-        if ELASTICSEARCH8_URL
-        else None
-    )
-)
+ELASTICSEARCH8_CERT_PATH = os.environ.get('ELASTICSEARCH8_CERT_PATH')
+ELASTICSEARCH8_USERNAME = os.environ.get('ELASTICSEARCH8_USERNAME', 'elastic')
+ELASTICSEARCH8_SECRET = os.environ.get('ELASTICSEARCH8_SECRET')
 
 # Seconds, not an actual celery settings
 CELERY_RETRY_BACKOFF_BASE = int(os.environ.get('CELERY_RETRY_BACKOFF_BASE', 2 if DEBUG else 10))
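Note: with the `INDEX_STRATEGIES` dict gone, each strategy reads connection details straight from these flat settings. A minimal sketch of the consuming side, mirroring the `elastic8.py` hunk later in this diff (the helper name `_es8_auth` is illustrative, not part of the diff):

```
# sketch only -- `_es8_auth` is a hypothetical helper name; the same logic
# appears inline in the elastic8.py hunk below
from django.conf import settings

def _es8_auth():
    # use basic auth only when a secret is configured
    if settings.ELASTICSEARCH8_SECRET is not None:
        return (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET)
    return None
```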
diff --git a/share/admin/__init__.py b/share/admin/__init__.py
index 7d1b67a75..9e68fe2e9 100644
--- a/share/admin/__init__.py
+++ b/share/admin/__init__.py
@@ -17,7 +17,7 @@
 from share.admin.celery import CeleryTaskResultAdmin
 from share.admin.jobs import HarvestJobAdmin
 from share.admin.readonly import ReadOnlyAdmin
-from share.admin.search import search_indexes_view
+from share.admin.search import search_indexes_view, search_index_mappings_view
 from share.admin.util import TimeLimitedPaginator, linked_fk, linked_many, SourceConfigFilter
 from share.harvest.scheduler import HarvestScheduler
 from share.models import (
@@ -48,6 +48,11 @@ def get_urls(self):
                 self.admin_view(search_indexes_view),
                 name='search-indexes',
             ),
+            path(
+                'search-index-mappings/<path:index_name>/',
+                self.admin_view(search_index_mappings_view),
+                name='search-index-mappings',
+            ),
             *super().get_urls(),
         ]
diff --git a/share/admin/search.py b/share/admin/search.py
index ce7e3aafe..fbf2446b0 100644
--- a/share/admin/search.py
+++ b/share/admin/search.py
@@ -1,13 +1,13 @@
 import logging
 
-from django.http.response import HttpResponseRedirect
+from django.http.response import HttpResponseRedirect, JsonResponse
 from django.template.response import TemplateResponse
 from django.urls import reverse
 
 from share.admin.util import admin_url
 from share.models.index_backfill import IndexBackfill
 from share.search.index_messenger import IndexMessenger
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 
 logger = logging.getLogger(__name__)
@@ -20,11 +20,12 @@ def search_indexes_view(request):
             'admin/search-indexes.html',
             context={
                 'search_url_prefix': _search_url_prefix(),
+                'mappings_url_prefix': _mappings_url_prefix(),
                 'index_status_by_strategy': _index_status_by_strategy(),
             },
         )
     if request.method == 'POST':
-        _specific_index = IndexStrategy.get_specific_index(request.POST['specific_indexname'])
+        _specific_index = index_strategy.get_specific_index(request.POST['specific_indexname'])
         _pls_doer = PLS_DOERS[request.POST['pls_do']]
         _pls_doer(_specific_index)
         _redirect_id = (
@@ -35,24 +36,34 @@
         return HttpResponseRedirect('#'.join((request.path, _redirect_id)))
 
 
+def search_index_mappings_view(request, index_name):
+    _specific_index = index_strategy.get_specific_index(index_name)
+    _mappings = _specific_index.pls_get_mappings()
+    return JsonResponse(_mappings)
+
+
 def _search_url_prefix():
     api_url = reverse('api:search')
     return f'{api_url}?indexStrategy='  # append strategyname or indexname
 
 
+def _mappings_url_prefix():
+    return '/admin/search-index-mappings/'
+
+
 def _index_status_by_strategy():
-    backfill_by_indexname = {
+    backfill_by_indexname: dict[str, IndexBackfill] = {
         backfill.specific_indexname: backfill
         for backfill in (
             IndexBackfill.objects
-            .filter(index_strategy_name__in=IndexStrategy.all_strategies_by_name().keys())
+            .filter(index_strategy_name__in=index_strategy.all_index_strategies().keys())
         )
     }
     status_by_strategy = {}
     _messenger = IndexMessenger()
-    for index_strategy in IndexStrategy.all_strategies():
-        current_index = index_strategy.for_current_index()
-        status_by_strategy[index_strategy.name] = {
+    for _index_strategy in index_strategy.all_index_strategies().values():
+        current_index = _index_strategy.for_current_index()
+        status_by_strategy[_index_strategy.name] = {
             'current': {
                 'status': current_index.pls_get_status(),
                 'backfill': _serialize_backfill(
@@ -62,7 +73,7 @@
             },
             'prior': sorted((
                 specific_index.pls_get_status()
-                for specific_index in index_strategy.each_specific_index()
+                for specific_index in _index_strategy.each_specific_index()
                 if not specific_index.is_current
             ), reverse=True),
             'queues': [
@@ -71,15 +82,18 @@
                     **_messenger.get_queue_stats(_queue_name),
                 }
                 for _queue_name in (
-                    index_strategy.urgent_messagequeue_name,
-                    index_strategy.nonurgent_messagequeue_name,
+                    _index_strategy.urgent_messagequeue_name,
+                    _index_strategy.nonurgent_messagequeue_name,
                 )
             ],
         }
     return status_by_strategy
 
 
-def _serialize_backfill(specific_index: IndexStrategy.SpecificIndex, backfill: IndexBackfill):
+def _serialize_backfill(
+    specific_index: index_strategy.IndexStrategy.SpecificIndex,
+    backfill: IndexBackfill | None,
+):
     if not specific_index.is_current:
         return {}
     if not backfill:
diff --git a/share/bin/search.py b/share/bin/search.py
index 80418440d..69f5c0eff 100644
--- a/share/bin/search.py
+++ b/share/bin/search.py
@@ -1,7 +1,7 @@
 from project.celery import app as celery_app
 
 from share.bin.util import command
-from share.search import IndexStrategy
+from share.search import index_strategy
 from share.search.exceptions import IndexStrategyError
 from share.search.daemon import IndexerDaemonControl
@@ -29,7 +29,7 @@ def purge(args, argv):
     Usage: {0} search purge <index_names>...
     """
     for index_name in args['<index_names>']:
-        specific_index = IndexStrategy.get_specific_index(index_name)
+        specific_index = index_strategy.get_specific_index(index_name)
         specific_index.pls_delete()
@@ -43,18 +43,16 @@ def setup(args, argv):
     if _is_initial:
         _specific_indexes = [
             _index_strategy.for_current_index()
-            for _index_strategy in IndexStrategy.all_strategies()
+            for _index_strategy in index_strategy.all_index_strategies().values()
         ]
     else:
         _index_or_strategy_name = args['<index_or_strategy_name>']
         try:
-            _specific_indexes = [
-                IndexStrategy.get_by_name(_index_or_strategy_name).for_current_index(),
-            ]
+            _specific_indexes = [index_strategy.get_index_strategy(_index_or_strategy_name).for_current_index()]
         except IndexStrategyError:
             try:
                 _specific_indexes = [
                     index_strategy.get_specific_index(_index_or_strategy_name),
                 ]
             except IndexStrategyError:
                 raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"')
diff --git a/share/checks.py b/share/checks.py
index 0a3ec321f..a53d2a228 100644
--- a/share/checks.py
+++ b/share/checks.py
@@ -2,18 +2,18 @@
 def check_all_index_strategies_current(app_configs, **kwargs):
-    from share.search import IndexStrategy
+    from share.search import index_strategy
     from share.search.exceptions import IndexStrategyError
     errors = []
-    for index_strategy in IndexStrategy.all_strategies():
+    for _index_strategy in index_strategy.all_index_strategies().values():
         try:
-            index_strategy.assert_strategy_is_current()
+            _index_strategy.assert_strategy_is_current()
         except IndexStrategyError as exception:
             errors.append(
                 checks.Error(
                     'IndexStrategy changed without checksum confirmation!',
                     hint=str(exception),
-                    obj=index_strategy,
+                    obj=_index_strategy,
                     id='share.search.E001',
                 )
             )
diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py
index df0903122..518baec67 100644
--- a/share/models/feature_flag.py
+++ b/share/models/feature_flag.py
@@ -31,6 +31,7 @@ class FeatureFlag(models.Model):
     IGNORE_SHAREV2_INGEST = 'ignore_sharev2_ingest'
     SUGGEST_CREATOR_FACET = 'suggest_creator_facet'
     FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed'
+    TROVESEARCH_DENORMILY = 'trovesearch_denormily'
 
     # name _should_ be one of the constants above, but that is not enforced by `choices`
     name = models.TextField(unique=True)
diff --git a/share/search/__init__.py b/share/search/__init__.py
index ac51d920e..7f723488c 100644
--- a/share/search/__init__.py
+++ b/share/search/__init__.py
@@ -1,6 +1,5 @@
 from share.search.messages import MessageType, MessagesChunk
-from share.search.index_strategy import IndexStrategy
 from share.search.index_messenger import IndexMessenger
 
-__all__ = ('IndexStrategy', 'IndexMessenger', 'MessageType', 'MessagesChunk',)
+__all__ = ('IndexMessenger', 'MessageType', 'MessagesChunk',)
diff --git a/share/search/daemon.py b/share/search/daemon.py
index 58108a1f3..90aedb855 100644
--- a/share/search/daemon.py
+++ b/share/search/daemon.py
@@ -11,7 +11,12 @@
 from kombu.mixins import ConsumerMixin
 import sentry_sdk
 
-from share.search import exceptions, messages, IndexStrategy, IndexMessenger
+from share.search import (
+    exceptions,
+    messages,
+    index_strategy,
+    IndexMessenger,
+)
 
 logger = logging.getLogger(__name__)
@@ -52,7 +57,7 @@ def start_daemonthreads_for_strategy(self, index_strategy):
         return _daemon
 
     def start_all_daemonthreads(self):
-        for _index_strategy in IndexStrategy.all_strategies():
+        for _index_strategy in index_strategy.all_index_strategies().values():
             self.start_daemonthreads_for_strategy(_index_strategy)
 
     def stop_daemonthreads(self, *, wait=False):
@@ -176,7 +181,7 @@ def __repr__(self):
 
 @dataclasses.dataclass
 class MessageHandlingLoop:
-    index_strategy: IndexStrategy
+    index_strategy: index_strategy.IndexStrategy
     message_type: messages.MessageType
     stop_event: threading.Event
     local_message_queue: queue.Queue
@@ -243,7 +248,6 @@ def _get_daemon_messages(self):
         return daemon_messages_by_target_id
 
     def _handle_some_messages(self):
-        # each message corresponds to one action on this daemon's index
         start_time = time.time()
         doc_count, error_count = 0, 0
         daemon_messages_by_target_id = self._get_daemon_messages()
@@ -265,7 +269,7 @@
                 logger.error('%sEncountered error: %s', self.log_prefix, message_response.error_text)
                 sentry_sdk.capture_message('error handling message', extras={'message_response': message_response})
             target_id = message_response.index_message.target_id
-            for daemon_message in daemon_messages_by_target_id.pop(target_id):
+            for daemon_message in daemon_messages_by_target_id.pop(target_id, ()):
                 daemon_message.ack()  # finally set it free
         if daemon_messages_by_target_id:  # should be empty by now
             logger.error('%sUnhandled messages?? %s', self.log_prefix, len(daemon_messages_by_target_id))
diff --git a/share/search/index_messenger.py b/share/search/index_messenger.py
index 7162ef533..0cd51293b 100644
--- a/share/search/index_messenger.py
+++ b/share/search/index_messenger.py
@@ -12,7 +12,7 @@
 from share.models import FeatureFlag
 from share.search.messages import MessagesChunk, MessageType
-from share.search.index_strategy import IndexStrategy
+from share.search import index_strategy
 
 logger = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def __init__(self, *, celery_app=None, index_strategys=None):
             if celery_app is None
             else celery_app
         )
-        self.index_strategys = index_strategys or tuple(IndexStrategy.all_strategies())
+        self.index_strategys = index_strategys or tuple(index_strategy.all_index_strategies().values())
 
     def notify_indexcard_update(self, indexcards, *, urgent=False):
         self.send_messages_chunk(
@@ -62,18 +62,18 @@ def notify_suid_update(self, suid_ids, *, urgent=False):
         )
 
     def incoming_messagequeue_iter(self, channel) -> typing.Iterable[kombu.Queue]:
-        for index_strategy in self.index_strategys:
-            yield kombu.Queue(channel=channel, name=index_strategy.urgent_messagequeue_name)
-            yield kombu.Queue(channel=channel, name=index_strategy.nonurgent_messagequeue_name)
+        for _index_strategy in self.index_strategys:
+            yield kombu.Queue(channel=channel, name=_index_strategy.urgent_messagequeue_name)
+            yield kombu.Queue(channel=channel, name=_index_strategy.nonurgent_messagequeue_name)
 
     def outgoing_messagequeue_iter(self, connection, message_type: MessageType, urgent: bool) -> typing.Iterable[kombu.simple.SimpleQueue]:
-        for index_strategy in self.index_strategys:
-            if message_type in index_strategy.supported_message_types:
+        for _index_strategy in self.index_strategys:
+            if message_type in _index_strategy.supported_message_types:
                 yield connection.SimpleQueue(
                     name=(
-                        index_strategy.urgent_messagequeue_name
+                        _index_strategy.urgent_messagequeue_name
                         if urgent
-                        else index_strategy.nonurgent_messagequeue_name
+                        else _index_strategy.nonurgent_messagequeue_name
                     ),
                 )
diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py
index 6daa53848..297702475 100644
--- a/share/search/index_strategy/__init__.py
+++ b/share/search/index_strategy/__init__.py
@@ -1,4 +1,90 @@
+from __future__ import annotations
+import functools
+from types import MappingProxyType
+
+from django.conf import settings
+
+from share.search.exceptions import IndexStrategyError
+from share.models import FeatureFlag
+from trove.trovesearch import search_params
+from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy
+from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy
+from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy
+from .trovesearch_denorm import TrovesearchDenormIndexStrategy
 from ._base import IndexStrategy
 
-__all__ = ('IndexStrategy',)
+__all__ = (
+    'IndexStrategy',
+    'all_index_strategies',
+    'get_index_for_sharev2_search',
+    'get_index_for_trovesearch',
+    'get_index_strategy',
+    'get_specific_index',
+)
+
+
+@functools.cache
+def all_index_strategies() -> MappingProxyType[str, IndexStrategy]:
+    return MappingProxyType({
+        _strategy.name: _strategy
+        for _strategy in _iter_all_index_strategies()
+    })
+
+
+def _iter_all_index_strategies():
+    if settings.ELASTICSEARCH5_URL:
+        yield Sharev2Elastic5IndexStrategy(name='sharev2_elastic5')
+    if settings.ELASTICSEARCH8_URL:
+        yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8')
+        yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats')
+        yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm')
+
+
+def get_index_strategy(strategyname: str) -> IndexStrategy:
+    try:
+        return all_index_strategies()[strategyname]
+    except KeyError:
+        raise IndexStrategyError(f'unknown index strategy "{strategyname}"')
+
+
+def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> IndexStrategy.SpecificIndex:
+    try:
+        _strategy = get_index_strategy(indexname_or_strategyname)
+        return (
+            _strategy.pls_get_default_for_searching()
+            if for_search
+            else _strategy.for_current_index()
+        )
+    except IndexStrategyError:
+        for _index_strategy in all_index_strategies().values():
+            try:
+                return _index_strategy.for_specific_index(indexname_or_strategyname)
+            except IndexStrategyError:
+                pass
+        raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"')
+
+
+def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex:
+    if requested_name:
+        _name = requested_name
+    elif (
+        settings.ELASTICSEARCH5_URL
+        and not FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT)
+    ):
+        _name = 'sharev2_elastic5'
+    elif settings.ELASTICSEARCH8_URL:
+        _name = 'sharev2_elastic8'
+    else:
+        raise IndexStrategyError('no available index for sharev2 search')
+    return get_specific_index(_name, for_search=True)
+
+
+def get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex:
+    if params.index_strategy_name:  # specific strategy requested
+        _name = params.index_strategy_name
+    elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY):
+        _name = 'trovesearch_denorm'
+    else:
+        _name = 'trove_indexcard_flats'
+    return get_specific_index(_name, for_search=True)
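Note: callers swap the old `IndexStrategy` classmethods for these module-level helpers. A minimal usage sketch (strategy names are from this diff; which strategies exist depends on the configured environment):

```
# sketch of the new module-level api
from share.search import index_strategy

for _strategy in index_strategy.all_index_strategies().values():
    print(_strategy.name)  # e.g. 'trove_indexcard_flats', 'trovesearch_denorm'

# accepts either a strategy name or a specific index name:
_specific_index = index_strategy.get_specific_index('trovesearch_denorm', for_search=True)
```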
diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py
index a2b14f7b5..1f21aefd5 100644
--- a/share/search/index_strategy/_base.py
+++ b/share/search/index_strategy/_base.py
@@ -1,12 +1,8 @@
 import abc
-import importlib
 import logging
 import typing
 
-from django.conf import settings
-
 from share.search import messages
-from share.models.feature_flag import FeatureFlag
 from share.models.index_backfill import IndexBackfill
 from share.search.exceptions import IndexStrategyError
 from share.search.index_status import IndexStatus
@@ -40,111 +36,17 @@ class IndexStrategy(abc.ABC):
     * may know of version- or cluster-specific features
       (should include identifiers like version numbers in subclass name)
     '''
-    CURRENT_STRATEGY_CHECKSUM: ChecksumIri = None  # set on subclasses to protect against accidents
-
-    __all_strategys_by_name = None  # cache for cls.all_strategies_by_name()
-
-    @classmethod
-    def clear_strategy_cache(self):
-        self.__all_strategys_by_name = None
-
-    @classmethod
-    def all_strategies_by_name(cls) -> 'dict[str, IndexStrategy]':
-        if cls.__all_strategys_by_name is None:
-            cls.__all_strategys_by_name = {
-                name: cls._load_from_settings(name, index_strategy_settings)
-                for name, index_strategy_settings
-                in settings.ELASTICSEARCH['INDEX_STRATEGIES'].items()
-            }
-        return cls.__all_strategys_by_name
-
-    @classmethod
-    def all_strategies(cls) -> 'typing.Iterable[IndexStrategy]':
-        yield from cls.all_strategies_by_name().values()
-
-    @classmethod
-    def get_by_name(cls, index_strategy_name: str) -> 'IndexStrategy':
-        try:
-            return cls.all_strategies_by_name()[index_strategy_name]
-        except KeyError:
-            raise IndexStrategyError(f'unknown index strategy "{index_strategy_name}"')
-
-    @classmethod
-    def get_specific_index(cls, specific_indexname: str) -> 'IndexStrategy.SpecificIndex':
-        for index_strategy in cls.all_strategies():
-            try:
-                return index_strategy.for_specific_index(specific_indexname)
-            except IndexStrategyError:
-                pass
-        raise IndexStrategyError(f'unrecognized indexname "{specific_indexname}"')
-
-    @classmethod
-    def get_for_sharev2_search(cls, requested_name=None) -> 'IndexStrategy.SpecificIndex':
-        if requested_name:
-            _name = requested_name
-        else:
-            _name = (
-                'sharev2_elastic8'
-                if FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT)
-                else settings.DEFAULT_INDEX_STRATEGY_FOR_LEGACY_SEARCH
-            )
-        try:  # could be a strategy name
-            return cls.get_by_name(_name).pls_get_default_for_searching()
-        except IndexStrategyError:
-            try:  # could be a specific indexname
-                return cls.get_specific_index(_name)
-            except IndexStrategyError:
-                raise IndexStrategyError(f'unknown name: "{_name}"')
-
-    @classmethod
-    def get_for_trove_search(cls, requested_name=None) -> 'IndexStrategy.SpecificIndex':
-        if requested_name:
-            _name = requested_name
-        else:
-            _name = 'trove_indexcard_flats'
-        try:  # could be a strategy name
-            return cls.get_by_name(_name).pls_get_default_for_searching()
-        except IndexStrategyError:
-            try:  # could be a specific indexname
-                return cls.get_specific_index(_name)
-            except IndexStrategyError:
-                raise IndexStrategyError(f'unknown name: "{_name}"')
-
-    @classmethod
-    def _load_from_settings(cls, index_strategy_name, index_strategy_settings):
-        assert set(index_strategy_settings) == {'INDEX_STRATEGY_CLASS', 'CLUSTER_SETTINGS'}, (
-            'values in settings.ELASTICSEARCH[\'INDEX_STRATEGIES\'] must have keys: '
-            'INDEX_STRATEGY_CLASS, CLUSTER_SETTINGS'
-        )
-        class_path = index_strategy_settings['INDEX_STRATEGY_CLASS']
-        module_name, separator, class_name = class_path.rpartition('.')
-        if not separator:
-            raise IndexStrategyError(f'INDEX_STRATEGY_CLASS should be importable dotted-path to an IndexStrategy class; got "{class_path}"')
-        assert module_name.startswith('share.search.index_strategy.'), (
-            'for now, INDEX_STRATEGY_CLASS must start with "share.search.index_strategy."'
-            f' (got "{module_name}")'
-        )
-        index_strategy_class = getattr(importlib.import_module(module_name), class_name)
-        assert issubclass(index_strategy_class, cls)
-        return index_strategy_class(
-            name=index_strategy_name,
-            cluster_settings=index_strategy_settings['CLUSTER_SETTINGS'],
-        )
-
-    def __init__(self, name, cluster_settings):
+    CURRENT_STRATEGY_CHECKSUM: ChecksumIri  # set on subclasses to protect against accidents
+
+    def __init__(self, name):
         self.name = name
-        self.cluster_settings = cluster_settings
 
     def __repr__(self):
         return ''.join((
             self.__class__.__qualname__,
-            f'(name={self.name})'
+            f'(name="{self.name}")'
         ))
 
-    @property
-    def cluster_url(self):
-        return self.cluster_settings['URL']
-
     @property
     def nonurgent_messagequeue_name(self):
         return f'{self.name}.nonurgent'
@@ -188,7 +90,7 @@ def assert_strategy_is_current(self):
             ```''')
 
     def for_specific_index(self, specific_indexname) -> 'IndexStrategy.SpecificIndex':
-        return self.SpecificIndex(self, specific_indexname)
+        return self.SpecificIndex(self, specific_indexname)  # type: ignore[abstract]
 
     def for_current_index(self) -> 'IndexStrategy.SpecificIndex':
         return self.for_specific_index(self.current_indexname)
@@ -321,6 +223,9 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
         def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
             raise NotImplementedError
 
+        def pls_get_mappings(self) -> dict:
+            raise NotImplementedError
+
         # TODO someday:
         # def pls_handle_propertysearch(self, propertysearch_params: PropertysearchParams) -> PropertysearchResponse:
         #     raise NotImplementedError
diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py
new file mode 100644
index 000000000..53daa11af
--- /dev/null
+++ b/share/search/index_strategy/_trovesearch_util.py
@@ -0,0 +1,246 @@
+from __future__ import annotations
+import base64
+from collections import defaultdict
+import contextlib
+import dataclasses
+import datetime
+import functools
+import json
+import logging
+import typing
+
+from django.db.models import Exists, OuterRef
+from primitive_metadata import primitive_rdf as rdf
+
+from trove import models as trove_db
+from trove.trovesearch.search_params import (
+    is_globpath,
+    Propertypath,
+)
+from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
+from trove.vocab.namespaces import (
+    DCTERMS,
+    FOAF,
+    OSFMAP,
+    OWL,
+    RDF,
+    RDFS,
+    SKOS,
+    TROVE,
+    XSD,
+)
+from trove.vocab.osfmap import is_date_property
+
+
+_logger = logging.getLogger(__name__)
+
+
+###
+# constants
+
+SKIPPABLE_PROPERTIES = (
+    OSFMAP.contains,  # too much, not helpful
+    OWL.sameAs,  # handled special
+)
+
+TITLE_PROPERTIES = (DCTERMS.title,)
+NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
+LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
+NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
+
+KEYWORD_LENGTH_MAX = 8191  # skip keyword terms that might exceed lucene's internal limit
+# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
+KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX}
+FLATTENED_MAPPING = {'type': 'flattened', 'ignore_above': KEYWORD_LENGTH_MAX}
+TEXT_MAPPING = {
+    'type': 'text',
+    'index_options': 'offsets',  # for highlighting
+}
+IRI_KEYWORD_MAPPING = {
+    'type': 'object',
+    'properties': {  # for indexing iri values two ways:
+        'exact': KEYWORD_MAPPING,  # the exact iri value (e.g. "https://foo.example/bar/")
+        'suffuniq': KEYWORD_MAPPING,  # "sufficiently unique" (e.g. "://foo.example/bar")
+    },
+}
+
+
+###
+# utilities
+
+def latest_rdf_for_indexcard_pks(indexcard_pks):
+    return (
+        trove_db.LatestIndexcardRdf.objects
+        .filter(indexcard_id__in=indexcard_pks)
+        .filter(Exists(  # only index items that have an osfmap_json representation
+            trove_db.DerivedIndexcard.objects
+            .filter(upriver_indexcard_id=OuterRef('indexcard_id'))
+            .filter(deriver_identifier__in=(
+                trove_db.ResourceIdentifier.objects
+                .queryset_for_iri(TROVE['derive/osfmap_json'])
+            ))
+        ))
+        .exclude(indexcard__deleted__isnull=False)
+        .select_related('indexcard__source_record_suid__source_config')
+        .prefetch_related('indexcard__focus_identifier_set')
+        .prefetch_related('indexcard__supplementary_rdf_set')
+    )
+
+
+def iri_synonyms(iri: str, rdfdoc: rdf.RdfGraph) -> set[str]:
+    # note: extremely limited inference -- assumes objects of owl:sameAs are not used as subjects
+    _synonyms = (
+        _synonym
+        for _synonym in rdfdoc.q(iri, OWL.sameAs)
+        if is_worthwhile_iri(_synonym)
+    )
+    return {iri, *_synonyms}
+
+
+def iris_synonyms(iris: typing.Iterable[str], rdfdoc: rdf.RdfGraph) -> set[str]:
+    return {
+        _synonym
+        for _iri in iris
+        for _synonym in iri_synonyms(_iri, rdfdoc)
+    }
+
+
+def propertypath_as_keyword(path: Propertypath) -> str:
+    assert not is_globpath(path)
+    return json.dumps([
+        get_sufficiently_unique_iri(_iri)
+        for _iri in path
+    ])
+
+
+def b64(value: str) -> str:
+    return base64.urlsafe_b64encode(value.encode()).decode()
+
+
+def b64_reverse(b64_str: str) -> str:
+    return base64.urlsafe_b64decode(b64_str.encode()).decode()
+
+
+def suffuniq_iris(iris: typing.Iterable[str]) -> list[str]:
+    # deduplicates, may reorder
+    return list({
+        get_sufficiently_unique_iri(_iri)
+        for _iri in iris
+    })
+
+
+def _dict_of_sets():
+    return defaultdict(set)
+
+
+@dataclasses.dataclass
+class GraphWalk:
+    rdfdoc: rdf.RdfGraph
+    focus_iri: str
+    already_visiting: set[str] = dataclasses.field(default_factory=set)
+    iri_values: dict[Propertypath, set[str]] = dataclasses.field(
+        default_factory=_dict_of_sets,
+    )
+    text_values: dict[Propertypath, set[rdf.Literal]] = dataclasses.field(
+        default_factory=_dict_of_sets,
+    )
+    date_values: dict[Propertypath, set[datetime.date]] = dataclasses.field(
+        default_factory=_dict_of_sets,
+    )
+    integer_values: dict[Propertypath, set[int]] = dataclasses.field(
+        default_factory=_dict_of_sets,
+    )
+    paths_walked: set[Propertypath] = dataclasses.field(default_factory=set)
+
+    def __post_init__(self):
+        for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri):
+            self.paths_walked.add(_walk_path)
+            if isinstance(_walk_obj, str):
+                self.iri_values[_walk_path].add(_walk_obj)
+            elif isinstance(_walk_obj, datetime.date):
+                self.date_values[_walk_path].add(_walk_obj)
+            elif isinstance(_walk_obj, int):
+                self.integer_values[_walk_path].add(_walk_obj)
+            elif isinstance(_walk_obj, rdf.Literal):
+                if XSD.integer in _walk_obj.datatype_iris:
+                    self.integer_values[_walk_path].add(_walk_obj)
+                if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
+                    self.text_values[_walk_path].add(_walk_obj)
+            # try for date in a date property, regardless of the above
+            if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)):
+                _date_str = (
+                    _walk_obj.unicode_value
+                    if isinstance(_walk_obj, rdf.Literal)
+                    else _walk_obj
+                )
+                try:
+                    _parsed_date = datetime.date.fromisoformat(_date_str)
+                except ValueError:
+                    _logger.debug('skipping malformatted date "%s"', _date_str)
+                else:
+                    self.date_values[_walk_path].add(_parsed_date)
+
+    def shortwalk_from(self, from_iri: str) -> GraphWalk:
+        return GraphWalk(
+            self.rdfdoc,
+            from_iri,
+            already_visiting={self.focus_iri},
+        )
+
+    def _walk_from_subject(
+        self,
+        iri: str,
+        path_so_far: tuple[str, ...] = (),
+    ) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
+        '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object
+        '''
+        if iri in self.already_visiting:
+            return
+        with self._visit(iri):
+            _twoples = self.rdfdoc.tripledict.get(iri, {})
+            for _next_steps, _obj in walk_twoples(_twoples):
+                _path = (*path_so_far, *_next_steps)
+                yield (_path, _obj)
+                if isinstance(_obj, str):  # step further for iri
+                    yield from self._walk_from_subject(_obj, path_so_far=_path)
+
+    @functools.cached_property
+    def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]:
+        _paths_by_iri: defaultdict[str, set[Propertypath]] = defaultdict(set)
+        for _path, _iris in self.iri_values.items():
+            for _iri in _iris:
+                _paths_by_iri[_iri].add(_path)
+        return _paths_by_iri
+
+    @contextlib.contextmanager
+    def _visit(self, focus_obj):
+        assert focus_obj not in self.already_visiting
+        self.already_visiting.add(focus_obj)
+        yield
+        self.already_visiting.discard(focus_obj)
+
+
+def walk_twoples(
+    twoples: rdf.RdfTwopleDictionary | rdf.Blanknode,
+) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
+    if isinstance(twoples, frozenset):
+        _iter_twoples = (
+            (_pred, _obj)
+            for _pred, _obj in twoples
+            if _pred not in SKIPPABLE_PROPERTIES
+        )
+    else:
+        _iter_twoples = (
+            (_pred, _obj)
+            for _pred, _obj_set in twoples.items()
+            if _pred not in SKIPPABLE_PROPERTIES
+            for _obj in _obj_set
+        )
+    for _pred, _obj in _iter_twoples:
+        _path = (_pred,)
+        if isinstance(_obj, frozenset):
+            for _innerpath, _innerobj in walk_twoples(_obj):
+                _fullpath = (*_path, *_innerpath)
+                yield (_fullpath, _innerobj)
+        else:
+            yield (_path, _obj)
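Note: `GraphWalk` front-loads the whole walk in `__post_init__`, so consumers just read the typed value dicts. A minimal sketch (the `_rdfdoc` and iri values are hypothetical; `trove_indexcard_flats.py` below shows real usage):

```
# sketch: values come out grouped by type and propertypath after construction
_walk = GraphWalk(_rdfdoc, 'https://osf.example/abcde')
for _path, _dates in _walk.date_values.items():
    ...  # e.g. index each date under its propertypath
# bounded re-walk from a related iri, without revisiting the focus:
_related_walk = _walk.shortwalk_from('https://osf.example/related')
```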
diff --git a/share/search/index_strategy/_util.py b/share/search/index_strategy/_util.py
index ffa9999f6..1908536a4 100644
--- a/share/search/index_strategy/_util.py
+++ b/share/search/index_strategy/_util.py
@@ -1,7 +1,4 @@
-import base64
-import dataclasses
 import datetime
-import json
 
 
 def timestamp_to_readable_datetime(timestamp_in_milliseconds):
@@ -12,14 +9,3 @@
         .fromtimestamp(seconds, tz=datetime.timezone.utc)
         .isoformat(timespec='minutes')
     )
-
-
-def encode_cursor_dataclass(dataclass_instance) -> str:
-    _as_json = json.dumps(dataclasses.astuple(dataclass_instance))
-    _cursor_bytes = base64.b64encode(_as_json.encode())
-    return _cursor_bytes.decode()
-
-
-def decode_cursor_dataclass(cursor: str, dataclass_class) -> dict:
-    _as_list = json.loads(base64.b64decode(cursor))
-    return dataclass_class(*_as_list)
diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py
index 4fded9788..7e772e41f 100644
--- a/share/search/index_strategy/elastic8.py
+++ b/share/search/index_strategy/elastic8.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
 import abc
 import collections
+import dataclasses
+from http import HTTPStatus
 import logging
 import typing
 
@@ -26,10 +29,14 @@ def __init__(self, *args, **kwargs):
         should_sniff = settings.ELASTICSEARCH['SNIFF']
         timeout = settings.ELASTICSEARCH['TIMEOUT']
         self.es8_client = elasticsearch8.Elasticsearch(
-            self.cluster_url,
+            settings.ELASTICSEARCH8_URL,
             # security:
-            ca_certs=self.cluster_settings.get('CERT_PATH'),
-            basic_auth=self.cluster_settings.get('AUTH'),
+            ca_certs=settings.ELASTICSEARCH8_CERT_PATH,
+            basic_auth=(
+                (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET)
+                if settings.ELASTICSEARCH8_SECRET is not None
+                else None
+            ),
             # retry:
             retry_on_timeout=True,
             request_timeout=timeout,
@@ -57,6 +64,13 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk) -> typin
         # yield (message_target_id, elastic_action) pairs
         raise NotImplementedError
 
+    def before_chunk(
+        self,
+        messages_chunk: messages.MessagesChunk,
+        indexnames: typing.Iterable[str],
+    ) -> None:
+        pass  # implement when needed
+
     ###
     # helper methods for subclasses to use (or override)
 
@@ -109,45 +123,56 @@ def each_specific_index(self):
     def pls_handle_messages_chunk(self, messages_chunk):
         self.assert_message_type(messages_chunk.message_type)
         if messages_chunk.message_type.is_backfill:
-            indexnames = [self.current_indexname]
+            _indexnames = {self.current_indexname}
         else:
-            indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live)
-        _targetid_by_docid = {}
-        done_counter = collections.Counter()
-        bulk_stream = streaming_bulk(
+            _indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live)
+        self.before_chunk(messages_chunk, _indexnames)
+        _action_tracker = _ActionTracker()
+        _bulk_stream = streaming_bulk(
             self.es8_client,
-            self._elastic_actions_with_index(messages_chunk, indexnames, _targetid_by_docid),
+            self._elastic_actions_with_index(messages_chunk, _indexnames, _action_tracker),
             raise_on_error=False,
             max_retries=settings.ELASTICSEARCH['MAX_RETRIES'],
         )
-        for (_ok, _response) in bulk_stream:
+        for (_ok, _response) in _bulk_stream:
             (_op_type, _response_body) = next(iter(_response.items()))
             _status = _response_body.get('status')
             _docid = _response_body['_id']
+            _indexname = _response_body['_index']
             _is_done = _ok or (_op_type == 'delete' and _status == 404)
-            _message_target_id = _targetid_by_docid[_docid]
-            done_counter[_message_target_id] += 1
-            if done_counter[_message_target_id] >= len(indexnames):
+            if _is_done:
+                _action_tracker.action_done(_indexname, _docid)
+            else:
+                _action_tracker.action_errored(_indexname, _docid)
+                # yield error responses immediately
                 yield messages.IndexMessageResponse(
-                    is_done=_is_done,
-                    index_message=messages.IndexMessage(messages_chunk.message_type, _message_target_id),
-                    status_code=_response_body.get('status'),
-                    error_text=(
-                        None
-                        if _ok
-                        else str(_response_body)
-                    )
+                    is_done=False,
+                    index_message=messages.IndexMessage(
+                        messages_chunk.message_type,
+                        _action_tracker.get_message_id(_docid),
+                    ),
+                    status_code=_status,
+                    error_text=str(_response_body),
                 )
+        # yield successes after the whole chunk completes
+        # (since one message may involve several actions)
+        for _messageid in _action_tracker.all_done_messages():
+            yield messages.IndexMessageResponse(
+                is_done=True,
+                index_message=messages.IndexMessage(messages_chunk.message_type, _messageid),
+                status_code=HTTPStatus.OK.value,
+                error_text=None,
+            )
 
     # abstract method from IndexStrategy
-    def pls_make_default_for_searching(self, specific_index: 'SpecificIndex'):
+    def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex):
         self._set_indexnames_for_alias(
             self._alias_for_searching,
             {specific_index.indexname},
         )
 
     # abstract method from IndexStrategy
-    def pls_get_default_for_searching(self) -> 'SpecificIndex':
+    def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex:
         # a SpecificIndex for an alias will work fine for searching, but
         # will error if you try to invoke lifecycle hooks
         return self.for_specific_index(self._alias_for_searching)
@@ -166,12 +191,13 @@ def _alias_for_searching(self):
     def _alias_for_keeping_live(self):
         return f'{self.indexname_prefix}live'
 
-    def _elastic_actions_with_index(self, messages_chunk, indexnames, targetid_by_docid):
+    def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker):
         if not indexnames:
             raise ValueError('cannot index to no indexes')
         for _message_target_id, _elastic_action in self.build_elastic_actions(messages_chunk):
-            targetid_by_docid[_elastic_action['_id']] = _message_target_id
+            _docid = _elastic_action['_id']
             for _indexname in indexnames:
+                action_tracker.add_action(_message_target_id, _indexname, _docid)
                 yield {
                     **_elastic_action,
                     '_index': _indexname,
@@ -325,3 +351,37 @@ def pls_stop_keeping_live(self):
                 alias_name=self.index_strategy._alias_for_keeping_live,
             )
             logger.warning('%r: no longer kept live', self)
+
+        def pls_get_mappings(self):
+            return self.index_strategy.es8_client.indices.get_mapping(index=self.indexname).body
+
+
+@dataclasses.dataclass
+class _ActionTracker:
+    messageid_by_docid: dict[str, int] = dataclasses.field(default_factory=dict)
+    actions_by_messageid: dict[int, set[tuple[str, str]]] = dataclasses.field(
+        default_factory=lambda: collections.defaultdict(set),
+    )
+    errored_messageids: set[int] = dataclasses.field(default_factory=set)
+
+    def add_action(self, message_id: int, index_name: str, doc_id: str):
+        self.messageid_by_docid[doc_id] = message_id
+        self.actions_by_messageid[message_id].add((index_name, doc_id))
+
+    def action_done(self, index_name: str, doc_id: str):
+        _messageid = self.messageid_by_docid[doc_id]
+        _message_actions = self.actions_by_messageid[_messageid]
+        _message_actions.discard((index_name, doc_id))
+
+    def action_errored(self, index_name: str, doc_id: str):
+        _messageid = self.messageid_by_docid[doc_id]
+        self.errored_messageids.add(_messageid)
+
+    def get_message_id(self, doc_id: str):
+        return self.messageid_by_docid[doc_id]
+
+    def all_done_messages(self):
+        for _messageid, _actions in self.actions_by_messageid.items():
+            if _messageid not in self.errored_messageids:
+                assert not _actions
+                yield _messageid
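Note: `_ActionTracker` replaces the old `done_counter`: a message fanned out to several indexes is reported done only after every `(index, doc)` action succeeds, and any error excludes it from `all_done_messages()`. A minimal sketch of the bookkeeping:

```
# sketch: one message, two indexes -- done only when both actions complete
_tracker = _ActionTracker()
_tracker.add_action(1, 'index-a', 'doc-1')
_tracker.add_action(1, 'index-b', 'doc-1')
_tracker.action_done('index-a', 'doc-1')
_tracker.action_done('index-b', 'doc-1')
assert list(_tracker.all_done_messages()) == [1]
# had either action errored instead, message 1 would be excluded
```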
diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py
index 951921f56..13edb4881 100644
--- a/share/search/index_strategy/sharev2_elastic5.py
+++ b/share/search/index_strategy/sharev2_elastic5.py
@@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         should_sniff = settings.ELASTICSEARCH['SNIFF']
         self.es5_client = elasticsearch5.Elasticsearch(
-            self.cluster_url,
+            settings.ELASTICSEARCH5_URL,
             retry_on_timeout=True,
             timeout=settings.ELASTICSEARCH['TIMEOUT'],
             # sniff before doing anything
diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py
index e2b879d24..b9bfbd33b 100644
--- a/share/search/index_strategy/trove_indexcard_flats.py
+++ b/share/search/index_strategy/trove_indexcard_flats.py
@@ -1,32 +1,34 @@
 import base64
 from collections import defaultdict
-import contextlib
 import dataclasses
 import datetime
 import json
 import logging
 import re
 import uuid
-from typing import Iterable, ClassVar, Optional, Iterator
+from typing import Iterable, Iterator, Any
 
 from django.conf import settings
-from django.db.models import Exists, OuterRef
 import elasticsearch8
 from primitive_metadata import primitive_rdf
 
 from share.search import exceptions
 from share.search import messages
 from share.search.index_strategy.elastic8 import Elastic8IndexStrategy
-from share.search.index_strategy._util import encode_cursor_dataclass, decode_cursor_dataclass
 from share.util.checksum_iri import ChecksumIri
 from trove import models as trove_db
+from trove.trovesearch.page_cursor import (
+    MANY_MORE,
+    OffsetCursor,
+    PageCursor,
+    ReproduciblyRandomSampleCursor,
+)
 from trove.trovesearch.search_params import (
     CardsearchParams,
     ValuesearchParams,
     SearchFilter,
     Textsegment,
     SortParam,
-    PageParam,
     GLOB_PATHSTEP,
 )
 from trove.trovesearch.search_response import (
@@ -39,28 +41,19 @@
 )
 from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword
 from trove.vocab.osfmap import is_date_property
-from trove.vocab.namespaces import TROVE, FOAF, RDF, RDFS, DCTERMS, OWL, SKOS, OSFMAP
-
-
-logger = logging.getLogger(__name__)
-
-
-TITLE_PROPERTIES = (DCTERMS.title,)
-NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
-LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
-NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
-
-
-SKIPPABLE_PROPERTIES = (
-    OSFMAP.contains,
+from trove.vocab.namespaces import RDF, OWL
+from ._trovesearch_util import (
+    latest_rdf_for_indexcard_pks,
+    GraphWalk,
+    TITLE_PROPERTIES,
+    NAME_PROPERTIES,
+    LABEL_PROPERTIES,
+    NAMELIKE_PROPERTIES,
+    KEYWORD_LENGTH_MAX,
 )
-
-VALUESEARCH_MAX = 234
-CARDSEARCH_MAX = 9997
-
-KEYWORD_LENGTH_MAX = 8191  # skip keyword terms that might exceed lucene's internal limit
-# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
+
+logger = logging.getLogger(__name__)
 
 
 class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy):
@@ -173,22 +166,16 @@ def _build_sourcedoc(self, indexcard_rdf):
         _nested_iris = defaultdict(set)
         _nested_dates = defaultdict(set)
         _nested_texts = defaultdict(set)
-        _pathset = set()
-        for _walk_path, _walk_obj in _PredicatePathWalker(_rdfdoc.tripledict).walk_from_subject(indexcard_rdf.focus_iri):
-            _pathset.add(_walk_path)
-            if isinstance(_walk_obj, str):
-                _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _walk_obj, _rdfdoc)].add(_walk_obj)
-            elif isinstance(_walk_obj, datetime.date):
-                _nested_dates[_walk_path].add(datetime.date.isoformat(_walk_obj))
-            elif is_date_property(_walk_path[-1]):
-                try:
-                    datetime.date.fromisoformat(_walk_obj.unicode_value)
-                except ValueError:
-                    logger.debug('skipping malformatted date "%s" in %s', _walk_obj.unicode_value, indexcard_rdf)
-                else:
-                    _nested_dates[_walk_path].add(_walk_obj.unicode_value)
-            elif isinstance(_walk_obj, primitive_rdf.Literal):
-                _nested_texts[(_walk_path, tuple(_walk_obj.datatype_iris))].add(_walk_obj.unicode_value)
+        _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri)
+        for _walk_path, _walk_iris in _walk.iri_values.items():
+            for _iri_obj in _walk_iris:
+                _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj)
+        for _walk_path, _walk_dates in _walk.date_values.items():
+            for _date_obj in _walk_dates:
+                _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj))
+        for _walk_path, _walk_texts in _walk.text_values.items():
+            for _text_obj in _walk_texts:
+                _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value)
         _focus_iris = {indexcard_rdf.focus_iri}
         _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)}
         for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all():
@@ -204,11 +191,11 @@
             'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris),
             'iri_paths_present': [
                 iri_path_as_keyword(_path)
-                for _path in _pathset
+                for _path in _walk.paths_walked
             ],
             'iri_paths_present_suffuniq': [
                 iri_path_as_keyword(_path, suffuniq=True)
-                for _path in _pathset
+                for _path in _walk.paths_walked
             ],
             'nested_iri': list(filter(bool, (
                 self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc)
@@ -271,22 +258,7 @@ def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]):
         }
 
     def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
-        _indexcard_rdf_qs = (
-            trove_db.LatestIndexcardRdf.objects
-            .filter(indexcard_id__in=messages_chunk.target_ids_chunk)
-            .filter(Exists(
-                trove_db.DerivedIndexcard.objects
-                .filter(upriver_indexcard_id=OuterRef('indexcard_id'))
-                .filter(deriver_identifier__in=(
-                    trove_db.ResourceIdentifier.objects
-                    .queryset_for_iri(TROVE['derive/osfmap_json'])
-                ))
-            ))
-            .exclude(indexcard__deleted__isnull=False)
-            .select_related('indexcard__source_record_suid__source_config')
-            .prefetch_related('indexcard__focus_identifier_set')
-            .prefetch_related('indexcard__supplementary_rdf_set')
-        )
+        _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk)
         _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk)
         for _indexcard_rdf in _indexcard_rdf_qs:
             _suid = _indexcard_rdf.indexcard.source_record_suid
@@ -317,18 +289,23 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query
         )
 
     def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
-        _cursor = _CardsearchCursor.from_params(cardsearch_params)
+        _cursor = self._cardsearch_cursor(cardsearch_params)
         _sort = self._cardsearch_sort(cardsearch_params.sort_list)
         _query = self._cardsearch_query(
             cardsearch_params.cardsearch_filter_set,
             cardsearch_params.cardsearch_textsegment_set,
             cardsearch_cursor=_cursor,
         )
+        _from_offset = (
+            _cursor.start_offset
+            if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor)
+            else _cursor.start_offset - len(_cursor.first_page_ids)
+        )
         _search_kwargs = dict(
             query=_query,
             aggs=self._cardsearch_aggs(cardsearch_params),
             sort=_sort,
-            from_=_cursor.cardsearch_start_index(),
+            from_=_from_offset,
             size=_cursor.page_size,
             source=False,  # no need to get _source; _id is enough
         )
@@ -344,19 +321,15 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
         return self._cardsearch_response(cardsearch_params, _es8_response, _cursor)
 
     def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
-        _cursor = _SimpleCursor.from_page_param(valuesearch_params.page)
-        _is_date_search = all(
-            is_date_property(_path[-1])
-            for _path in valuesearch_params.valuesearch_propertypath_set
-        )
+        _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor)
+        _is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1])
         _search_kwargs = dict(
             query=self._cardsearch_query(
                 valuesearch_params.cardsearch_filter_set,
                 valuesearch_params.cardsearch_textsegment_set,
-                additional_filters=[{'terms': {'iri_paths_present': [
-                    iri_path_as_keyword(_path)
-                    for _path in valuesearch_params.valuesearch_propertypath_set
-                ]}}],
+                additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword(
+                    valuesearch_params.valuesearch_propertypath,
+                )}}],
            ),
            size=0,  # ignore cardsearch hits; just want the aggs
            aggs=(
@@ -379,11 +352,21 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value
 
     ###
     # query implementation
 
+    def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor:
+        _request_cursor = cardsearch_params.page_cursor
+        if (
+            _request_cursor.is_basic()
+            and not cardsearch_params.sort_list
+            and not cardsearch_params.cardsearch_textsegment_set
+        ):
+            return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor)
+        return OffsetCursor.from_cursor(_request_cursor)
+
     def _cardsearch_query(
             self, filter_set, textsegment_set, *,
             additional_filters=None,
-            cardsearch_cursor: Optional['_CardsearchCursor'] = None,
+            cardsearch_cursor: PageCursor | None = None,
     ) -> dict:
         _bool_query = {
             'filter': additional_filters or [],
@@ -405,15 +388,15 @@ def _cardsearch_query(
         else:
             raise ValueError(f'unknown filter operator {_searchfilter.operator}')
         _textq_builder = self._NestedTextQueryBuilder(
-            relevance_matters=bool(cardsearch_cursor and not cardsearch_cursor.random_sort),
+            relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor),
         )
         for _textsegment in textsegment_set:
             for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items():
                 _bool_query[_boolkey].extend(_textqueries)
-        if not cardsearch_cursor or not cardsearch_cursor.random_sort:
+        if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor):  # no need for randomness
             return {'bool': _bool_query}
-        if not cardsearch_cursor.first_page_uuids:
+        if not cardsearch_cursor.first_page_ids:
             # independent random sample
             return {
                 'function_score': {
@@ -422,7 +405,7 @@
                     'random_score': {},  # default random_score is fast and unpredictable
                 },
             }
-        _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_uuids}}
+        _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}}
         if cardsearch_cursor.is_first_page():
             # returning to a first page previously visited
             _bool_query['filter'].append(_firstpage_uuid_query)
@@ -434,7 +417,7 @@
                 'query': {'bool': _bool_query},
                 'boost_mode': 'replace',
                 'random_score': {
-                    'seed': ''.join(cardsearch_cursor.first_page_uuids),
+                    'seed': ''.join(cardsearch_cursor.first_page_ids),
                     'field': 'indexcard_uuid',
                 },
             },
@@ -451,46 +434,14 @@ def _cardsearch_aggs(self, cardsearch_params):
             ],
             'size': len(cardsearch_params.related_property_paths),
         }}
-        if cardsearch_params.unnamed_iri_values:
-            _aggs['global_agg'] = {
-                'global': {},
-                'aggs': {
-                    'filtervalue_info': {
-                        'nested': {'path': 'nested_iri'},
-                        'aggs': {
-                            'iri_values': {
-                                'terms': {
-                                    'field': 'nested_iri.iri_value',
-                                    'include': list(cardsearch_params.unnamed_iri_values),
-                                    'size': len(cardsearch_params.unnamed_iri_values),
-                                },
-                                'aggs': {
-                                    'type_iri': {'terms': {
-                                        'field': 'nested_iri.value_type_iri',
-                                    }},
-                                    'name_text': {'terms': {
-                                        'field': 'nested_iri.value_name_text.raw',
-                                    }},
-                                    'title_text': {'terms': {
-                                        'field': 'nested_iri.value_title_text.raw',
-                                    }},
-                                    'label_text': {'terms': {
-                                        'field': 'nested_iri.value_label_text.raw',
-                                    }},
-                                },
-                            },
-                        },
-                    },
-                },
-            }
         return _aggs
 
-    def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: '_SimpleCursor'):
-        _nested_iri_bool = {
-            'filter': [{'terms': {'nested_iri.suffuniq_path_from_focus': [
-                iri_path_as_keyword(_path, suffuniq=True)
-                for _path in valuesearch_params.valuesearch_propertypath_set
-            ]}}],
+    def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor):
+        _nested_iri_bool: dict[str, Any] = {
+            'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword(
+                valuesearch_params.valuesearch_propertypath,
+                suffuniq=True,
+            )}}],
             'must': [],
             'must_not': [],
             'should': [],
@@ -498,7 +449,7 @@
         _nested_terms_agg = {
             'field': 'nested_iri.iri_value',
             # WARNING: terribly inefficient pagination (part one)
-            'size': cursor.start_index + cursor.page_size + 1,
+            'size': cursor.start_offset + cursor.page_size + 1,
         }
         _iris = list(valuesearch_params.valuesearch_iris())
         if _iris:
@@ -552,11 +503,11 @@ def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams):
             'nested': {'path': 'nested_date'},
             'aggs': {
                 'value_at_propertypath': {
-                    'filter': {'terms': {
-                        'nested_date.suffuniq_path_from_focus': [
-                            iri_path_as_keyword(_path, suffuniq=True)
-                            for _path in valuesearch_params.valuesearch_propertypath_set
-                        ],
+                    'filter': {'term': {
+                        'nested_date.suffuniq_path_from_focus': iri_path_as_keyword(
+                            valuesearch_params.valuesearch_propertypath,
+                            suffuniq=True,
+                        ),
                     }},
                     'aggs': {
                         'count_by_year': {
@@ -579,28 +530,26 @@ def _valuesearch_response(
             self,
             valuesearch_params: ValuesearchParams,
             es8_response: dict,
-            cursor: '_SimpleCursor',
+            cursor: OffsetCursor,
     ):
         _iri_aggs = es8_response['aggregations'].get('in_nested_iri')
         if _iri_aggs:
             _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets']
             _bucket_count = len(_buckets)
             # WARNING: terribly inefficient pagination (part two)
-            _page_end_index = cursor.start_index + cursor.page_size
-            _bucket_page = _buckets[cursor.start_index:_page_end_index]  # discard prior pages
-            cursor.result_count = (
-                -1  # "many more"
+            _page_end_index = cursor.start_offset + cursor.page_size
+            _bucket_page = _buckets[cursor.start_offset:_page_end_index]  # discard prior pages
+            cursor.total_count = (
+                MANY_MORE
                 if (_bucket_count > _page_end_index)  # agg includes one more, if there
                 else _bucket_count
             )
             return ValuesearchResponse(
+                cursor=cursor,
                 search_result_page=[
                     self._valuesearch_iri_result(_iri_bucket)
                     for _iri_bucket in _bucket_page
                 ],
-                next_page_cursor=cursor.next_cursor(),
-                prev_page_cursor=cursor.prev_cursor(),
-                first_page_cursor=cursor.first_cursor(),
             )
         else:  # assume date
             _year_buckets = (
@@ -608,6 +557,7 @@
                 es8_response['aggregations']['in_nested_date']
                 ['value_at_propertypath']['count_by_year']['buckets']
             )
             return ValuesearchResponse(
+                cursor=PageCursor(len(_year_buckets)),
                 search_result_page=[
                     self._valuesearch_date_result(_year_bucket)
                     for _year_bucket in _year_buckets
@@ -724,7 +674,7 @@ def _cardsearch_sort(self, sort_list: tuple[SortParam]):
                 'path': 'nested_date',
                 'filter': {'term': {
                     'nested_date.suffuniq_path_from_focus': iri_path_as_keyword(
-                        [_sortparam.property_iri],
+                        _sortparam.propertypath,
                         suffuniq=True,
                     ),
                 }},
@@ -737,16 +687,16 @@ def _cardsearch_response(
             self,
             cardsearch_params: CardsearchParams,
             es8_response: dict,
-            cursor: '_CardsearchCursor',
+            cursor: OffsetCursor,
     ) -> CardsearchResponse:
         _es8_total = es8_response['hits']['total']
         if _es8_total['relation'] != 'eq':
-            cursor.result_count = -1  # "too many"
+            cursor.total_count = MANY_MORE
+        elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page():
+            # account for the filtered-out first page
+            cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids)
         else:  # exact (and small) count
-            cursor.result_count = _es8_total['value']
-            if cursor.random_sort and not cursor.is_first_page():
-                # account for the filtered-out first page
-                cursor.result_count += len(cursor.first_page_uuids)
+            cursor.total_count = _es8_total['value']
         _results = []
         for _es8_hit in es8_response['hits']['hits']:
             _card_iri = _es8_hit['_id']
@@ -754,36 +704,7 @@
                 card_iri=_card_iri,
                 text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)),
             ))
-        if cursor.is_first_page() and cursor.first_page_uuids:
-            # revisiting first page; reproduce original random order
-            _uuid_index = {
-                _uuid: _i
-                for (_i, _uuid) in enumerate(cursor.first_page_uuids)
-            }
-            _results.sort(key=lambda _r: _uuid_index[_r.card_uuid()])
-        else:
-            _should_start_reproducible_randomness = (
-                cursor.random_sort
-                and cursor.is_first_page()
-                and not cursor.first_page_uuids
-                and any(
-                    not _filter.is_type_filter()  # look for a non-default filter
-                    for _filter in cardsearch_params.cardsearch_filter_set
-                )
-            )
-            if _should_start_reproducible_randomness:
-                cursor.first_page_uuids = tuple(
-                    _result.card_uuid()
-                    for _result in _results
-                )
-        _filtervalue_info = []
-        if cardsearch_params.unnamed_iri_values:
-            _filtervalue_agg = es8_response['aggregations']['global_agg']['filtervalue_info']['iri_values']
-            _filtervalue_info.extend(
-                self._valuesearch_iri_result(_iri_bucket)
-                for _iri_bucket in _filtervalue_agg['buckets']
-            )
-        _relatedproperty_list = []
+        _relatedproperty_list: list[PropertypathUsage] = []
         if cardsearch_params.related_property_paths:
             _relatedproperty_list.extend(
                 PropertypathUsage(property_path=_path, usage_count=0)
@@ -797,17 +718,10 @@
             _path = tuple(json.loads(_bucket['key']))
             _relatedproperty_by_path[_path].usage_count += _bucket['doc_count']
         return CardsearchResponse(
-            total_result_count=(
-                TROVE['ten-thousands-and-more']
-                if cursor.has_many_more()
-                else cursor.result_count
-            ),
+            cursor=cursor,
             search_result_page=_results,
-            filtervalue_info=_filtervalue_info,
             related_propertypath_results=_relatedproperty_list,
-            next_page_cursor=cursor.next_cursor(),
-            prev_page_cursor=cursor.prev_cursor(),
-            first_page_cursor=cursor.first_cursor(),
+            cardsearch_params=cardsearch_params,
         )
 
     def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]:
@@ -979,126 +893,6 @@ def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]],
         return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}
 
-@dataclasses.dataclass
-class _SimpleCursor:
-    start_index: int
-    page_size: int
-    result_count: int | None  # use -1 to indicate "many more"
-
-    MAX_INDEX: ClassVar[int] = VALUESEARCH_MAX
-
-    @classmethod
-    def from_page_param(cls, page: PageParam) -> '_SimpleCursor':
-        if page.cursor:
-            return decode_cursor_dataclass(page.cursor, cls)
-        return cls(
-            start_index=0,
-            page_size=page.size,
-            result_count=None,  # should be set when results are in
-        )
-
-    def next_cursor(self) -> str | None:
-        if not self.result_count:
-            return None
-        _next = dataclasses.replace(self, start_index=(self.start_index + self.page_size))
-        return (
-            encode_cursor_dataclass(_next)
-            if _next.is_valid_cursor()
-            else None
-        )
-
-    def prev_cursor(self) -> str | None:
-        _prev = dataclasses.replace(self, start_index=(self.start_index - self.page_size))
-        return (
-            encode_cursor_dataclass(_prev)
-            if _prev.is_valid_cursor()
-            else None
-        )
-
-    def first_cursor(self) -> str | None:
-        if self.is_first_page():
-            return None
-        return encode_cursor_dataclass(dataclasses.replace(self, start_index=0))
-
-    def is_first_page(self) -> bool:
-        return self.start_index == 0
-
-    def has_many_more(self) -> bool:
self.result_count == -1 - - def max_index(self) -> int: - return ( - self.MAX_INDEX - if self.has_many_more() - else min(self.result_count, self.MAX_INDEX) - ) - - def is_valid_cursor(self) -> bool: - return 0 <= self.start_index < self.max_index() - - -@dataclasses.dataclass -class _CardsearchCursor(_SimpleCursor): - random_sort: bool # how to sort by relevance to nothingness? randomness! - first_page_uuids: tuple[str, ...] = () - - MAX_INDEX: ClassVar[int] = CARDSEARCH_MAX - - @classmethod - def from_params(cls, params: CardsearchParams) -> '_CardsearchCursor': - if params.page.cursor: - return decode_cursor_dataclass(params.page.cursor, cls) - return cls( - start_index=0, - page_size=params.page.size, - result_count=None, # should be set when results are in - random_sort=( - not params.sort_list - and not params.cardsearch_textsegment_set - ), - ) - - def cardsearch_start_index(self) -> int: - if self.is_first_page() or not self.random_sort: - return self.start_index - return self.start_index - len(self.first_page_uuids) - - -class _PredicatePathWalker: - WalkYield = tuple[tuple[str, ...], primitive_rdf.RdfObject] - _visiting: set[str | frozenset] - - def __init__(self, tripledict: primitive_rdf.RdfTripleDictionary): - self.tripledict = tripledict - self._visiting = set() - - def walk_from_subject(self, iri_or_blanknode, last_path: tuple[str, ...] = ()) -> Iterable[WalkYield]: - '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object - ''' - with self._visit(iri_or_blanknode): - _twopledict = ( - primitive_rdf.twopledict_from_twopleset(iri_or_blanknode) - if isinstance(iri_or_blanknode, frozenset) - else self.tripledict.get(iri_or_blanknode, {}) - ) - for _predicate_iri, _obj_set in _twopledict.items(): - if _predicate_iri not in SKIPPABLE_PROPERTIES: - _path = (*last_path, _predicate_iri) - for _obj in _obj_set: - if not isinstance(_obj, frozenset): # omit the blanknode as a value - yield (_path, _obj) - if isinstance(_obj, (str, frozenset)) and (_obj not in self._visiting): - # step further for iri or blanknode - yield from self.walk_from_subject(_obj, last_path=_path) - - @contextlib.contextmanager - def _visit(self, focus_obj): - assert focus_obj not in self._visiting - self._visiting.add(focus_obj) - yield - self._visiting.discard(focus_obj) - - @dataclasses.dataclass(frozen=True) class _NestedIriKey: '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py new file mode 100644 index 000000000..35320b5c1 --- /dev/null +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -0,0 +1,890 @@ +from __future__ import annotations +from collections import abc, defaultdict +import dataclasses +import functools +import json +import logging +import re +from typing import ( + Iterable, + Iterator, + Literal, +) + +from django.conf import settings +import elasticsearch8 +from primitive_metadata import primitive_rdf as rdf + +from share.search import exceptions +from share.search import messages +from share.search.index_strategy.elastic8 import Elastic8IndexStrategy +from share.util.checksum_iri import ChecksumIri +from trove import models as trove_db +from trove.trovesearch.page_cursor import ( + MANY_MORE, + OffsetCursor, + PageCursor, + ReproduciblyRandomSampleCursor, +) +from trove.trovesearch.search_params import ( + CardsearchParams, + Propertypath, + SearchFilter, + Textsegment, + ValueType, + 
ValuesearchParams, + is_globpath, +) +from trove.trovesearch.search_response import ( + CardsearchResponse, + CardsearchResult, + PropertypathUsage, + TextMatchEvidence, + ValuesearchResponse, + ValuesearchResult, +) +from trove.vocab.osfmap import is_date_property +from trove.vocab.namespaces import OWL, RDF +from . import _trovesearch_util as ts + + +logger = logging.getLogger(__name__) + + +class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): + CURRENT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TrovesearchDenormIndexStrategy', + hexdigest='fa8fe6459f658877f84620412dcab5e2e70d0c949d8977354c586dca99ff2f28', + ) + + # abstract method from IndexStrategy + @property + def supported_message_types(self): + return { + messages.MessageType.UPDATE_INDEXCARD, + messages.MessageType.BACKFILL_INDEXCARD, + } + + # abstract method from IndexStrategy + @property + def backfill_message_type(self): + return messages.MessageType.BACKFILL_INDEXCARD + + # abstract method from Elastic8IndexStrategy + def index_settings(self): + return {} + + # abstract method from Elastic8IndexStrategy + def index_mappings(self): + return { + 'dynamic': 'false', + 'dynamic_templates': self._dynamic_templates(), + 'properties': { + 'card': {'properties': self._card_mappings()}, + 'iri_value': {'properties': self._iri_value_mappings()}, + }, + } + + def _dynamic_templates(self): + return [ + {'dynamic_text_by_propertypath': { + 'path_match': '*.text_by_propertypath.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_text_by_depth': { + 'path_match': '*.text_by_depth.*', + 'mapping': ts.TEXT_MAPPING, + }}, + {'dynamic_date_by_propertypath': { + 'path_match': '*.date_by_propertypath.*', + 'mapping': { + 'type': 'date', + 'format': 'strict_date_optional_time', + }, + }}, + {'dynamic_int_by_propertypath': { + 'path_match': '*.int_by_propertypath.*', + 'mapping': {'type': 'long'}, + }}, + ] + + def _card_mappings(self): + return { + # simple keyword properties + 'card_iri': ts.KEYWORD_MAPPING, + 'card_pk': ts.KEYWORD_MAPPING, + 'suid': { + 'type': 'object', + 'properties': { + 'source_config_label': ts.KEYWORD_MAPPING, + 'source_record_identifier': ts.KEYWORD_MAPPING, + }, + }, + **self._paths_and_values_mappings(), + } + + def _iri_value_mappings(self): + return { + 'value_iri': ts.KEYWORD_MAPPING, + 'value_name': ts.KEYWORD_MAPPING, + 'value_title': ts.KEYWORD_MAPPING, + 'value_label': ts.KEYWORD_MAPPING, + 'at_card_propertypaths': ts.KEYWORD_MAPPING, + **self._paths_and_values_mappings(), + } + + def _paths_and_values_mappings(self): + return { + 'focus_iri': ts.IRI_KEYWORD_MAPPING, + 'propertypaths_present': ts.KEYWORD_MAPPING, + # flattened properties (dynamic sub-properties with keyword values) + 'iri_by_propertypath': ts.FLATTENED_MAPPING, + 'iri_by_depth': ts.FLATTENED_MAPPING, + # dynamic properties (see `_dynamic_templates`) + 'text_by_propertypath': {'type': 'object', 'dynamic': True}, + 'text_by_depth': {'type': 'object', 'dynamic': True}, + 'date_by_propertypath': {'type': 'object', 'dynamic': True}, + 'int_by_propertypath': {'type': 'object', 'dynamic': True}, + } + + # override method from Elastic8IndexStrategy + def before_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Iterable[str]): + # delete all per-value docs (to account for missing values) + self.es8_client.delete_by_query( + index=list(indexnames), + query={'bool': {'must': [ + {'terms': {'card.card_pk': messages_chunk.target_ids_chunk}}, + {'exists': {'field': 'iri_value.value_iri'}}, + ]}}, + ) + # 
(possible optimization: instead, hold onto doc_ids and (in `after_chunk`?) + # delete_by_query excluding those) + + # abstract method from Elastic8IndexStrategy + def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + _indexcard_rdf_qs = ts.latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) + _remaining_indexcard_pks = set(messages_chunk.target_ids_chunk) + for _indexcard_rdf in _indexcard_rdf_qs: + _docbuilder = self._SourcedocBuilder(_indexcard_rdf) + if not _docbuilder.should_skip(): # if skipped, will be deleted + _indexcard_pk = _indexcard_rdf.indexcard_id + for _doc_id, _doc in _docbuilder.build_docs(): + _index_action = self.build_index_action( + doc_id=_doc_id, + doc_source=_doc, + ) + yield _indexcard_pk, _index_action + _remaining_indexcard_pks.discard(_indexcard_pk) + # delete any that were skipped for any reason + for _indexcard_pk in _remaining_indexcard_pks: + yield _indexcard_pk, self.build_delete_action(_indexcard_pk) + + ### + # implement abstract IndexStrategy.SpecificIndex + + class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + return self.index_strategy.es8_client.search( + index=self.indexname, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse: + _querybuilder = _CardsearchQueryBuilder(cardsearch_params) + _search_kwargs = _querybuilder.build() + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.index_strategy.es8_client.search( + index=self.indexname, + source=False, # no need to get _source, identifiers are enough + docvalue_fields=['card.card_iri'], + highlight={ + 'require_field_match': False, + 'fields': {'card.text_by_propertypath.*': {}}, + }, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self.index_strategy._cardsearch_response( + cardsearch_params, + _es8_response, + _querybuilder.response_cursor, + ) + + # abstract method from IndexStrategy.SpecificIndex + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse: + _path = valuesearch_params.valuesearch_propertypath + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = is_date_property(_path[-1]) + _query = ( + _build_date_valuesearch(valuesearch_params) + if _is_date_search + else _build_iri_valuesearch(valuesearch_params, _cursor) + ) + if settings.DEBUG: + logger.info(json.dumps(_query, indent=2)) + try: + _es8_response = self.index_strategy.es8_client.search( + **_query, + index=self.indexname, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return ( + self.index_strategy._valuesearch_dates_response(valuesearch_params, _es8_response) + if _is_date_search + else self.index_strategy._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + ) + + ### + # building sourcedocs + + @dataclasses.dataclass + class _SourcedocBuilder: + '''build elasticsearch sourcedocs for an rdf document + ''' + indexcard_rdf: trove_db.IndexcardRdf + indexcard: trove_db.Indexcard = 
dataclasses.field(init=False) + focus_iri: str = dataclasses.field(init=False) + rdfdoc: rdf.RdfTripleDictionary = dataclasses.field(init=False) + + def __post_init__(self) -> None: + self.indexcard = self.indexcard_rdf.indexcard + self.focus_iri = self.indexcard_rdf.focus_iri + self.rdfdoc = self.indexcard_rdf.as_rdfdoc_with_supplements() + + def should_skip(self) -> bool: + _suid = self.indexcard.source_record_suid + return ( + # skip cards that belong to an obsolete suid with a later duplicate + _suid.has_forecompat_replacement() + # ...or that are without some value for name/title/label + or not any(self.rdfdoc.q(self.focus_iri, ts.NAMELIKE_PROPERTIES)) + ) + + def build_docs(self) -> Iterator[tuple[str, dict]]: + # index once without `iri_value` + yield self._doc_id(), {'card': self._card_subdoc} + for _iri in self._fullwalk.paths_by_iri: + yield self._doc_id(_iri), { + 'card': self._card_subdoc, + 'iri_value': self._iri_value_subdoc(_iri), + } + + def _doc_id(self, value_iri=None) -> str: + _card_pk = str(self.indexcard.pk) + return ( + _card_pk + if value_iri is None + else f'{_card_pk}-{ts.b64(value_iri)}' + ) + + @functools.cached_property + def _fullwalk(self) -> ts.GraphWalk: + return ts.GraphWalk(self.rdfdoc, self.focus_iri) + + @functools.cached_property + def _card_subdoc(self) -> dict: + return { + 'card_iri': self.indexcard.get_iri(), + 'card_pk': str(self.indexcard.pk), + 'suid': { + 'source_record_identifier': self.indexcard.source_record_suid.identifier, + 'source_config_label': self.indexcard.source_record_suid.source_config.label, + }, + **self._paths_and_values(self._fullwalk), + } + + def _iri_value_subdoc(self, iri: str) -> dict: + _shortwalk = self._fullwalk.shortwalk_from(iri) + return { + 'value_iri': iri, + 'value_iris': self._exact_and_suffuniq_iris(iri), + 'value_name': list(self._texts_at_properties(_shortwalk, ts.NAME_PROPERTIES)), + 'value_title': list(self._texts_at_properties(_shortwalk, ts.TITLE_PROPERTIES)), + 'value_label': list(self._texts_at_properties(_shortwalk, ts.LABEL_PROPERTIES)), + 'at_card_propertypaths': [ + ts.propertypath_as_keyword(_path) + for _path in self._fullwalk.paths_by_iri[iri] + ], + **self._paths_and_values(_shortwalk), + } + + def _paths_and_values(self, walk: ts.GraphWalk): + return { + 'focus_iri': self._exact_and_suffuniq_iris(walk.focus_iri), + 'propertypaths_present': self._propertypaths_present(walk), + 'iri_by_propertypath': self._iris_by_propertypath(walk), + 'iri_by_depth': self._iris_by_depth(walk), + 'text_by_propertypath': self._texts_by_propertypath(walk), + 'text_by_depth': self._texts_by_depth(walk), + 'date_by_propertypath': self._dates_by_propertypath(walk), + 'int_by_propertypath': self._ints_by_propertypath(walk), + } + + def _propertypaths_present(self, walk: ts.GraphWalk): + return [ + ts.propertypath_as_keyword(_path) + for _path in walk.paths_walked + ] + + def _iris_by_propertypath(self, walk: ts.GraphWalk): + return { + _path_field_name(_path): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) + for _path, _iris in walk.iri_values.items() + } + + def _iris_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _iris in walk.iri_values.items(): + _by_depth[len(_path)].update(_iris) + return { + _depth_field_name(_depth): ts.suffuniq_iris(ts.iris_synonyms(_iris, self.rdfdoc)) + for _depth, _iris in _by_depth.items() + } + + def _texts_by_propertypath(self, walk: ts.GraphWalk): + return { + _path_field_name(_path): [_text.unicode_value for _text in 
_text_set] + for _path, _text_set in walk.text_values.items() + } + + def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]): + for _property in properties: + for _text_literal in walk.text_values.get((_property,), []): + yield _text_literal.unicode_value + + def _texts_by_depth(self, walk: ts.GraphWalk): + _by_depth: dict[int, set[str]] = defaultdict(set) + for _path, _text_set in walk.text_values.items(): + _by_depth[len(_path)].update(_text.unicode_value for _text in _text_set) + return { + _depth_field_name(_depth): list(_value_set) + for _depth, _value_set in _by_depth.items() + } + + def _dates_by_propertypath(self, walk: ts.GraphWalk): + return { + _path_field_name(_path): [ + _date.isoformat() + for _date in _value_set + ] + for _path, _value_set in walk.date_values.items() + } + + def _ints_by_propertypath(self, walk: ts.GraphWalk): + return { + _path_field_name(_path): list(_value_set) + for _path, _value_set in walk.integer_values.items() + } + + def _exact_and_suffuniq_iris(self, iri: str): + _synonyms = ts.iri_synonyms(iri, self.rdfdoc) + return { + 'exact': list(_synonyms), + 'suffuniq': ts.suffuniq_iris(_synonyms), + } + + ### + # normalizing search responses + + def _valuesearch_iris_response( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: OffsetCursor, + ) -> ValuesearchResponse: + _iri_aggs = es8_response['aggregations'].get('agg_valuesearch_iris') + _buckets = _iri_aggs['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly hacky pagination (part two) + _page_end_index = cursor.start_offset + cursor.page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages + cursor.total_count = ( + MANY_MORE + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count + ) + return ValuesearchResponse( + cursor=cursor, + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + ) + + def _valuesearch_dates_response( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + ) -> ValuesearchResponse: + _year_buckets = ( + es8_response['aggregations'] + ['agg_valuesearch_dates'] + ['buckets'] + ) + return ValuesearchResponse( + cursor=PageCursor(len(_year_buckets)), + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + ) + + def _valuesearch_iri_result(self, iri_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=iri_bucket['key'], + value_type=_bucketlist(iri_bucket.get('agg_type_iri', [])), + name_text=_bucketlist(iri_bucket.get('agg_value_name', [])), + title_text=_bucketlist(iri_bucket.get('agg_value_title', [])), + label_text=_bucketlist(iri_bucket.get('agg_value_label', [])), + match_count=iri_bucket['doc_count'], + ) + + def _valuesearch_date_result(self, date_bucket) -> ValuesearchResult: + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) + + def _cardsearch_response( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: OffsetCursor, + ) -> CardsearchResponse: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + 
len(cursor.first_page_ids) + else: # exact (and small) count + cursor.total_count = _es8_total['value'] + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['fields']['card.card_iri'][0] + _results.append(CardsearchResult( + card_iri=_card_iri, + card_pk=_es8_hit['_id'], + text_match_evidence=list(self._gather_textmatch_evidence(_card_iri, _es8_hit)), + )) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths + ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['agg_related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchResponse( + cursor=cursor, + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + cardsearch_params=cardsearch_params, + ) + + def _gather_textmatch_evidence(self, card_iri, es8_hit) -> Iterator[TextMatchEvidence]: + for _field, _snippets in es8_hit.get('highlight', {}).items(): + (_, _, _encoded_path) = _field.rpartition('.') + _property_path = _parse_path_field_name(_encoded_path) + for _snippet in _snippets: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=rdf.literal(_snippet), + card_iri=card_iri, + ) + + +### +# building queries + +@dataclasses.dataclass +class _BoolBuilder: + bool_innards: dict[str, list[dict]] = dataclasses.field( + default_factory=lambda: { + 'filter': [], + 'must_not': [], + 'must': [], + 'should': [], + }, + ) + + def as_query(self): + return {'bool': self.bool_innards} + + def add_boolpart(self, key: str, query: dict) -> None: + self.bool_innards[key].append(query) + + def add_boolparts(self, boolparts: Iterator[tuple[str, dict]]): + for _key, _query in boolparts: + self.add_boolpart(_key, _query) + + +@dataclasses.dataclass +class _QueryHelper: + base_field: Literal['card', 'iri_value'] + textsegment_set: frozenset[Textsegment] + filter_set: frozenset[SearchFilter] + relevance_matters: bool + + def boolparts(self) -> Iterator[tuple[str, dict]]: + yield from self.iri_boolparts() + yield from self.text_boolparts() + + def iri_boolparts(self) -> Iterator[tuple[str, dict]]: + # iri-keyword filters + for _searchfilter in self.filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + yield 'must_not', self._iri_filter(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + yield 'filter', self._iri_filter(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + yield 'filter', self._presence_query(_searchfilter) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + yield 'must_not', self._presence_query(_searchfilter) + elif _searchfilter.operator.is_date_operator(): + yield 'filter', self._date_filter(_searchfilter) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + + def text_boolparts(self) -> Iterator[tuple[str, dict]]: + # text-based queries + for _textsegment in self.textsegment_set: + if _textsegment.is_negated: + yield 'must_not', self._exact_text_query(_textsegment) + elif not _textsegment.is_fuzzy: + yield 'must', self._exact_text_query(_textsegment) + else: + yield 'must', 
self._fuzzy_text_must_query(_textsegment) + if self.relevance_matters: + yield 'should', self._fuzzy_text_should_query(_textsegment) + + def _presence_query(self, search_filter) -> dict: + return _any_query([ + self._path_presence_query(_path) + for _path in search_filter.propertypath_set + ]) + + def _path_presence_query(self, path: Propertypath): + _field = f'{self.base_field}.propertypaths_present' + return {'term': {_field: ts.propertypath_as_keyword(path)}} + + def _iri_filter(self, search_filter) -> dict: + _iris = ts.suffuniq_iris(search_filter.value_set) + return _any_query([ + self._path_iri_query(_path, _iris) + for _path in search_filter.propertypath_set + ]) + + def _path_iri_query(self, path, suffuniq_iris): + if path == (OWL.sameAs,): + _field = f'{self.base_field}.focus_iri.suffuniq' + elif is_globpath(path): + _field = f'{self.base_field}.iri_by_depth.{_depth_field_name(len(path))}' + else: + _field = f'{self.base_field}.iri_by_propertypath.{_path_field_name(path)}' + return {'terms': {_field: suffuniq_iris}} + + def _date_filter(self, search_filter): + return _any_query([ + self._date_filter_for_path(_path, search_filter.operator, search_filter.value_set) + for _path in search_filter.propertypath_set + ]) + + def _date_filter_for_path(self, path, filter_operator, value_set): + _field = f'{self.base_field}.date_by_propertypath.{_path_field_name(path)}' + if filter_operator == SearchFilter.FilterOperator.BEFORE: + _value = min(value_set) # rely on string-comparable isoformat + return {'range': {_field: {'lt': _daterange_value(_value)}}} + elif filter_operator == SearchFilter.FilterOperator.AFTER: + _value = max(value_set) # rely on string-comparable isoformat + return {'range': {_field: {'gt': _daterange_value(_value)}}} + elif filter_operator == SearchFilter.FilterOperator.AT_DATE: + return _any_query([ + {'range': {_field: {'gte': _filtervalue, 'lte': _filtervalue}}} + for _filtervalue in map(_daterange_value, value_set) + ]) + else: + raise ValueError(f'invalid date filter operator (got {filter_operator})') + + def _text_field_name(self, propertypath: Propertypath): + return ( + f'{self.base_field}.text_by_depth.{_depth_field_name(len(propertypath))}' + if is_globpath(propertypath) + else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' + ) + + def _exact_text_query(self, textsegment: Textsegment) -> dict: + # TODO: textsegment.is_openended (prefix query) + return _any_query([ + {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} + for _path in textsegment.propertypath_set + ]) + + def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: + # TODO: textsegment.is_openended (prefix query) + return _any_query([ + {'match': { + self._text_field_name(_path): { + 'query': textsegment.text, + 'fuzziness': 'AUTO', + # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) + }, + }} + for _path in textsegment.propertypath_set + ]) + + def _fuzzy_text_should_query(self, textsegment: Textsegment): + _slop = len(textsegment.text.split()) + return _any_query([ + {'match_phrase': { + self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop}, + }} + for _path in textsegment.propertypath_set + ]) + + +@dataclasses.dataclass +class _CardsearchQueryBuilder: + params: CardsearchParams + + def build(self): + return { + 'query': self._cardsearch_query(), + 'aggs': self._cardsearch_aggs(), + 'sort': list(self._cardsearch_sorts()) or None, + 'from_': 
self._cardsearch_start_offset(), + 'size': self.response_cursor.page_size, + } + + @functools.cached_property + def response_cursor(self) -> OffsetCursor: + _request_cursor = self.params.page_cursor + if ( + _request_cursor.is_basic() + and not self.params.sort_list + and not self.params.cardsearch_textsegment_set + ): + return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return OffsetCursor.from_cursor(_request_cursor) + + def _cardsearch_start_offset(self): + if ( + self.response_cursor.is_first_page() + or not isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) + ): + return self.response_cursor.start_offset + return self.response_cursor.start_offset - len(self.response_cursor.first_page_ids) + + def _cardsearch_query(self) -> dict: + _bool = _BoolBuilder() + _bool.add_boolparts( + _QueryHelper( + base_field='card', + textsegment_set=self.params.cardsearch_textsegment_set, + filter_set=self.params.cardsearch_filter_set, + relevance_matters=(not self.params.sort_list), + ).boolparts(), + ) + # exclude iri_value docs (possible optimization: separate indexes) + _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}}) + return ( + self._randomly_ordered_query(_bool) + if isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) + else _bool.as_query() + ) + + def _randomly_ordered_query(self, _bool: _BoolBuilder) -> dict: + assert isinstance(self.response_cursor, ReproduciblyRandomSampleCursor) + if not self.response_cursor.first_page_ids: + # independent random sample + return { + 'function_score': { + 'query': _bool.as_query(), + 'boost_mode': 'replace', + 'random_score': {}, # default random_score is fast and unpredictable + }, + } + _firstpage_filter = {'terms': {'card.card_pk': self.response_cursor.first_page_ids}} + if self.response_cursor.is_first_page(): + # returning to a first page previously visited + _bool.add_boolpart('filter', _firstpage_filter) + return _bool.as_query() + # get a subsequent page using reproducible randomness + _bool.add_boolpart('must_not', _firstpage_filter) + return { + 'function_score': { + 'query': _bool.as_query(), + 'boost_mode': 'replace', + 'random_score': { + 'seed': ''.join(self.response_cursor.first_page_ids), + 'field': 'card.card_pk', + }, + }, + } + + def _cardsearch_aggs(self): + _aggs = {} + if self.params.related_property_paths: + _aggs['agg_related_propertypath_usage'] = {'terms': { + 'field': 'card.propertypaths_present', + 'include': [ + ts.propertypath_as_keyword(_path) + for _path in self.params.related_property_paths + ], + 'size': len(self.params.related_property_paths), + }} + return _aggs + + def _cardsearch_sorts(self): + for _sortparam in self.params.sort_list: + _fieldkey = _path_field_name(_sortparam.propertypath) + if _sortparam.value_type == ValueType.DATE: + _field = f'card.date_by_propertypath.{_fieldkey}' + _unmapped_type = 'date' + elif _sortparam.value_type == ValueType.INTEGER: + _field = f'card.int_by_propertypath.{_fieldkey}' + _unmapped_type = 'long' + else: + raise ValueError(f'unsupported sort value type: {_sortparam}') + yield {_field: { + 'order': 'desc' if _sortparam.descending else 'asc', + 'unmapped_type': _unmapped_type, + }} + + +def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> dict: + _path = params.valuesearch_propertypath + _bool = _BoolBuilder() + _bool.add_boolpart('filter', {'term': { + 'iri_value.at_card_propertypaths': ts.propertypath_as_keyword(_path), + }}) + _bool.add_boolparts( + _QueryHelper( + base_field='card', + 
textsegment_set=params.cardsearch_textsegment_set, + filter_set=params.cardsearch_filter_set, + relevance_matters=False, + ).boolparts(), + ) + _bool.add_boolparts( + _QueryHelper( + base_field='iri_value', + textsegment_set=params.valuesearch_textsegment_set, + filter_set=params.valuesearch_filter_set, + relevance_matters=False, + ).boolparts() + ) + return { + 'query': _bool.as_query(), + 'size': 0, # ignore hits; just want the aggs + 'aggs': { + 'agg_valuesearch_iris': { + 'terms': { + 'field': 'iri_value.value_iri', + # WARNING: terribly hacky pagination (part one) + 'size': cursor.start_offset + cursor.page_size + 1, + }, + 'aggs': { + 'agg_type_iri': {'terms': { + 'field': f'iri_value.iri_by_propertypath.{_path_field_name((RDF.type,))}', + }}, + 'agg_value_name': {'terms': {'field': 'iri_value.value_name'}}, + 'agg_value_title': {'terms': {'field': 'iri_value.value_title'}}, + 'agg_value_label': {'terms': {'field': 'iri_value.value_label'}}, + }, + }, + }, + } + + +def _build_date_valuesearch(params: ValuesearchParams) -> dict: + assert not params.valuesearch_textsegment_set + assert not params.valuesearch_filter_set + _bool = _BoolBuilder() + _bool.add_boolparts( + _QueryHelper( + base_field='card', + textsegment_set=params.cardsearch_textsegment_set, + filter_set=params.cardsearch_filter_set, + relevance_matters=False, + ).boolparts(), + ) + # exclude iri_value docs (possible optimization: separate indexes) + _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}}) + _field = f'card.date_by_propertypath.{_path_field_name(params.valuesearch_propertypath)}' + return { + 'query': _bool.as_query(), + 'size': 0, # ignore hits; just want the aggs + 'aggs': {'agg_valuesearch_dates': { + 'date_histogram': { + 'field': _field, + 'calendar_interval': 'year', + 'format': 'yyyy', + 'order': {'_key': 'desc'}, + 'min_doc_count': 1, + }, + }} + } + + +### +# assorted helper functions + +def _bucketlist(agg_result: dict) -> list[str]: + return [ + _bucket['key'] + for _bucket in agg_result['buckets'] + ] + + +def _daterange_value(datevalue: str): + _cleanvalue = datevalue.strip() + if re.fullmatch(r'\d{4,}', _cleanvalue): + return f'{_cleanvalue}||/y' + if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/M' + if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/d' + raise ValueError(f'bad date value "{datevalue}"') + + +def _depth_field_name(depth: int) -> str: + return f'depth{depth}' + + +def _path_field_name(path: Propertypath) -> str: + return ts.b64(ts.propertypath_as_keyword(path)) + + +def _parse_path_field_name(path_field_name: str) -> Propertypath: + # inverse of _path_field_name + _list = json.loads(ts.b64_reverse(path_field_name)) + assert isinstance(_list, list) + assert all(isinstance(_item, str) for _item in _list) + return tuple(_list) + + +def _any_query(queries: abc.Collection[dict]): + if len(queries) == 1: + (_query,) = queries + return _query + return {'bool': {'should': list(queries), 'minimum_should_match': 1}} diff --git a/share/search/messages.py b/share/search/messages.py index 3eeda7204..a3930b42c 100644 --- a/share/search/messages.py +++ b/share/search/messages.py @@ -18,12 +18,6 @@ class MessageType(enum.Enum): # for indexcard-based indexes: UPDATE_INDEXCARD = 'update-indexcard' BACKFILL_INDEXCARD = 'backfill-indexcard' - # for identifier-based indexes: (TODO: remove?) 
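
A note on the "WARNING: terribly hacky pagination (part one/two)" comments above: elasticsearch terms aggregations cannot be offset-paginated server-side, so part one asks the aggregation for start_offset + page_size + 1 buckets, and part two slices out the requested page client-side, treating the presence of the one extra bucket as evidence of "many more" results. A minimal self-contained sketch of that arithmetic (the cursor fields mirror the diff, but these helper functions and the MANY_MORE stand-in are illustrative, not part of the codebase):

# illustrative helpers, not from the codebase; the cursor fields
# (start_offset, page_size) and bucket shape mirror the diff above
MANY_MORE = -1  # stand-in for trove.trovesearch.page_cursor.MANY_MORE

def agg_terms_size(start_offset: int, page_size: int) -> int:
    # "part one": request one bucket beyond the page being served
    return start_offset + page_size + 1

def slice_bucket_page(buckets: list[dict], start_offset: int, page_size: int):
    # "part two": discard prior pages client-side; the extra bucket
    # (if present) means the total is only known to be "many more"
    _page_end = start_offset + page_size
    _page = buckets[start_offset:_page_end]
    _total = MANY_MORE if len(buckets) > _page_end else len(buckets)
    return _page, _total

# deeper pages re-fetch (and throw away) every earlier bucket,
# hence the "terribly inefficient" warnings
_page, _total = slice_bucket_page([{'key': _k} for _k in 'abcdefgh'], 3, 3)
assert [_b['key'] for _b in _page] == ['d', 'e', 'f']
assert _total == MANY_MORE
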
- IDENTIFIER_INDEXED = 'identifier-indexed' - BACKFILL_IDENTIFIER = 'backfill-identifier' - # for aggregating identifier usage across index cards: - IDENTIFIER_USED = 'identifier-used' - BACKFILL_IDENTIFIER_USAGE = 'backfill-identifier-usage' @classmethod def from_int(cls, message_type_int: int): @@ -44,10 +38,6 @@ class IntMessageType(enum.IntEnum): BACKFILL_SUID = 6 UPDATE_INDEXCARD = 7 BACKFILL_INDEXCARD = 8 - IDENTIFIER_INDEXED = 9 - BACKFILL_IDENTIFIER = 10 - IDENTIFIER_USED = 11 - BACKFILL_IDENTIFIER_USAGE = 12 if __debug__: @@ -61,8 +51,6 @@ def _enum_keys(an_enum_class): BACKFILL_MESSAGE_TYPES = { MessageType.BACKFILL_SUID, MessageType.BACKFILL_INDEXCARD, - MessageType.BACKFILL_IDENTIFIER, - MessageType.BACKFILL_IDENTIFIER_USAGE, } diff --git a/share/shell_util.py b/share/shell_util.py index 305d82d4d..223f13304 100644 --- a/share/shell_util.py +++ b/share/shell_util.py @@ -4,13 +4,13 @@ """ from share import tasks -from share.search import IndexMessenger, IndexStrategy +from share.search import IndexMessenger, index_strategy from share.util import IDObfuscator __all__ = ( - 'tasks', - 'IndexMessenger', - 'IndexStrategy', 'IDObfuscator', + 'IndexMessenger', + 'index_strategy', + 'tasks', ) diff --git a/share/tasks/__init__.py b/share/tasks/__init__.py index c78bd4ada..4c4baecbe 100644 --- a/share/tasks/__init__.py +++ b/share/tasks/__init__.py @@ -9,7 +9,7 @@ from share.harvest.scheduler import HarvestScheduler from share import models as db from share.search.index_messenger import IndexMessenger -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy from share.search.messages import MessageType from share.tasks.jobs import HarvestJobConsumer from share.util.source_stat import SourceStatus @@ -61,7 +61,7 @@ def schedule_index_backfill(self, index_backfill_pk): _index_backfill = db.IndexBackfill.objects.get(pk=index_backfill_pk) _index_backfill.pls_note_scheduling_has_begun() try: - _index_strategy = IndexStrategy.get_by_name(_index_backfill.index_strategy_name) + _index_strategy = index_strategy.get_index_strategy(_index_backfill.index_strategy_name) _messenger = IndexMessenger(celery_app=self.app, index_strategys=[_index_strategy]) _messagetype = _index_strategy.backfill_message_type assert _messagetype in _index_strategy.supported_message_types diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 4a1940835..30d5e11a1 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -40,7 +40,7 @@
current index: {{index {% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} - {% trans "backfill" %} + {% trans "links" %} {{ indexes.current.status.creation_date|default:"--" }} @@ -89,10 +89,11 @@ current index: {{index {% if indexes.current.backfill.backfill_admin_url %} - {{ indexes.current.backfill.backfill_status }} + {% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }} - {% else %} - -- + {% endif %} + {% if indexes.current.status.creation_date %} + {% trans "mappings" %} + {% endif %} @@ -108,6 +109,7 @@
prior indexes {% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} + {% trans "links" %} {% trans "index name" %} {% for index_status in indexes.prior %} @@ -148,6 +150,10 @@ prior indexes {% endif %} + {% if index_status.creation_date %} + {% trans "mappings" %}
+ {% endif %} + {{ index_status.specific_indexname }} {% endfor %} diff --git a/tests/api/test_elasticsearch.py b/tests/api/test_elasticsearch.py index cb2510ffe..13e6688f5 100644 --- a/tests/api/test_elasticsearch.py +++ b/tests/api/test_elasticsearch.py @@ -52,10 +52,10 @@ def test_search(self): '/api/v2/search/creativeworks/_search?q=foo', '/api/v2/search/creativeworks/_search/?q=foo', ) - with mock.patch('api.search.views.IndexStrategy') as mock_IndexStrategy: + with mock.patch('api.search.views.index_strategy') as _mock_index_strategy_module: mock_handle_search = ( - mock_IndexStrategy - .get_for_sharev2_search + _mock_index_strategy_module + .get_index_for_sharev2_search .return_value .pls_handle_search__sharev2_backcompat ) diff --git a/tests/api/test_feeds.py b/tests/api/test_feeds.py index a08cb1069..49a016664 100644 --- a/tests/api/test_feeds.py +++ b/tests/api/test_feeds.py @@ -52,7 +52,7 @@ def fake_items(self, Graph): json.loads(formatted_item) for formatted_item in formatted_items ] - with mock.patch('api.views.feeds.IndexStrategy.get_for_sharev2_search') as mock_get_for_searching: + with mock.patch('api.views.feeds.index_strategy.get_index_for_sharev2_search') as mock_get_for_searching: mock_strategy = mock_get_for_searching.return_value mock_strategy.pls_handle_search__sharev2_backcompat.return_value = { 'hits': { diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index d8f557b17..e39c6140c 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -8,6 +8,8 @@ from share.bin.util import execute_cmd import share.version +from tests.share.search import patch_index_strategies + def run_sharectl(*args): """run sharectl, assert that it returned as expected, and return its stdout @@ -39,7 +41,7 @@ def test_purge(self, indexnames): def _get_specific_index(indexname): return mock_specific_indexes[indexname] - with mock.patch('share.bin.search.IndexStrategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: + with mock.patch('share.bin.search.index_strategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: run_sharectl('search', 'purge', *indexnames) assert mock_get_specific.mock_calls == [ mock.call(indexname) @@ -49,20 +51,20 @@ def _get_specific_index(indexname): mock_specific_index.pls_delete.assert_called_once_with() def test_setup_initial(self, settings): - expected_indexes = ['baz', 'bar', 'foo'] - mock_index_strategys = [ - mock.Mock() - for _ in expected_indexes - ] - with mock.patch('share.bin.search.IndexStrategy.all_strategies', return_value=mock_index_strategys): + _expected_indexes = ['baz', 'bar', 'foo'] + _mock_index_strategys = { + _name: mock.Mock() + for _name in _expected_indexes + } + with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') - for mock_index_strategy in mock_index_strategys: + for mock_index_strategy in _mock_index_strategys.values(): mock_specific_index = mock_index_strategy.for_current_index.return_value assert mock_specific_index.pls_setup.mock_calls == [mock.call(skip_backfill=True)] def test_setup_index(self): mock_index_strategy = mock.Mock() - with mock.patch('share.bin.search.IndexStrategy.get_by_name', return_value=mock_index_strategy): + with mock.patch('share.bin.search.index_strategy.get_index_strategy', return_value=mock_index_strategy): run_sharectl('search', 'setup', 'foo') mock_current_index = mock_index_strategy.for_current_index.return_value assert mock_current_index.pls_setup.mock_calls == 
[mock.call(skip_backfill=False)] diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py new file mode 100644 index 000000000..a7a49aaf9 --- /dev/null +++ b/tests/share/search/__init__.py @@ -0,0 +1,16 @@ +import contextlib +from unittest import mock + +from share.search import index_strategy + + +@contextlib.contextmanager +def patch_index_strategies(strategies: dict[str, index_strategy.IndexStrategy]): + index_strategy.all_index_strategies.cache_clear() + with mock.patch.object( + index_strategy, + 'all_index_strategies', + return_value=strategies, + ): + yield + index_strategy.all_index_strategies.cache_clear() diff --git a/tests/share/search/conftest.py b/tests/share/search/conftest.py index 65fe44825..b87757372 100644 --- a/tests/share/search/conftest.py +++ b/tests/share/search/conftest.py @@ -4,35 +4,10 @@ @pytest.fixture -def fake_elastic_strategies(settings): - settings.ELASTICSEARCH = { - **settings.ELASTICSEARCH, - 'INDEX_STRATEGIES': { - 'my_es5_strategy': { - 'CLUSTER_SETTINGS': {'URL': 'blah'}, - 'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic5.Sharev2Elastic5IndexStrategy', - }, - 'my_es8_strategy': { - 'CLUSTER_SETTINGS': {'URL': 'bleh'}, - 'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy', - }, - 'another_es8_strategy': { - 'CLUSTER_SETTINGS': {'URL': 'bluh'}, - 'INDEX_STRATEGY_CLASS': 'share.search.index_strategy.sharev2_elastic8.Sharev2Elastic8IndexStrategy', - }, - }, - } - return tuple(settings.ELASTICSEARCH['INDEX_STRATEGIES'].keys()) - - -@pytest.fixture -def mock_elastic_clients(fake_elastic_strategies): - with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5') as es5_mockpackage: - with mock.patch('share.search.index_strategy.elastic8.elasticsearch8') as es8_mockpackage: - es5_mockclient = es5_mockpackage.Elasticsearch.return_value - es8_mockclient = es8_mockpackage.Elasticsearch.return_value - yield { - 'my_es5_strategy': es5_mockclient, - 'my_es8_strategy': es8_mockclient, - 'another_es8_strategy': es8_mockclient, - } +def mock_elastic_clients(settings): + # set elastic urls to non-empty but non-usable values + settings.ELASTICSEARCH5_URL = 'fake://bleh' + settings.ELASTICSEARCH8_URL = 'fake://bluh' + with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5'): + with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): + yield diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py new file mode 100644 index 000000000..f3eff4813 --- /dev/null +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -0,0 +1,482 @@ +from typing import Iterable, Iterator +from datetime import date, timedelta +import itertools +import math +from urllib.parse import urlencode + +from primitive_metadata import primitive_rdf as rdf + +from tests import factories +from share.search import messages +from trove import models as trove_db +from trove.trovesearch.search_params import CardsearchParams, ValuesearchParams +from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF, DCAT +from ._with_real_services import RealElasticTestCase + + +BLARG = rdf.IriNamespace('https://blarg.example/blarg/') + + +class CommonTrovesearchTests(RealElasticTestCase): + _indexcard_focus_by_uuid: dict[str, str] + + def setUp(self): + super().setUp() + self._indexcard_focus_by_uuid = {} + + def test_for_smoke_without_daemon(self): + _indexcard = 
self._create_indexcard( + focus_iri=BLARG.hello, + rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + ) + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id], + ) + self._assert_happypath_without_daemon( + _messages_chunk, + expected_doc_count=1, + ) + + def test_for_smoke_with_daemon(self): + _indexcard = self._create_indexcard( + focus_iri=BLARG.hello, + rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + ) + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id], + ) + self._assert_happypath_with_daemon( + _messages_chunk, + expected_doc_count=1, + ) + + def test_cardsearch(self): + self._fill_test_data_for_querying() + _cardsearch_cases = itertools.chain( + self.cardsearch_cases(), + self.cardsearch_integer_cases(), + ) + for _queryparams, _expected_result_iris in _cardsearch_cases: + _cardsearch_params = CardsearchParams.from_querystring(urlencode(_queryparams)) + assert isinstance(_cardsearch_params, CardsearchParams) + _cardsearch_response = self.current_index.pls_handle_cardsearch(_cardsearch_params) + # assumes all results fit on one page + _actual_result_iris: set[str] | list[str] = [ + self._indexcard_focus_by_uuid[_result.card_uuid] + for _result in _cardsearch_response.search_result_page + ] + # test sort order only when expected results are ordered + if isinstance(_expected_result_iris, set): + _actual_result_iris = set(_actual_result_iris) + self.assertEqual(_expected_result_iris, _actual_result_iris, msg=f'?{_queryparams}') + + def test_cardsearch_pagination(self): + _cards: list[trove_db.Indexcard] = [] + _expected_iris = set() + _page_size = 7 + _total_count = 55 + _start_date = date(1999, 12, 31) + for _i in range(_total_count): + _card_iri = BLARG[f'i{_i}'] + _expected_iris.add(_card_iri) + _cards.append(self._create_indexcard(_card_iri, { + _card_iri: { + RDF.type: {BLARG.Thing}, + DCTERMS.title: {rdf.literal(f'card #{_i}')}, + DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))}, + }, + })) + self._index_indexcards(_cards) + # gather all pages results: + _querystring: str = f'page[size]={_page_size}' + _result_iris: set[str] = set() + _page_count = 0 + while True: + _cardsearch_response = self.current_index.pls_handle_cardsearch( + CardsearchParams.from_querystring(_querystring), + ) + _page_iris = { + self._indexcard_focus_by_uuid[_result.card_uuid] + for _result in _cardsearch_response.search_result_page + } + self.assertFalse(_result_iris.intersection(_page_iris)) + self.assertLessEqual(len(_page_iris), _page_size) + _result_iris.update(_page_iris) + _page_count += 1 + _next_cursor = _cardsearch_response.cursor.next_cursor() + if _next_cursor is None: + break + _querystring = urlencode({'page[cursor]': _next_cursor.as_queryparam_value()}) + self.assertEqual(_page_count, math.ceil(_total_count / _page_size)) + self.assertEqual(_result_iris, _expected_iris) + + def test_valuesearch(self): + self._fill_test_data_for_querying() + _valuesearch_cases = itertools.chain( + self.valuesearch_simple_cases(), + self.valuesearch_complex_cases(), + ) + for _queryparams, _expected_values in _valuesearch_cases: + _valuesearch_params = ValuesearchParams.from_querystring(urlencode(_queryparams)) + assert isinstance(_valuesearch_params, ValuesearchParams) + _valuesearch_response = self.current_index.pls_handle_valuesearch(_valuesearch_params) + # assumes all results fit on one page + _actual_values = { + _result.value_iri or 
_result.value_value + for _result in _valuesearch_response.search_result_page + } + self.assertEqual(_expected_values, _actual_values) + + def _fill_test_data_for_querying(self): + _card_a = self._create_indexcard(BLARG.a, { + BLARG.a: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, + DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('aaaa')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, + DCTERMS.references: {BLARG.b, BLARG.c}, + DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.b: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.references: {BLARG.c}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }) + _card_b = self._create_indexcard(BLARG.b, { + BLARG.b: { + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.b_same}, + DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.references: {BLARG.c}, + DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, + }, + BLARG.someone: { + FOAF.name: {rdf.literal('some one')}, + }, + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.title: {rdf.literal('cccc')}, + }, + }) + _card_c = self._create_indexcard(BLARG.c, { + BLARG.c: { + RDF.type: {BLARG.Thing}, + DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, + DCTERMS.creator: {BLARG.someone_else}, + DCTERMS.title: {rdf.literal('cccc')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, + DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. 
This place is best shunned and left uninhabited.', language='en')}, + }, + BLARG.someone_else: { + FOAF.name: {rdf.literal('some one else')}, + }, + }) + self._create_supplement(_card_a, BLARG.a, { + BLARG.a: { + DCTERMS.replaces: {BLARG.a_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), + }, + }, + }) + self._create_supplement(_card_b, BLARG.b, { + BLARG.b: { + DCTERMS.replaces: {BLARG.b_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), + }, + }, + }) + self._create_supplement(_card_c, BLARG.c, { + BLARG.c: { + DCTERMS.replaces: {BLARG.c_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), + }, + }, + }) + self._index_indexcards([_card_a, _card_b, _card_c]) + + def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + # using data from _fill_test_data_for_querying + yield ( + {}, # no query params + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'sort': 'dateCreated'}, + [BLARG.a, BLARG.b, BLARG.c], # ordered list + ) + yield ( + {'sort': '-dateCreated'}, + [BLARG.c, BLARG.b, BLARG.a], # ordered list + ) + yield ( + {'cardSearchFilter[creator]': BLARG.someone}, + {BLARG.a, BLARG.b}, + ) + yield ( + {'cardSearchFilter[creator]': ','.join((BLARG.someone_else, BLARG.someone))}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[resourceType]': BLARG.Thing}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[resourceType]': BLARG.Nothing}, + set(), + ) + yield ( + {'cardSearchFilter[references]': BLARG.b}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references]': BLARG.c}, + {BLARG.a, BLARG.b}, + ) + yield ( + {'cardSearchFilter[references.references]': BLARG.c}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references][is-present]': ''}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references.subject][is-present]': ''}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[references.references][is-absent]': ''}, + {BLARG.c, BLARG.b}, + ) + yield ( + {'cardSearchFilter[references.references.subject][is-absent]': ''}, + {BLARG.c, BLARG.b}, + ) + yield ( + {'cardSearchFilter[dcterms:replaces]': BLARG.b_past}, + {BLARG.b}, + ) + yield ( + {'cardSearchFilter[dcterms:replaces][is-absent]': ''}, + set(), + ) + yield ( + {'cardSearchFilter[subject]': BLARG.subj_ac}, + {BLARG.c, BLARG.a}, + ) + yield ( + {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, + {BLARG.b}, + ) + yield ( + { + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchFilter[creator]': BLARG.someone, + }, + {BLARG.b}, + ) + yield ( + { + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*]': 'cccc', + }, + {BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[resourceType]': ','.join((BLARG.Thing, BLARG.Another, BLARG.Nothing)), + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*,creator.name]': 'else', + }, + {BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[resourceType]': BLARG.Nothing, + 'cardSearchFilter[subject]': BLARG.subj_bc, + 'cardSearchText[*,creator.name]': 'else', + }, + set(), + ) + yield ( + {'cardSearchText[*,creator.name]': 'some'}, + {BLARG.a, BLARG.b, BLARG.c}, + ) + yield ( + { + 'cardSearchFilter[dateCreated]': '1999', + 'cardSearchText[*]': '', + }, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[dateCreated]': '1999-12'}, + {BLARG.a}, + ) + yield ( + {'cardSearchFilter[dateCreated]': '1999-11'}, + set(), + ) + yield ( + 
{'cardSearchFilter[dateCreated]': '2012-12-31'}, + {BLARG.b}, + ) + yield ( + {'cardSearchFilter[dateCreated][after]': '2030'}, + set(), + ) + yield ( + {'cardSearchFilter[dateCreated][after]': '2011'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchFilter[dateCreated][before]': '2012-12'}, + {BLARG.a}, + ) + yield ( + {'cardSearchText': 'bbbb'}, + {BLARG.b}, + ) + yield ( + {'cardSearchText': '-bbbb'}, + {BLARG.a, BLARG.c}, + ) + yield ( + {'cardSearchText': 'danger'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchText': 'dangre'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'cardSearchText': '"dangre"'}, + set(), + ) + yield ( + {'cardSearchText': 'danger -repulsive'}, + {BLARG.c}, + ) + yield ( + {'cardSearchText': '"nothing valued is here"'}, + {BLARG.a}, + ) + yield ( + {'cardSearchText': '"nothing valued here"'}, + set(), + ) + yield ( + {'cardSearchText': '"what is here"'}, + {BLARG.b}, + ) + + def cardsearch_integer_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + # cases that depend on integer values getting indexed + yield ( + {'sort[integer-value]': 'dcat:servesDataset.dcat:spatialResolutionInMeters'}, + [BLARG.b, BLARG.a, BLARG.c], # ordered list + ) + yield ( + {'sort[integer-value]': '-dcat:servesDataset.dcat:spatialResolutionInMeters'}, + [BLARG.c, BLARG.a, BLARG.b], # ordered list + ) + + def valuesearch_simple_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + yield ( + {'valueSearchPropertyPath': 'references'}, + {BLARG.b, BLARG.c}, + ) + yield ( + {'valueSearchPropertyPath': 'dateCreated'}, + {'1999', '2012', '2024'}, + ) + # TODO: more + + def valuesearch_complex_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchFilter[resourceType]': BLARG.Thing, + }, + {BLARG.b, BLARG.c}, + ) + yield ( + { + 'valueSearchPropertyPath': 'references', + 'valueSearchText': 'bbbb', + }, + {BLARG.b}, + ) + # TODO: more + + def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id for _indexcard in indexcards], + ) + self.assertTrue(all( + _response.is_done + for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) + )) + self.current_index.pls_refresh() + + def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: + _suid = factories.SourceUniqueIdentifierFactory() + _raw = factories.RawDatumFactory( + suid=_suid, + ) + _indexcard = trove_db.Indexcard.objects.create( + source_record_suid=_suid, + ) + # an osfmap_json card is required for indexing, but not used in these tests + trove_db.DerivedIndexcard.objects.create( + upriver_indexcard=_indexcard, + deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(TROVE['derive/osfmap_json']), + ) + trove_db.LatestIndexcardRdf.objects.create( + from_raw_datum=_raw, + indexcard=_indexcard, + focus_iri=focus_iri, + rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + turtle_checksum_iri='foo', # not enforced + ) + self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri + return _indexcard + + def _create_supplement( + self, + indexcard: trove_db.Indexcard, + focus_iri: str, + rdf_tripledict: rdf.RdfTripleDictionary, + ) -> trove_db.SupplementaryIndexcardRdf: + _supp_suid = factories.SourceUniqueIdentifierFactory() + _supp_raw = factories.RawDatumFactory(suid=_supp_suid) + return 
trove_db.SupplementaryIndexcardRdf.objects.create( + from_raw_datum=_supp_raw, + indexcard=indexcard, + supplementary_suid=_supp_suid, + focus_iri=focus_iri, + rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + turtle_checksum_iri='sup', # not enforced + ) diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 3a88879e5..46f133121 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -1,15 +1,13 @@ import contextlib -import unittest from unittest import mock -from django.test import override_settings, TransactionTestCase -from django.conf import settings +from django.test import TransactionTestCase from django.db import connections from project.celery import app as celery_app from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy # base class for testing IndexStrategy subclasses with actual elasticsearch. @@ -19,18 +17,12 @@ class RealElasticTestCase(TransactionTestCase): serialized_rollback = True # for TransactionTestCase; restore db after # required for subclasses - strategy_name_for_real: str - strategy_name_for_test: str - - @classmethod - def setUpClass(cls): - cls.__original_es_settings = settings.ELASTICSEARCH + def get_index_strategy(self) -> index_strategy.IndexStrategy: + raise NotImplementedError(f'{self.__class__} must implement `get_index_strategy`') def setUp(self): super().setUp() self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) - self.enterContext(self._settings_for_test()) - IndexStrategy.clear_strategy_cache() self.index_strategy = self.get_index_strategy() self.index_messenger = IndexMessenger( celery_app=celery_app, @@ -43,7 +35,6 @@ def setUp(self): def tearDown(self): super().tearDown() self.current_index.pls_delete() - IndexStrategy.clear_strategy_cache() # HACK: copied from TransactionTestCase._fixture_setup; restores db # to the state from before TransactionTestCase clobbered it (relies # on how django 3.2 implements `serialized_rollback = True`, above) @@ -57,43 +48,15 @@ def enterContext(self, context_manager): self.addCleanup(lambda: context_manager.__exit__(None, None, None)) return result - def get_index_strategy(self): - return IndexStrategy.get_by_name(self.strategy_name_for_test) - @contextlib.contextmanager def _daemon_up(self): - _daemon_control = IndexerDaemonControl( - celery_app, - daemonthread_context=self._settings_for_test, # will be called in daemonthread - ) + _daemon_control = IndexerDaemonControl(celery_app) _daemon_control.start_daemonthreads_for_strategy(self.get_index_strategy()) try: yield _daemon_control finally: _daemon_control.stop_daemonthreads(wait=True) - @contextlib.contextmanager - def _settings_for_test(self): - try: - _real_strategy_settings = ( - self.__original_es_settings - ['INDEX_STRATEGIES'] - [self.strategy_name_for_real] - ) - except KeyError: - raise unittest.SkipTest( - f'index strategy "{self.strategy_name_for_real}" not configured in' - " ELASTICSEARCH['INDEX_STRATEGIES'] (perhaps missing env)" - ) - _new_es_settings = { - **self.__original_es_settings, - 'INDEX_STRATEGIES': { # wipe out all configured strategies - self.strategy_name_for_test: _real_strategy_settings, - } - } - with override_settings(ELASTICSEARCH=_new_es_settings): - yield - # for test methods on 
subclasses to call: def _assert_happypath_without_daemon(self, messages_chunk, expected_doc_count): _responses = list(self.index_strategy.pls_handle_messages_chunk(messages_chunk)) diff --git a/tests/share/search/index_strategy/test_elastic8.py b/tests/share/search/index_strategy/test_elastic8.py index 4eeeef385..5de732690 100644 --- a/tests/share/search/index_strategy/test_elastic8.py +++ b/tests/share/search/index_strategy/test_elastic8.py @@ -46,11 +46,9 @@ def mock_es_client(self): yield es8_mockclient @pytest.fixture - def fake_strategy(self, mock_es_client): - strat = FakeElastic8IndexStrategy( - name='fake_es8', - cluster_settings={'URL': 'http://nowhere.example:12345/'}, - ) + def fake_strategy(self, mock_es_client, settings): + settings.ELASTICSEARCH8_URL = 'http://nowhere.example:12345/' + strat = FakeElastic8IndexStrategy(name='fake_es8') strat.assert_strategy_is_current() return strat diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 729eab0fb..6b1618301 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -1,19 +1,20 @@ import json +import unittest + +from django.conf import settings from tests import factories from share.search import messages +from share.search.index_strategy.sharev2_elastic5 import Sharev2Elastic5IndexStrategy from share.util import IDObfuscator from ._with_real_services import RealElasticTestCase +@unittest.skipUnless(settings.ELASTICSEARCH5_URL, 'missing ELASTICSEARCH5_URL setting') class TestSharev2Elastic5(RealElasticTestCase): # for RealElasticTestCase - strategy_name_for_real = 'sharev2_elastic5' - strategy_name_for_test = 'test_sharev2_elastic5' - - # override method from RealElasticTestCase def get_index_strategy(self): - index_strategy = super().get_index_strategy() + index_strategy = Sharev2Elastic5IndexStrategy('test_sharev2_elastic5') if not index_strategy.STATIC_INDEXNAME.startswith('test_'): index_strategy.STATIC_INDEXNAME = f'test_{index_strategy.STATIC_INDEXNAME}' return index_strategy diff --git a/tests/share/search/index_strategy/test_sharev2_elastic8.py b/tests/share/search/index_strategy/test_sharev2_elastic8.py index 0385cece3..7b1c76845 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic8.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic8.py @@ -2,14 +2,15 @@ from tests import factories from share.search import messages +from share.search.index_strategy.sharev2_elastic8 import Sharev2Elastic8IndexStrategy from share.util import IDObfuscator from ._with_real_services import RealElasticTestCase class TestSharev2Elastic8(RealElasticTestCase): # for RealElasticTestCase - strategy_name_for_real = 'sharev2_elastic8' - strategy_name_for_test = 'test_sharev2_elastic8' + def get_index_strategy(self): + return Sharev2Elastic8IndexStrategy('test_sharev2_elastic8') def setUp(self): super().setUp() diff --git a/tests/share/search/index_strategy/test_base_index_strategy.py b/tests/share/search/index_strategy/test_strategy_selection.py similarity index 55% rename from tests/share/search/index_strategy/test_base_index_strategy.py rename to tests/share/search/index_strategy/test_strategy_selection.py index d53cd37af..e24fb0a1a 100644 --- a/tests/share/search/index_strategy/test_base_index_strategy.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -1,30 +1,38 @@ +# TODO: update import pytest from share.search.exceptions import 
IndexStrategyError from share.search.index_strategy import ( + all_index_strategies, + get_index_strategy, + get_specific_index, + get_index_for_sharev2_search, IndexStrategy, sharev2_elastic5, sharev2_elastic8, + trove_indexcard_flats, + trovesearch_denorm, ) @pytest.fixture -def expected_strategy_classes(fake_elastic_strategies): +def expected_strategy_classes(): return { - 'my_es5_strategy': sharev2_elastic5.Sharev2Elastic5IndexStrategy, - 'my_es8_strategy': sharev2_elastic8.Sharev2Elastic8IndexStrategy, - 'another_es8_strategy': sharev2_elastic8.Sharev2Elastic8IndexStrategy, + 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, + 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, + 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, + 'trovesearch_denorm': trovesearch_denorm.TrovesearchDenormIndexStrategy, } class TestBaseIndexStrategy: - def test_get_by_name(self, mock_elastic_clients, expected_strategy_classes): + def test_get_index_strategy(self, mock_elastic_clients, expected_strategy_classes): for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - index_strategy = IndexStrategy.get_by_name(strategy_name) + index_strategy = get_index_strategy(strategy_name) assert isinstance(index_strategy, expected_strategy_class) - def test_all_strategies(self, mock_elastic_clients, expected_strategy_classes): - all_strategys = tuple(IndexStrategy.all_strategies()) + def test_all_index_strategies(self, mock_elastic_clients, expected_strategy_classes): + all_strategys = tuple(all_index_strategies().values()) assert len(all_strategys) == len(expected_strategy_classes) strategy_names = {index_strategy.name for index_strategy in all_strategys} assert strategy_names == set(expected_strategy_classes.keys()) @@ -34,36 +42,29 @@ def test_all_strategies(self, mock_elastic_clients, expected_strategy_classes): assert issubclass(index_strategy.SpecificIndex, IndexStrategy.SpecificIndex) assert index_strategy.SpecificIndex is not IndexStrategy.SpecificIndex - def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes, fake_elastic_strategies): + def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes): for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - indexname_prefix = IndexStrategy.get_by_name(strategy_name).indexname_prefix + indexname_prefix = get_index_strategy(strategy_name).indexname_prefix specific_indexname = ''.join((indexname_prefix, 'foo')) - specific_index = IndexStrategy.get_specific_index(specific_indexname) + specific_index = get_specific_index(specific_indexname) assert isinstance(specific_index.index_strategy, expected_strategy_class) assert isinstance(specific_index, expected_strategy_class.SpecificIndex) assert specific_index.indexname == specific_indexname bad_indexname = 'foo_foo' # assumed to not start with index prefix with pytest.raises(IndexStrategyError): - IndexStrategy.get_specific_index(bad_indexname) + get_specific_index(bad_indexname) @pytest.mark.django_db - def test_get_by_request(self, mock_elastic_clients, fake_elastic_strategies): - IndexStrategy.clear_strategy_cache() - for strategy_name in mock_elastic_clients.keys(): - index_strategy = IndexStrategy.get_by_name(strategy_name) + def test_get_by_request(self, mock_elastic_clients): + for strategy_name, index_strategy in all_index_strategies().items(): good_requests = [ strategy_name, index_strategy.current_indexname, 
''.join((index_strategy.indexname_prefix, 'foo')), ] for good_request in good_requests: - specific_index = IndexStrategy.get_for_sharev2_search(good_request) + specific_index = get_index_for_sharev2_search(good_request) assert isinstance(specific_index, index_strategy.SpecificIndex) assert specific_index.index_strategy is index_strategy - # bad calls: - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search('bad-request') - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search() - with pytest.raises(IndexStrategyError): - IndexStrategy.get_for_sharev2_search(requested_name=None) + with pytest.raises(IndexStrategyError): + get_index_for_sharev2_search('bad-request') diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py index be321a710..2cccf2d41 100644 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ b/tests/share/search/index_strategy/test_trove_indexcard_flats.py @@ -1,320 +1,12 @@ -from typing import Iterable, Iterator -from datetime import date -from urllib.parse import urlencode +from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy -from primitive_metadata import primitive_rdf as rdf +from . import _common_trovesearch_tests -from tests import factories -from share.search import messages -from trove import models as trove_db -from trove.trovesearch.search_params import CardsearchParams -from trove.vocab.namespaces import RDFS, TROVE, RDF, DCTERMS, OWL, FOAF -from ._with_real_services import RealElasticTestCase - -BLARG = rdf.IriNamespace('https://blarg.example/blarg/') - - -class TestTroveIndexcardFlats(RealElasticTestCase): +class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): # for RealElasticTestCase - strategy_name_for_real = 'trove_indexcard_flats' - strategy_name_for_test = 'test_trove_indexcard_flats' - - _indexcard_focus_by_uuid: dict[str, str] - - def setUp(self): - super().setUp() - self._indexcard_focus_by_uuid = {} - - def test_for_smoke_without_daemon(self): - _indexcard = self._create_indexcard( - focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, - ) - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id], - ) - self._assert_happypath_without_daemon( - _messages_chunk, - expected_doc_count=1, - ) - - def test_for_smoke_with_daemon(self): - _indexcard = self._create_indexcard( - focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, - ) - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id], - ) - self._assert_happypath_with_daemon( - _messages_chunk, - expected_doc_count=1, - ) - - def test_cardsearch(self): - self._fill_test_data_for_querying() - for _queryparams, _expected_result_iris in self._cardsearch_cases(): - _cardsearch_params = CardsearchParams.from_querystring(urlencode(_queryparams)) - _cardsearch_response = self.current_index.pls_handle_cardsearch(_cardsearch_params) - # assumes all results fit on one page - _actual_result_iris = { - self._indexcard_focus_by_uuid[_result.card_uuid()] - for _result in _cardsearch_response.search_result_page - } - self.assertEqual(_expected_result_iris, _actual_result_iris) - - def _fill_test_data_for_querying(self): - self._index_indexcards([ - self._create_indexcard(BLARG.a, { - BLARG.a: { - RDF.type: {BLARG.Thing}, - 
OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, - DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('aaaa')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, - DCTERMS.references: {BLARG.b, BLARG.c}, - DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.b: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.references: {BLARG.c}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, - }, - }), - self._create_indexcard(BLARG.b, { - BLARG.b: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.b_same}, - DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.references: {BLARG.c}, - DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, - }, - BLARG.someone: { - FOAF.name: {rdf.literal('some one')}, - }, - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.title: {rdf.literal('cccc')}, - }, - }), - self._create_indexcard(BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, - DCTERMS.creator: {BLARG.someone_else}, - DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_bc}, - DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. 
This place is best shunned and left uninhabited.', language='en')}, - }, - BLARG.someone_else: { - FOAF.name: {rdf.literal('some one else')}, - }, - }), - ]) - - def _cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str]]]: - # using data from _fill_test_data_for_querying - yield ( - {'cardSearchFilter[creator]': BLARG.someone}, - {BLARG.a, BLARG.b}, - ) - yield ( - {'cardSearchFilter[creator]': ','.join((BLARG.someone_else, BLARG.someone))}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[resourceType]': BLARG.Thing}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[resourceType]': BLARG.Nothing}, - set(), - ) - yield ( - {'cardSearchFilter[references]': BLARG.b}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references]': BLARG.c}, - {BLARG.a, BLARG.b}, - ) - yield ( - {'cardSearchFilter[references.references]': BLARG.c}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references][is-present]': ''}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references.subject][is-present]': ''}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[references.references][is-absent]': ''}, - {BLARG.c, BLARG.b}, - ) - yield ( - {'cardSearchFilter[references.references.subject][is-absent]': ''}, - {BLARG.c, BLARG.b}, - ) - yield ( - {'cardSearchFilter[subject]': BLARG.subj_ac}, - {BLARG.c, BLARG.a}, - ) - yield ( - {'cardSearchFilter[subject][none-of]': BLARG.subj_ac}, - {BLARG.b}, - ) - yield ( - { - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchFilter[creator]': BLARG.someone, - }, - {BLARG.b}, - ) - yield ( - { - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*]': 'cccc', - }, - {BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[resourceType]': ','.join((BLARG.Thing, BLARG.Another, BLARG.Nothing)), - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*,creator.name]': 'else', - }, - {BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[resourceType]': BLARG.Nothing, - 'cardSearchFilter[subject]': BLARG.subj_bc, - 'cardSearchText[*,creator.name]': 'else', - }, - set(), - ) - yield ( - {'cardSearchText[*,creator.name]': 'some'}, - {BLARG.a, BLARG.b, BLARG.c}, - ) - yield ( - { - 'cardSearchFilter[dateCreated]': '1999', - 'cardSearchText[*]': '', - }, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[dateCreated]': '1999-12'}, - {BLARG.a}, - ) - yield ( - {'cardSearchFilter[dateCreated]': '1999-11'}, - set(), - ) - yield ( - {'cardSearchFilter[dateCreated]': '2012-12-31'}, - {BLARG.b}, - ) - yield ( - {'cardSearchFilter[dateCreated][after]': '2030'}, - set(), - ) - yield ( - {'cardSearchFilter[dateCreated][after]': '2011'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchFilter[dateCreated][before]': '2012-12'}, - {BLARG.a}, - ) - yield ( - {'cardSearchText': 'bbbb'}, - {BLARG.b}, - ) - yield ( - {'cardSearchText': '-bbbb'}, - {BLARG.a, BLARG.c}, - ) - yield ( - {'cardSearchText': 'danger'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchText': 'dangre'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchText': '"dangre"'}, - set(), - ) - yield ( - {'cardSearchText': 'danger -repulsive'}, - {BLARG.c}, - ) - yield ( - {'cardSearchText': '"nothing valued is here"'}, - {BLARG.a}, - ) - yield ( - {'cardSearchText': '"nothing valued here"'}, - set(), - ) - yield ( - {'cardSearchText': '"what is here"'}, - {BLARG.b}, - ) - - def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id for _indexcard 
in indexcards], -        ) -        self.assertTrue(all( -            _response.is_done -            for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) -        )) -        self.current_index.pls_refresh() +    def get_index_strategy(self): +        return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') -    def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: -        _suid = factories.SourceUniqueIdentifierFactory() -        _raw = factories.RawDatumFactory( -            suid=_suid, -        ) -        _indexcard = trove_db.Indexcard.objects.create( -            source_record_suid=_suid, -        ) -        # an osfmap_json card is required for indexing, but not used in these tests -        trove_db.DerivedIndexcard.objects.create( -            upriver_indexcard=_indexcard, -            deriver_identifier=trove_db.ResourceIdentifier.objects.get_or_create_for_iri(TROVE['derive/osfmap_json']), -        ) -        trove_db.LatestIndexcardRdf.objects.create( -            from_raw_datum=_raw, -            indexcard=_indexcard, -            focus_iri=focus_iri, -            rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), -            turtle_checksum_iri='foo',  # not enforced -        ) -        self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri -        return _indexcard +    def cardsearch_integer_cases(self): +        yield from ()  # integers not indexed by this strategy diff --git a/tests/share/search/index_strategy/test_trovesearch_denorm.py b/tests/share/search/index_strategy/test_trovesearch_denorm.py new file mode 100644 index 000000000..60a0e9771 --- /dev/null +++ b/tests/share/search/index_strategy/test_trovesearch_denorm.py @@ -0,0 +1,9 @@ +from share.search.index_strategy.trovesearch_denorm import TrovesearchDenormIndexStrategy + +from . import _common_trovesearch_tests + + +class TestTrovesearchDenorm(_common_trovesearch_tests.CommonTrovesearchTests): +    # for RealElasticTestCase +    def get_index_strategy(self): +        return TrovesearchDenormIndexStrategy('test_trovesearch_denorm')
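The two trovesearch test modules above share one pattern: subclass the shared CommonTrovesearchTests, override get_index_strategy to return the strategy under test, and override opt-out hooks like cardsearch_integer_cases where a strategy skips some cases. A minimal sketch of wiring up a hypothetical future strategy follows; MyNewIndexStrategy, its module path, and the test strategy name are assumed names for illustration, not part of this changeset:

    # hypothetical module path and strategy class, for illustration only
    from share.search.index_strategy.my_new_strategy import MyNewIndexStrategy

    from . import _common_trovesearch_tests


    class TestMyNewStrategy(_common_trovesearch_tests.CommonTrovesearchTests):
        def get_index_strategy(self):
            # pass a 'test_'-prefixed strategy name, mirroring the other test modules
            return MyNewIndexStrategy('test_my_new_strategy')

        def cardsearch_integer_cases(self):
            yield from ()  # opt out if this strategy does not index integer values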
diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index 2460f5ccc..6a1ee9a03 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -4,18 +4,18 @@ import pytest from share.models import ShareUser -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy @pytest.mark.django_db -def test_admin_search_indexes_view(fake_elastic_strategies, mock_elastic_clients): +def test_admin_search_indexes_view(mock_elastic_clients): credentials = {'username': 'test-test-test', 'password': 'password-password'} ShareUser.objects.create_superuser(**credentials) client = Client() client.login(**credentials) with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): resp = client.get('/admin/search-indexes') -    for strategy_name in fake_elastic_strategies: -        index_strategy = IndexStrategy.get_by_name(strategy_name) -        expected_header = f'current index: {index_strategy.current_indexname}
' + for strategy_name in index_strategy.all_index_strategies(): + _index_strategy = index_strategy.get_index_strategy(strategy_name) + expected_header = f'
current index: {_index_strategy.current_indexname}
' assert expected_header.encode() in resp.content diff --git a/tests/trove/trovesearch/test_page_cursor.py b/tests/trove/trovesearch/test_page_cursor.py new file mode 100644 index 000000000..5b9027c9a --- /dev/null +++ b/tests/trove/trovesearch/test_page_cursor.py @@ -0,0 +1,25 @@ +from unittest import TestCase + + +from trove.trovesearch.page_cursor import ( + PageCursor, + OffsetCursor, + ReproduciblyRandomSampleCursor, +) + + +class TestPageCursor(TestCase): + def test_queryparam_round_trip(self): + for _original_cursor in ( + PageCursor(page_size=7), + OffsetCursor(page_size=11), + OffsetCursor(page_size=11, start_offset=22), + ReproduciblyRandomSampleCursor(page_size=13), + ReproduciblyRandomSampleCursor(page_size=3, first_page_ids=['a', 'b', 'c']), + ): + _qp_value = _original_cursor.as_queryparam_value() + self.assertIsInstance(_qp_value, str) + self.assertNotEqual(_qp_value, '') + _cursor_from_qp = PageCursor.from_queryparam_value(_qp_value) + self.assertIsInstance(_cursor_from_qp, type(_original_cursor)) + self.assertEqual(_cursor_from_qp, _original_cursor) diff --git a/tests/trove/test_search_params.py b/tests/trove/trovesearch/test_search_params.py similarity index 100% rename from tests/trove/test_search_params.py rename to tests/trove/trovesearch/test_search_params.py diff --git a/trove/exceptions.py b/trove/exceptions.py index 516f6c200..7935c0511 100644 --- a/trove/exceptions.py +++ b/trove/exceptions.py @@ -58,6 +58,10 @@ class InvalidSearchText(InvalidQueryParamValue): pass +class InvalidPageCursorValue(InvalidQueryParamValue): + pass + + class MissingRequiredQueryParam(RequestParsingError): pass @@ -70,6 +74,14 @@ class InvalidPropertyPath(RequestParsingError): pass +class InvalidQueryParams(RequestParsingError): + pass + + +class InvalidSort(RequestParsingError): + pass + + ### # rendering a response diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py index 5d4ca9441..21005d1e9 100644 --- a/trove/models/indexcard.py +++ b/trove/models/indexcard.py @@ -7,7 +7,6 @@ from primitive_metadata import primitive_rdf as rdf from share import models as share_db # TODO: break this dependency -from share.search.index_messenger import IndexMessenger from share.util.checksum_iri import ChecksumIri from trove.exceptions import DigestiveError from trove.models.resource_identifier import ResourceIdentifier @@ -46,6 +45,7 @@ def save_indexcards_from_tripledicts( .filter(id__in=_seen_focus_identifier_ids.intersection(_focus_identifier_ids)) ) raise DigestiveError(f'duplicate focus iris: {list(_duplicates)}') + _seen_focus_identifier_ids.update(_focus_identifier_ids) _indexcards.append(_indexcard) # cards seen previously on this suid (but not this time) treated as deleted for _indexcard_to_delete in ( @@ -220,6 +220,8 @@ def pls_delete(self): .filter(upriver_indexcard=self) .delete() ) + # TODO: rearrange to avoid local import + from share.search.index_messenger import IndexMessenger IndexMessenger().notify_indexcard_update([self]) def __repr__(self): diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py new file mode 100644 index 000000000..0428b78d5 --- /dev/null +++ b/trove/trovesearch/page_cursor.py @@ -0,0 +1,124 @@ +from __future__ import annotations +import base64 +import dataclasses +import enum +import json +import typing + +from trove.exceptions import InvalidPageCursorValue + + +__all__ = ('PageCursor', 'OffsetCursor', 'ReproduciblyRandomSampleCursor') + + +MANY_MORE = -1 +MAX_OFFSET = 9997 + + +@dataclasses.dataclass +class PageCursor: + 
page_size: int + total_count: int = MANY_MORE + + @classmethod + def from_queryparam_value(cls, cursor_value: str) -> typing.Self: + try: + (_type_key, *_args) = json.loads(base64.urlsafe_b64decode(cursor_value)) + _cls = _PageCursorTypes[_type_key].value + assert issubclass(_cls, cls) + return _cls(*_args) + except Exception: + raise InvalidPageCursorValue(cursor_value) + + @classmethod + def from_cursor(cls, other_cursor: PageCursor) -> typing.Self: + if isinstance(other_cursor, cls): + return dataclasses.replace(other_cursor) # simple copy + return cls(*dataclasses.astuple(other_cursor)) + + def as_queryparam_value(self) -> str: + _cls_key = _PageCursorTypes(type(self)).name + _as_json = json.dumps([_cls_key, *dataclasses.astuple(self)]) + _cursor_bytes = base64.urlsafe_b64encode(_as_json.encode()) + return _cursor_bytes.decode() + + def is_basic(self) -> bool: + return type(self) is PageCursor + + def is_valid(self) -> bool: + return self.page_size > 0 and ( + self.total_count == MANY_MORE or self.total_count >= 0 + ) + + def has_many_more(self) -> bool: + return self.total_count == MANY_MORE + + def next_cursor(self) -> typing.Self | None: + return None + + def prev_cursor(self) -> typing.Self | None: + return None + + def first_cursor(self) -> typing.Self | None: + return None + + +@dataclasses.dataclass +class OffsetCursor(PageCursor): + # page_size: int (from PageCursor) + # total_count: int (from PageCursor) + start_offset: int = 0 + + def is_valid(self) -> bool: + return ( + super().is_valid() + and 0 <= self.start_offset <= MAX_OFFSET + and ( + self.total_count == MANY_MORE + or self.start_offset < self.total_count + ) + ) + + def is_first_page(self) -> bool: + return self.start_offset == 0 + + def next_cursor(self): + _next = dataclasses.replace(self, start_offset=(self.start_offset + self.page_size)) + return (_next if _next.is_valid() else None) + + def prev_cursor(self): + _prev = dataclasses.replace(self, start_offset=(self.start_offset - self.page_size)) + return (_prev if _prev.is_valid() else None) + + def first_cursor(self): + _first = dataclasses.replace(self, start_offset=0) + return (_first if _first.is_valid() else None) + + +@dataclasses.dataclass +class ReproduciblyRandomSampleCursor(OffsetCursor): + # page_size: int (from PageCursor) + # total_count: int (from PageCursor) + # start_offset: int (from OffsetCursor) + first_page_ids: list[str] = dataclasses.field(default_factory=list) + + def next_cursor(self): + return ( + super().next_cursor() + if self.first_page_ids + else None + ) + + def prev_cursor(self): + return ( + super().prev_cursor() + if self.first_page_ids + else None + ) + + +class _PageCursorTypes(enum.Enum): + '''registry of cursor types into which cursor values can be deserialized''' + PC = PageCursor + OC = OffsetCursor + RRSC = ReproduciblyRandomSampleCursor diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 14d3a6673..67469e80f 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -1,3 +1,4 @@ +from __future__ import annotations import collections import dataclasses import enum @@ -10,6 +11,7 @@ from primitive_metadata import primitive_rdf from trove import exceptions as trove_exceptions +from trove.trovesearch.page_cursor import PageCursor from trove.util.queryparams import ( QueryparamDict, QueryparamName, @@ -29,6 +31,11 @@ logger = logging.getLogger(__name__) +### +# type aliases +Propertypath = tuple[str, ...] 
+PropertypathSet = frozenset[Propertypath] + ### # constants for use in query param parsing @@ -49,9 +56,28 @@ # special path-step that matches any property GLOB_PATHSTEP = '*' -ONE_GLOB_PROPERTYPATH = (GLOB_PATHSTEP,) -DEFAULT_PROPERTYPATH_SET = frozenset([ONE_GLOB_PROPERTYPATH]) +ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) +DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) + +class ValueType(enum.Enum): + # note: enum values are iris + IRI = TROVE['value-type/iri'] + DATE = TROVE['value-type/date'] + INTEGER = TROVE['value-type/integer'] + + @classmethod + def from_shortname(cls, shortname): + _iri = trove_shorthand().expand_iri(shortname) + return cls(_iri) + + @classmethod + def shortnames(cls): + for _value_type in cls: + yield _value_type.to_shortname() + + def to_shortname(self) -> str: + return trove_shorthand().compact_iri(self.value) ### # dataclasses for parsed search-api query parameters @@ -60,15 +86,15 @@ @dataclasses.dataclass(frozen=True) class BaseTroveParams: iri_shorthand: primitive_rdf.IriShorthand = dataclasses.field(repr=False) - include: frozenset[tuple[str, ...]] + include: PropertypathSet accept_mediatype: str | None @classmethod - def from_querystring(cls, querystring: str) -> 'BaseTroveParams': # TODO py3.11: typing.Self + def from_querystring(cls, querystring: str) -> typing.Self: return cls.from_queryparams(queryparams_from_querystring(querystring)) @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> 'BaseTroveParams': + def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self: return cls(**cls.parse_queryparams(queryparams)) @classmethod @@ -115,7 +141,7 @@ class Textsegment: is_fuzzy: bool = True is_negated: bool = False is_openended: bool = False - propertypath_set: frozenset[tuple[str, ...]] = DEFAULT_PROPERTYPATH_SET + propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET def __post_init__(self): if self.is_negated and self.is_fuzzy: @@ -282,7 +308,7 @@ def is_valueless_operator(self): operator: FilterOperator value_set: frozenset[str] - propertypath_set: frozenset[tuple[str, ...]] = DEFAULT_PROPERTYPATH_SET + propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET @classmethod def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): @@ -373,57 +399,68 @@ def as_queryparam(self, queryparam_family: str): @dataclasses.dataclass(frozen=True) class SortParam: - property_iri: str - descending: bool = False + value_type: ValueType + propertypath: Propertypath + descending: bool @classmethod - def sortlist_as_queryparam_value(cls, sort_params): - return join_queryparam_value( - _sort.as_queryparam_value() - for _sort in sort_params - ) + def from_sort_queryparams(cls, queryparams: QueryparamDict) -> tuple[SortParam, ...]: + return tuple(filter(None, ( + cls._from_sort_queryparam(_param_name, _param_value) + for (_param_name, _param_value) + in queryparams.get('sort', ()) + ))) @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> tuple['SortParam', ...]: - _paramvalue = _get_single_value(queryparams, QueryparamName('sort')) - if not _paramvalue or _paramvalue == '-relevance': - return () - return tuple(cls._from_sort_param_str(_paramvalue)) + def _from_sort_queryparam( + cls, + param_name: QueryparamName, + param_value: str, + ) -> SortParam | None: + if not param_value or param_value == '-relevance': + return None + _value_type = ValueType.DATE # default + if param_name.bracketed_names: + try: # "sort[]" + (_value_type_str,) = 
param_name.bracketed_names + if _value_type_str: + _value_type = ValueType.from_shortname(_value_type_str) + if _value_type not in (ValueType.DATE, ValueType.INTEGER): + raise ValueError + except ValueError: + raise trove_exceptions.InvalidQueryParamName(str(param_name), ( + 'valid sort param names: sort,' + f' sort[{ValueType.DATE.to_shortname()}],' + f' sort[{ValueType.INTEGER.to_shortname()}],' + )) + _descending = param_value.startswith(DESCENDING_SORT_PREFIX) + _rawpath = param_value.lstrip(DESCENDING_SORT_PREFIX) + _path = _parse_propertypath(_rawpath, allow_globs=False) + return cls( + value_type=_value_type, + propertypath=_path, + descending=_descending, + ) - @classmethod - def _from_sort_param_str(cls, param_value: str) -> typing.Iterable['SortParam']: - for _sort in split_queryparam_value(param_value): - _sort_property = _sort.lstrip(DESCENDING_SORT_PREFIX) - _property_iri = osfmap_shorthand().expand_iri(_sort_property) - if not is_date_property(_property_iri): - raise trove_exceptions.InvalidQueryParamValue('sort', _sort_property, "may not sort on non-date properties") - yield cls( - property_iri=_property_iri, - descending=param_value.startswith(DESCENDING_SORT_PREFIX), + def __post_init__(self): + if ( + self.value_type == ValueType.DATE + and not is_date_path(self.propertypath) + ): + raise trove_exceptions.InvalidSort( + '='.join(self.as_queryparam()), + 'may not sort by date on a path leading to a non-date property', ) - def as_queryparam_value(self): - _key = propertypath_key((self.property_iri,)) - if self.descending: - return f'-{_key}' - return _key - - -@dataclasses.dataclass(frozen=True) -class PageParam: - cursor: str | None # intentionally opaque; for IndexStrategy to generate/interpret - size: int | None = None # size is None iff cursor is not None - - @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> 'PageParam': - _cursor = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) - if _cursor: - return cls(cursor=_cursor) - _size = int( # TODO: 400 response on non-int value - _get_single_value(queryparams, QueryparamName('page', ('size',))) - or DEFAULT_PAGE_SIZE + def as_queryparam(self) -> tuple[str, str]: + _name = ( + 'sort' + if (self.value_type == ValueType.DATE) + else f'sort[{self.value_type.to_shortname()}]' ) - return cls(size=min(_size, MAX_PAGE_SIZE), cursor=None) + _pathkey = propertypath_key(self.propertypath) + _value = (f'-{_pathkey}' if self.descending else _pathkey) + return (_name, _value) @dataclasses.dataclass(frozen=True) @@ -432,8 +469,8 @@ class CardsearchParams(BaseTroveParams): cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None sort_list: tuple[SortParam] - page: PageParam - related_property_paths: tuple[tuple[str, ...]] + page_cursor: PageCursor + related_property_paths: tuple[Propertypath, ...] 
unnamed_iri_values: frozenset[str] @classmethod @@ -444,8 +481,8 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, 'index_strategy_name': _get_single_value(queryparams, QueryparamName('indexStrategy')), - 'sort_list': SortParam.from_queryparams(queryparams), - 'page': PageParam.from_queryparams(queryparams), + 'sort_list': SortParam.from_sort_queryparams(queryparams), + 'page_cursor': _get_page_cursor(queryparams), 'include': None, # TODO 'related_property_paths': _get_related_property_paths(_filter_set), 'unnamed_iri_values': frozenset(), # TODO: frozenset(_get_unnamed_iri_values(_filter_set)), @@ -455,12 +492,13 @@ def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('cardSearchText', self.cardsearch_textsegment_set): _querydict[_qp_name] = _qp_value - if self.sort_list: - _querydict['sort'] = SortParam.sortlist_as_queryparam_value(self.sort_list) - if self.page.cursor: - _querydict['page[cursor]'] = self.page.cursor - elif self.page.size != DEFAULT_PAGE_SIZE: - _querydict['page[size]'] = self.page.size + for _sort in self.sort_list: + _qp_name, _qp_value = _sort.as_queryparam() + _querydict.appendlist(_qp_name, _qp_value) + if not self.page_cursor.is_basic(): + _querydict['page[cursor]'] = self.page_cursor.as_queryparam_value() + elif self.page_cursor.page_size != DEFAULT_PAGE_SIZE: + _querydict['page[size]'] = self.page_cursor.page_size for _filter in self.cardsearch_filter_set: _qp_name, _qp_value = _filter.as_queryparam('cardSearchFilter') _querydict.appendlist(_qp_name, _qp_value) @@ -473,7 +511,7 @@ def to_querydict(self) -> QueryDict: class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch - valuesearch_propertypath_set: frozenset[tuple[str, ...]] + valuesearch_propertypath: Propertypath valuesearch_textsegment_set: frozenset[Textsegment] valuesearch_filter_set: frozenset[SearchFilter] @@ -485,14 +523,26 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { **super().parse_queryparams(queryparams), - 'valuesearch_propertypath_set': _parse_propertypath_set(_raw_propertypath, allow_globs=False), + 'valuesearch_propertypath': _parse_propertypath(_raw_propertypath, allow_globs=False), 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } + def __post_init__(self): + if is_date_property(self.valuesearch_propertypath[-1]): + # date-value limitations + if self.valuesearch_textsegment_set: + raise trove_exceptions.InvalidQueryParams( + 'valueSearchText may not be used with valueSearchPropertyPath leading to a "date" property', + ) + if self.valuesearch_filter_set: + raise trove_exceptions.InvalidQueryParams( + 'valueSearchFilter may not be used with valueSearchPropertyPath leading to a "date" property', + ) + def to_querydict(self): _querydict = super().to_querydict() - _querydict['valueSearchPropertyPath'] = propertypath_set_key(self.valuesearch_propertypath_set) + _querydict['valueSearchPropertyPath'] = propertypath_key(self.valuesearch_propertypath) for _qp_name, _qp_value in 
Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: @@ -512,7 +562,19 @@ def valuesearch_type_iris(self): ### -# local helpers +# helper functions + +def is_globpath(path: Propertypath) -> bool: + return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) + + +def make_globpath(length: int) -> Propertypath: + return ONE_GLOB_PROPERTYPATH * length + + +def is_date_path(path: Propertypath) -> bool: + return bool(path) and is_date_property(path[-1]) + def propertypathstep_key(pathstep: str) -> str: if pathstep == GLOB_PATHSTEP: @@ -521,14 +583,14 @@ def propertypathstep_key(pathstep: str) -> str: return urllib.parse.quote(osfmap_shorthand().compact_iri(pathstep)) -def propertypath_key(property_path: tuple[str, ...]) -> str: +def propertypath_key(property_path: Propertypath) -> str: return PROPERTYPATH_DELIMITER.join( propertypathstep_key(_pathstep) for _pathstep in property_path ) -def propertypath_set_key(propertypath_set: frozenset[tuple[str, ...]]) -> str: +def propertypath_set_key(propertypath_set: PropertypathSet) -> str: return join_queryparam_value( propertypath_key(_propertypath) for _propertypath in propertypath_set @@ -565,7 +627,7 @@ def _get_single_value( return _singlevalue -def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> frozenset[tuple[str, ...]]: +def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> PropertypathSet: # comma-delimited set of dot-delimited paths return frozenset( _parse_propertypath(_path, allow_globs=allow_globs) @@ -573,7 +635,7 @@ def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> fr ) -def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str, ...]: +def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> Propertypath: _path = tuple( osfmap_shorthand().expand_iri(_pathstep) for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) @@ -589,7 +651,7 @@ def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str, return _path -def _get_related_property_paths(filter_set) -> tuple[tuple[str, ...], ...]: +def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]: # hard-coded for osf.io search pages, static list per type # TODO: replace with some dynamism, maybe a 'significant_terms' aggregation _type_iris = set() @@ -605,3 +667,17 @@ def _get_unnamed_iri_values(filter_set) -> typing.Iterable[str]: for _iri in _filter.value_set: if _iri not in OSFMAP_THESAURUS: yield _iri + + +def _get_page_cursor(queryparams: QueryparamDict) -> PageCursor: + _cursor_value = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) + if _cursor_value: + return PageCursor.from_queryparam_value(_cursor_value) + try: + _size = int( # TODO: 400 response on non-int value + _get_single_value(queryparams, QueryparamName('page', ('size',))) + or DEFAULT_PAGE_SIZE + ) + except ValueError: + raise trove_exceptions.InvalidQueryParamValue('page[size]') + return PageCursor(page_size=min(_size, MAX_PAGE_SIZE)) diff --git a/trove/trovesearch/search_response.py b/trove/trovesearch/search_response.py index bd7a25f28..19bbdfe6c 100644 --- a/trove/trovesearch/search_response.py +++ b/trove/trovesearch/search_response.py @@ -3,6 +3,11 @@ from primitive_metadata import primitive_rdf +from trove.trovesearch.page_cursor import ( + PageCursor, + ReproduciblyRandomSampleCursor, +) +from trove.trovesearch.search_params 
import CardsearchParams from trove.vocab.namespaces import TROVE from trove.vocab.trove import trove_indexcard_namespace @@ -24,9 +29,11 @@ class TextMatchEvidence: @dataclasses.dataclass class CardsearchResult: - text_match_evidence: Iterable[TextMatchEvidence] + text_match_evidence: list[TextMatchEvidence] card_iri: str + card_pk: str = '' + @property def card_uuid(self): # card iri has the uuid at the end return primitive_rdf.iri_minus_namespace( @@ -34,21 +41,14 @@ def card_uuid(self): namespace=trove_indexcard_namespace(), ) - -@dataclasses.dataclass -class CardsearchResponse: - total_result_count: BoundedCount - search_result_page: Iterable[CardsearchResult] - next_page_cursor: Optional[str] - prev_page_cursor: Optional[str] - first_page_cursor: Optional[str] - filtervalue_info: Iterable['ValuesearchResult'] - related_propertypath_results: Iterable['PropertypathUsage'] + @property + def card_id(self): + return self.card_pk or self.card_uuid @dataclasses.dataclass class PropertypathUsage: - property_path: tuple[str] + property_path: tuple[str, ...] usage_count: int @@ -69,10 +69,56 @@ def __post_init__(self): ) +### +# paged responses + +@dataclasses.dataclass +class PagedResponse: + cursor: PageCursor + + @property + def total_result_count(self) -> BoundedCount: + return ( + TROVE['ten-thousands-and-more'] + if (self.cursor is None) or self.cursor.has_many_more() + else self.cursor.total_count + ) + + @dataclasses.dataclass -class ValuesearchResponse: +class CardsearchResponse(PagedResponse): + search_result_page: list[CardsearchResult] + related_propertypath_results: list['PropertypathUsage'] + cardsearch_params: CardsearchParams + + def __post_init__(self): + _cursor = self.cursor + if ( + isinstance(_cursor, ReproduciblyRandomSampleCursor) + and _cursor.is_first_page() + ): + if _cursor.first_page_ids: + # revisiting first page; reproduce original random order + _ordering_by_id = { + _id: _i + for (_i, _id) in enumerate(_cursor.first_page_ids) + } + self.search_result_page.sort(key=lambda _r: _ordering_by_id[_r.card_id]) + elif not _cursor.has_many_more(): + _cursor.first_page_ids = [_result.card_id for _result in self.search_result_page] + + +@dataclasses.dataclass +class ValuesearchResponse(PagedResponse): search_result_page: Iterable[ValuesearchResult] - total_result_count: Optional[int] = None - next_page_cursor: Optional[str] = None - prev_page_cursor: Optional[str] = None - first_page_cursor: Optional[str] = None + + +### +# local helpers + +def _cursor_value(cursor: PageCursor | None) -> str: + return ( + cursor.as_queryparam_value() + if cursor is not None and cursor.is_valid() + else '' + ) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 31bf85d48..f91969b50 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -15,10 +15,10 @@ from trove import models as trove_db from trove import exceptions as trove_exceptions from trove.derive.osfmap_json import _RdfOsfmapJsonldRenderer +from trove.trovesearch.page_cursor import PageCursor from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, - PageParam, propertypath_key, propertypath_set_key, ) @@ -79,13 +79,13 @@ @trovesearch_by_indexstrategy.gatherer(TROVE.propertyPath, focustype_iris={TROVE.Valuesearch}) def gather_valuesearch_propertypath(focus, *, search_params, **kwargs): - yield from _multi_propertypath_twoples(search_params.valuesearch_propertypath_set) + yield from 
_single_propertypath_twoples(search_params.valuesearch_propertypath) @trovesearch_by_indexstrategy.gatherer(TROVE.valueSearchFilter) def gather_valuesearch_filter(focus, *, search_params, **kwargs): for _filter in search_params.valuesearch_filter_set: - yield (TROVE.valueSearchFilter, _filter_as_blanknode(_filter, {})) + yield (TROVE.valueSearchFilter, _filter_as_blanknode(_filter)) @trovesearch_by_indexstrategy.gatherer( @@ -131,13 +131,9 @@ def gather_cardsearch(focus, *, specific_index, search_params, **kwargs): ] if _relatedproperty_list: yield (TROVE.relatedPropertyList, sequence(_relatedproperty_list)) - # filter-values from search params, with any additional info - _valueinfo_by_iri = {} - for _filtervalue in _cardsearch_resp.filtervalue_info: - _value_info = _valuesearch_result_as_json(_filtervalue) - _valueinfo_by_iri[_filtervalue.value_iri] = _value_info + # filter-values from search params for _filter in search_params.cardsearch_filter_set: - yield (TROVE.cardSearchFilter, _filter_as_blanknode(_filter, _valueinfo_by_iri)) + yield (TROVE.cardSearchFilter, _filter_as_blanknode(_filter)) @trovesearch_by_indexstrategy.gatherer( @@ -261,7 +257,7 @@ def gather_card(focus, *, deriver_iri, **kwargs): ### # local helpers -def _filter_as_blanknode(search_filter, valueinfo_by_iri) -> frozenset: +def _filter_as_blanknode(search_filter) -> frozenset: _filter_twoples = [ (TROVE.filterType, search_filter.operator.value), *_multi_propertypath_twoples(search_filter.propertypath_set), @@ -269,10 +265,7 @@ def _filter_as_blanknode(search_filter, valueinfo_by_iri) -> frozenset: if not search_filter.operator.is_valueless_operator(): for _value in search_filter.value_set: if search_filter.operator.is_iri_operator(): - _valueinfo = ( - valueinfo_by_iri.get(_value) - or _osfmap_or_unknown_iri_as_json(_value) - ) + _valueinfo = _osfmap_or_unknown_iri_as_json(_value) else: _valueinfo = literal_json({'@value': _value}) _filter_twoples.append((TROVE.filterValue, _valueinfo)) @@ -369,27 +362,24 @@ def _related_property_result(property_path: tuple[str, ...], count: int): def _search_page_links(search_focus, search_params, search_response): _search_iri_split = urllib.parse.urlsplit(next(iter(search_focus.iris))) - def _iri_with_page_param(page_param: PageParam): + def _iri_with_cursor(page_cursor: PageCursor): return urllib.parse.urlunsplit(( _search_iri_split.scheme, _search_iri_split.netloc, _search_iri_split.path, - dataclasses.replace(search_params, page=page_param).to_querystring(), + dataclasses.replace(search_params, page_cursor=page_cursor).to_querystring(), _search_iri_split.fragment, )) - if search_response.first_page_cursor: - yield (TROVE.searchResultPage, _jsonapi_link('first', _iri_with_page_param( - PageParam(cursor=search_response.first_page_cursor), - ))) - if search_response.next_page_cursor: - yield (TROVE.searchResultPage, _jsonapi_link('next', _iri_with_page_param( - PageParam(cursor=search_response.next_page_cursor), - ))) - if search_response.prev_page_cursor: - yield (TROVE.searchResultPage, _jsonapi_link('prev', _iri_with_page_param( - PageParam(cursor=search_response.prev_page_cursor), - ))) + _next = search_response.cursor.next_cursor() + if _next is not None and _next.is_valid(): + yield (TROVE.searchResultPage, _jsonapi_link('next', _iri_with_cursor(_next))) + _prev = search_response.cursor.prev_cursor() + if _prev is not None and _prev.is_valid(): + yield (TROVE.searchResultPage, _jsonapi_link('prev', _iri_with_cursor(_prev))) + _first = search_response.cursor.first_cursor() + if 
_first is not None and _first.is_valid(): + yield (TROVE.searchResultPage, _jsonapi_link('first', _iri_with_cursor(_first))) def _jsonapi_link(membername, iri): diff --git a/trove/views/search.py b/trove/views/search.py index c303eb5b0..4173fd5e7 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -4,7 +4,7 @@ from django.views import View from primitive_metadata import gather -from share.search.index_strategy import IndexStrategy +from share.search import index_strategy from trove import exceptions as trove_exceptions from trove.trovesearch.search_params import ( CardsearchParams, @@ -79,7 +79,7 @@ def _parse_request(request: http.HttpRequest, renderer, search_params_dataclass) _search_params = search_params_dataclass.from_querystring( request.META['QUERY_STRING'], ) - _specific_index = IndexStrategy.get_for_trove_search(_search_params.index_strategy_name) + _specific_index = index_strategy.get_index_for_trovesearch(_search_params) # TODO: 404 for unknown strategy _search_gathering = trovesearch_by_indexstrategy.new_gathering({ 'search_params': _search_params, diff --git a/trove/vocab/namespaces.py b/trove/vocab/namespaces.py index 9402fd26c..73c7438b2 100644 --- a/trove/vocab/namespaces.py +++ b/trove/vocab/namespaces.py @@ -10,6 +10,7 @@ DCAT, PROV, SKOS, + XSD, DEFAULT_SHORTHAND, ) @@ -30,6 +31,7 @@ 'SHAREv2', 'SKOS', 'TROVE', + 'XSD', 'NAMESPACES_SHORTHAND', ) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 9efa03a32..0e8bd9da2 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -235,7 +235,7 @@ def trove_browse_link(iri: str): TROVE.valueSearchFilter, TROVE.pageSize, TROVE.pageCursor, - TROVE.sort, + # TROVE.sort, # TROVE.include, }, RDFS.label: {literal('index-value-search', language='en')}, @@ -656,17 +656,21 @@ def trove_browse_link(iri: str): RDFS.comment: {literal('how to order search results', language='en')}, TROVE.jsonSchema: {literal_json({'type': 'string'})}, DCTERMS.description: {_literal_markdown(f'''**sort** is -a query param to control ordering of search results +a query param to control ordering of search results based on values of a specific type at a specific path. -accepts a short-hand iri for a date property: +to sort by date values, use `sort` (or `sort[date-value]`) with a **property-path** that ends with +one of the following supported date properties: {", ".join(f"`{osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in DATE_PROPERTIES)} -prefix with `-` to sort descending (latest first), otherwise sorts ascending (earliest first) +to sort by integer values, use `sort[integer-value]` with a **property-path** to the integers of interest. + +by default, sorts "ascending" (beginning with earliest date or smallest integer) -- +prefix the value with `-` to sort "descending" (beginning with latest date or largest integer). -if missing (or if `sort=-relevance`), results are sorted by some notion of +if missing (or with value `-relevance`), results are sorted by some notion of relevance to the request's search-text or (if no search-text) by random. -may not be used with `page[cursor]` +may not be used with `page[cursor]`. 
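+
+for example: `sort=-dateCreated` puts results with the latest `dateCreated` first, while
+`sort[integer-value]=dcat:servesDataset.dcat:spatialResolutionInMeters` orders results by the
+integer values found at that property-path, smallest first.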
''', language='en')}, }, @@ -799,9 +803,17 @@ def trove_browse_link(iri: str): RDF.type: {TROVE.FilterOperator}, JSONAPI_MEMBERNAME: {literal('after', language='en')}, }, -    TROVE['at-date']: { -        RDF.type: {TROVE.FilterOperator}, -        JSONAPI_MEMBERNAME: {literal('at-date', language='en')}, +    TROVE['value-type/iri']: { +        RDF.type: {TROVE.ValueType}, +        JSONAPI_MEMBERNAME: {literal('iri-value', language='en')}, +    }, +    TROVE['value-type/date']: { +        RDF.type: {TROVE.ValueType}, +        JSONAPI_MEMBERNAME: {literal('date-value', language='en')}, +    }, +    TROVE['value-type/integer']: { +        RDF.type: {TROVE.ValueType}, +        JSONAPI_MEMBERNAME: {literal('integer-value', language='en')}, +    }, # other: