From 90b253a38a6add326c68593cf1023b8ef6cafec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=B4mulo=20Penido?= Date: Thu, 18 Apr 2024 13:53:21 -0300 Subject: [PATCH] feat: update Studio search index when course content is updated (#34391) --- openedx/core/djangoapps/content/search/api.py | 511 ++++++++++++++++++ .../docs/decisions/0001-meilisearch.rst | 2 + .../djangoapps/content/search/documents.py | 54 +- .../djangoapps/content/search/handlers.py | 116 +++- .../search/management/commands/meili_mixin.py | 105 ---- .../management/commands/reindex_studio.py | 135 +---- .../core/djangoapps/content/search/tasks.py | 83 +++ .../content/search/tests/test_api.py | 195 +++++++ .../content/search/tests/test_documents.py | 12 +- .../content/search/tests/test_handlers.py | 156 ++++++ .../content/search/tests/test_views.py | 70 ++- .../core/djangoapps/content/search/views.py | 76 +-- 12 files changed, 1157 insertions(+), 358 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/api.py delete mode 100644 openedx/core/djangoapps/content/search/management/commands/meili_mixin.py create mode 100644 openedx/core/djangoapps/content/search/tasks.py create mode 100644 openedx/core/djangoapps/content/search/tests/test_api.py create mode 100644 openedx/core/djangoapps/content/search/tests/test_handlers.py diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py new file mode 100644 index 000000000000..382e8f8d759a --- /dev/null +++ b/openedx/core/djangoapps/content/search/api.py @@ -0,0 +1,511 @@ +""" +Content index and search API using Meilisearch +""" +from __future__ import annotations + +import logging +import time +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from functools import wraps +from typing import Callable, Generator + +from django.conf import settings +from django.contrib.auth import get_user_model +from django.core.cache import cache +from meilisearch import Client as MeilisearchClient +from meilisearch.errors import MeilisearchError +from meilisearch.models.task import TaskInfo +from opaque_keys.edx.keys import UsageKey +from opaque_keys.edx.locator import LibraryLocatorV2 +from common.djangoapps.student.roles import GlobalStaff +from rest_framework.request import Request +from common.djangoapps.student.role_helpers import get_course_roles +from openedx.core.djangoapps.content.search.models import get_access_ids_for_request + +from openedx.core.djangoapps.content_libraries import api as lib_api +from xmodule.modulestore import ModuleStoreEnum +from xmodule.modulestore.django import modulestore + +from .documents import ( + Fields, + meili_id_from_opaque_key, + searchable_doc_for_course_block, + searchable_doc_for_library_block, + searchable_doc_tags +) + +log = logging.getLogger(__name__) + +User = get_user_model() + +STUDIO_INDEX_SUFFIX = "studio_content" + +if hasattr(settings, "MEILISEARCH_INDEX_PREFIX"): + STUDIO_INDEX_NAME = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_SUFFIX +else: + STUDIO_INDEX_NAME = STUDIO_INDEX_SUFFIX + + +_MEILI_CLIENT = None +_MEILI_API_KEY_UID = None + +LOCK_EXPIRE = 24 * 60 * 60 # Lock expires in 24 hours + +MAX_ACCESS_IDS_IN_FILTER = 1_000 +MAX_ORGS_IN_FILTER = 1_000 + + +@contextmanager +def _index_rebuild_lock() -> Generator[str, None, None]: + """ + Lock to prevent that more than one rebuild is running at the same time + """ + lock_id = f"lock-meilisearch-index-{STUDIO_INDEX_NAME}" + new_index_name = STUDIO_INDEX_NAME + "_new" + + status = cache.add(lock_id, new_index_name, LOCK_EXPIRE) + + if not status: + # Lock already acquired + raise RuntimeError("Rebuild already in progress") + + # Lock acquired + try: + yield new_index_name + finally: + # Release the lock + cache.delete(lock_id) + + +def _get_running_rebuild_index_name() -> str | None: + lock_id = f"lock-meilisearch-index-{STUDIO_INDEX_NAME}" + + return cache.get(lock_id) + + +def _get_meilisearch_client(): + """ + Get the Meiliesearch client + """ + global _MEILI_CLIENT # pylint: disable=global-statement + + # Connect to Meilisearch + if not is_meilisearch_enabled(): + raise RuntimeError("MEILISEARCH_ENABLED is not set - search functionality disabled.") + + if _MEILI_CLIENT is not None: + return _MEILI_CLIENT + + _MEILI_CLIENT = MeilisearchClient(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) + try: + _MEILI_CLIENT.health() + except MeilisearchError as err: + _MEILI_CLIENT = None + raise ConnectionError("Unable to connect to Meilisearch") from err + return _MEILI_CLIENT + + +def clear_meilisearch_client(): + global _MEILI_CLIENT # pylint: disable=global-statement + + _MEILI_CLIENT = None + + +def _get_meili_api_key_uid(): + """ + Helper method to get the UID of the API key we're using for Meilisearch + """ + global _MEILI_API_KEY_UID # pylint: disable=global-statement + + if _MEILI_API_KEY_UID is not None: + return _MEILI_API_KEY_UID + + _MEILI_API_KEY_UID = _get_meilisearch_client().get_key(settings.MEILISEARCH_API_KEY).uid + + +def _wait_for_meili_task(info: TaskInfo) -> None: + """ + Simple helper method to wait for a Meilisearch task to complete + This method will block until the task is completed, so it should only be used in celery tasks + or management commands. + """ + client = _get_meilisearch_client() + current_status = client.get_task(info.task_uid) + while current_status.status in ("enqueued", "processing"): + time.sleep(0.5) + current_status = client.get_task(info.task_uid) + if current_status.status != "succeeded": + try: + err_reason = current_status.error['message'] + except (TypeError, KeyError): + err_reason = "Unknown error" + raise MeilisearchError(err_reason) + + +def _wait_for_meili_tasks(info_list: list[TaskInfo]) -> None: + """ + Simple helper method to wait for multiple Meilisearch tasks to complete + """ + while info_list: + info = info_list.pop() + _wait_for_meili_task(info) + + +def _index_exists(index_name: str) -> bool: + """ + Check if an index exists + """ + client = _get_meilisearch_client() + try: + client.get_index(index_name) + except MeilisearchError as err: + if err.code == "index_not_found": + return False + else: + raise err + return True + + +@contextmanager +def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generator[str, None, None]: + """ + Create a new temporary Meilisearch index, populate it, then swap it to + become the active index. + + Args: + status_cb (Callable): A callback function to report status messages + """ + if status_cb is None: + status_cb = log.info + + client = _get_meilisearch_client() + status_cb("Checking index...") + with _index_rebuild_lock() as temp_index_name: + if _index_exists(temp_index_name): + status_cb("Temporary index already exists. Deleting it...") + _wait_for_meili_task(client.delete_index(temp_index_name)) + + status_cb("Creating new index...") + _wait_for_meili_task( + client.create_index(temp_index_name, {'primaryKey': 'id'}) + ) + new_index_created = client.get_index(temp_index_name).created_at + + yield temp_index_name + + if not _index_exists(STUDIO_INDEX_NAME): + # We have to create the "target" index before we can successfully swap the new one into it: + status_cb("Preparing to swap into index (first time)...") + _wait_for_meili_task(client.create_index(STUDIO_INDEX_NAME)) + status_cb("Swapping index...") + client.swap_indexes([{'indexes': [temp_index_name, STUDIO_INDEX_NAME]}]) + # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the status + # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103 + while True: + time.sleep(1) + if client.get_index(STUDIO_INDEX_NAME).created_at != new_index_created: + status_cb("Waiting for swap completion...") + else: + break + status_cb("Deleting old index...") + _wait_for_meili_task(client.delete_index(temp_index_name)) + + +def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: + """ + Recurse the children of an XBlock and call the given function for each + + The main purpose of this is just to wrap the loading of each child in + try...except. Otherwise block.get_children() would do what we need. + """ + if block.has_children: + for child_id in block.children: + try: + child = block.get_child(child_id) + except Exception as err: # pylint: disable=broad-except + log.exception(err) + if status_cb is not None: + status_cb(f"Unable to load block {child_id}") + else: + fn(child) + + +def _update_index_docs(docs) -> None: + """ + Helper function that updates the documents in the search index + + If there is a rebuild in progress, the document will also be added to the new index. + """ + if not docs: + return + + client = _get_meilisearch_client() + current_rebuild_index_name = _get_running_rebuild_index_name() + + tasks = [] + if current_rebuild_index_name: + # If there is a rebuild in progress, the document will also be added to the new index. + tasks.append(client.index(current_rebuild_index_name).update_documents(docs)) + tasks.append(client.index(STUDIO_INDEX_NAME).update_documents(docs)) + + _wait_for_meili_tasks(tasks) + + +def only_if_meilisearch_enabled(f): + """ + Only call `f` if meilisearch is enabled + """ + @wraps(f) + def wrapper(*args, **kwargs): + """Wraps the decorated function.""" + if is_meilisearch_enabled(): + return f(*args, **kwargs) + return wrapper + + +def is_meilisearch_enabled() -> bool: + """ + Returns whether Meilisearch is enabled + """ + if hasattr(settings, "MEILISEARCH_ENABLED"): + return settings.MEILISEARCH_ENABLED + + return False + + +def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: + """ + Rebuild the Meilisearch index from scratch + """ + if status_cb is None: + status_cb = log.info + + client = _get_meilisearch_client() + store = modulestore() + + # Get the lists of libraries + status_cb("Counting libraries...") + lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + num_libraries = len(lib_keys) + + # Get the list of courses + status_cb("Counting courses...") + with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): + all_courses = store.get_courses() + num_courses = len(all_courses) + + # Some counters so we can track our progress as indexing progresses: + num_contexts = num_courses + num_libraries + num_contexts_done = 0 # How many courses/libraries we've indexed + num_blocks_done = 0 # How many individual components/XBlocks we've indexed + + status_cb(f"Found {num_courses} courses and {num_libraries} libraries.") + with _using_temp_index(status_cb) as temp_index_name: + ############## Configure the index ############## + + # The following index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. + + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): + client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) + # Mark which attributes can be used for filtering/faceted search: + client.index(temp_index_name).update_filterable_attributes([ + Fields.block_type, + Fields.context_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.type, + Fields.access_id, + ]) + # Mark which attributes are used for keyword search, in order of importance: + client.index(temp_index_name).update_searchable_attributes([ + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.tags, + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + ]) + + ############## Libraries ############## + status_cb("Indexing libraries...") + for lib_key in lib_keys: + status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}") + try: + docs = [] + for component in lib_api.get_library_components(lib_key): + metadata = lib_api.LibraryXBlockMetadata.from_component(lib_key, component) + doc = {} + doc.update(searchable_doc_for_library_block(metadata)) + doc.update(searchable_doc_tags(metadata.usage_key)) + docs.append(doc) + + num_blocks_done += 1 + if docs: + # Add all the docs in this library at once (usually faster than adding one at a time): + _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + except Exception as err: # pylint: disable=broad-except + status_cb(f"Error indexing library {lib_key}: {err}") + finally: + num_contexts_done += 1 + + ############## Courses ############## + status_cb("Indexing courses...") + for course in all_courses: + status_cb( + f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" + ) + docs = [] + + # Pre-fetch the course with all of its children: + course = store.get_course(course.id, depth=None) + + def add_with_children(block): + """ Recursively index the given XBlock/component """ + doc = searchable_doc_for_course_block(block) + docs.append(doc) # pylint: disable=cell-var-from-loop + _recurse_children(block, add_with_children) # pylint: disable=cell-var-from-loop + + _recurse_children(course, add_with_children) + + if docs: + # Add all the docs in this course at once (usually faster than adding one at a time): + _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + num_contexts_done += 1 + num_blocks_done += len(docs) + + status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.") + + +def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None: + """ + Creates or updates the document for the given XBlock in the search index + + + Args: + usage_key (UsageKey): The usage key of the XBlock to index + recursive (bool): If True, also index all children of the XBlock + """ + xblock = modulestore().get_item(usage_key) + + docs = [] + + def add_with_children(block): + """ Recursively index the given XBlock/component """ + doc = searchable_doc_for_course_block(block) + docs.append(doc) + if recursive: + _recurse_children(block, add_with_children) + + add_with_children(xblock) + + _update_index_docs(docs) + + +def delete_index_doc(usage_key: UsageKey) -> None: + """ + Deletes the document for the given XBlock from the search index + + Args: + usage_key (UsageKey): The usage key of the XBlock to be removed from the index + """ + current_rebuild_index_name = _get_running_rebuild_index_name() + + client = _get_meilisearch_client() + + tasks = [] + if current_rebuild_index_name: + # If there is a rebuild in progress, the document will also be deleted from the new index. + tasks.append(client.index(current_rebuild_index_name).delete_document(meili_id_from_opaque_key(usage_key))) + tasks.append(client.index(STUDIO_INDEX_NAME).delete_document(meili_id_from_opaque_key(usage_key))) + + _wait_for_meili_tasks(tasks) + + +def upsert_library_block_index_doc(usage_key: UsageKey) -> None: + """ + Creates or updates the document for the given Library Block in the search index + """ + + library_block = lib_api.get_component_from_usage_key(usage_key) + library_block_metadata = lib_api.LibraryXBlockMetadata.from_component(usage_key.context_key, library_block) + + docs = [ + searchable_doc_for_library_block(library_block_metadata) + ] + + _update_index_docs(docs) + + +def upsert_content_library_index_docs(library_key: LibraryLocatorV2) -> None: + """ + Creates or updates the documents for the given Content Library in the search index + """ + docs = [] + for component in lib_api.get_library_components(library_key): + metadata = lib_api.LibraryXBlockMetadata.from_component(library_key, component) + doc = searchable_doc_for_library_block(metadata) + docs.append(doc) + + _update_index_docs(docs) + + +def _get_user_orgs(request: Request) -> list[str]: + """ + Get the org.short_names for the organizations that the requesting user has OrgStaffRole or OrgInstructorRole. + + Note: org-level roles have course_id=None to distinguish them from course-level roles. + """ + course_roles = get_course_roles(request.user) + return list(set( + role.org + for role in course_roles + if role.course_id is None and role.role in ['staff', 'instructor'] + )) + + +def _get_meili_access_filter(request: Request) -> dict: + """ + Return meilisearch filter based on the requesting user's permissions. + """ + # Global staff can see anything, so no filters required. + if GlobalStaff().has_user(request.user): + return {} + + # Everyone else is limited to their org staff roles... + user_orgs = _get_user_orgs(request)[:MAX_ORGS_IN_FILTER] + + # ...or the N most recent courses and libraries they can access. + access_ids = get_access_ids_for_request(request, omit_orgs=user_orgs)[:MAX_ACCESS_IDS_IN_FILTER] + return { + "filter": f"org IN {user_orgs} OR access_id IN {access_ids}", + } + + +def generate_user_token_for_studio_search(request): + """ + Returns a Meilisearch API key that only allows the user to search content that they have permission to view + """ + expires_at = datetime.now(tz=timezone.utc) + timedelta(days=7) + + search_rules = { + STUDIO_INDEX_NAME: _get_meili_access_filter(request), + } + # Note: the following is just generating a JWT. It doesn't actually make an API call to Meilisearch. + restricted_api_key = _get_meilisearch_client().generate_tenant_token( + api_key_uid=_get_meili_api_key_uid(), + search_rules=search_rules, + expires_at=expires_at, + ) + + return { + "url": settings.MEILISEARCH_PUBLIC_URL, + "index_name": STUDIO_INDEX_NAME, + "api_key": restricted_api_key, + } diff --git a/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst b/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst index 1f355409da0e..f7430121605d 100644 --- a/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst +++ b/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst @@ -104,6 +104,8 @@ Decision new ``content/search`` Django app, so it's relatively easy to swap out later if this experiment doesn't pan out. 4. We will not use ``edx-search`` for the new search functionality. +5. For the experiment, we won't use Meilisearch during tests, but we expect to + add that in the future if we move forward with replacing Elasticsearch completely. Consequences diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index d32956c688d0..e74bfaac0381 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -2,11 +2,12 @@ Utilities related to indexing content for search """ from __future__ import annotations -from hashlib import blake2b + import logging +from hashlib import blake2b from django.utils.text import slugify -from opaque_keys.edx.keys import UsageKey, LearningContextKey +from opaque_keys.edx.keys import LearningContextKey, UsageKey from openedx.core.djangoapps.content.search.models import SearchAccess from openedx.core.djangoapps.content_libraries import api as lib_api @@ -14,7 +15,6 @@ from openedx.core.djangoapps.xblock import api as xblock_api log = logging.getLogger(__name__) -STUDIO_INDEX_NAME = "studio_content" class Fields: @@ -64,7 +64,7 @@ class DocType: library_block = "library_block" -def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: +def meili_id_from_opaque_key(usage_key: UsageKey) -> str: """ Meilisearch requires each document to have a primary key that's either an integer or a string composed of alphanumeric characters (a-z A-Z 0-9), @@ -98,7 +98,6 @@ class implementation returns only: {"content": {"display_name": "..."}, "content_type": "..."} """ block_data = { - Fields.id: _meili_id_from_opaque_key(block.usage_key), Fields.usage_key: str(block.usage_key), Fields.block_id: str(block.usage_key.block_id), Fields.display_name: xblock_api.get_block_display_name(block), @@ -171,7 +170,7 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: # Note that we could improve performance for indexing many components from the same library/course, # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the # tags for each component separately. - all_tags = tagging_api.get_object_tags(object_id).all() + all_tags = tagging_api.get_object_tags(str(object_id)).all() if not all_tags: return {} result = { @@ -207,23 +206,38 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: return {Fields.tags: result} -def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict: +def searchable_doc_for_library_block(xblock_metadata: lib_api.LibraryXBlockMetadata) -> dict: """ Generate a dictionary document suitable for ingestion into a search engine like Meilisearch or Elasticsearch, so that the given library block can be found using faceted search. """ - library_name = lib_api.get_library(metadata.usage_key.context_key).title - doc = {} - try: - block = xblock_api.load_block(metadata.usage_key, user=None) - except Exception as err: # pylint: disable=broad-except - log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}") + library_name = lib_api.get_library(xblock_metadata.usage_key.context_key).title + block = xblock_api.load_block(xblock_metadata.usage_key, user=None) + + doc = { + Fields.id: meili_id_from_opaque_key(xblock_metadata.usage_key), + Fields.type: DocType.library_block, + } + doc.update(_fields_from_block(block)) - doc.update(_tags_for_content_object(metadata.usage_key)) - doc[Fields.type] = DocType.library_block + # Add the breadcrumbs. In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here: doc[Fields.breadcrumbs] = [{"display_name": library_name}] + + return doc + + +def searchable_doc_tags(usage_key: UsageKey) -> dict: + """ + Generate a dictionary document suitable for ingestion into a search engine + like Meilisearch or Elasticsearch, with the tags data for the given content object. + """ + doc = { + Fields.id: meili_id_from_opaque_key(usage_key), + } + doc.update(_tags_for_content_object(usage_key)) + return doc @@ -233,7 +247,11 @@ def searchable_doc_for_course_block(block) -> dict: like Meilisearch or Elasticsearch, so that the given course block can be found using faceted search. """ - doc = _fields_from_block(block) - doc.update(_tags_for_content_object(block.usage_key)) - doc[Fields.type] = DocType.course_block + doc = { + Fields.id: meili_id_from_opaque_key(block.usage_key), + Fields.type: DocType.course_block, + } + + doc.update(_fields_from_block(block)) + return doc diff --git a/openedx/core/djangoapps/content/search/handlers.py b/openedx/core/djangoapps/content/search/handlers.py index 9184b2b2e273..d782d4a07040 100644 --- a/openedx/core/djangoapps/content/search/handlers.py +++ b/openedx/core/djangoapps/content/search/handlers.py @@ -1,14 +1,36 @@ """ Signal/event handlers for content search """ + +import logging + from django.db.models.signals import post_delete from django.dispatch import receiver -from openedx_events.content_authoring.data import ContentLibraryData -from openedx_events.content_authoring.signals import CONTENT_LIBRARY_DELETED +from openedx_events.content_authoring.data import ContentLibraryData, LibraryBlockData, XBlockData +from openedx_events.content_authoring.signals import ( + CONTENT_LIBRARY_DELETED, + CONTENT_LIBRARY_UPDATED, + LIBRARY_BLOCK_CREATED, + LIBRARY_BLOCK_DELETED, + XBLOCK_CREATED, + XBLOCK_DELETED, + XBLOCK_UPDATED +) from openedx.core.djangoapps.content.course_overviews.models import CourseOverview from openedx.core.djangoapps.content.search.models import SearchAccess +from .api import only_if_meilisearch_enabled +from .tasks import ( + delete_library_block_index_doc, + delete_xblock_index_doc, + update_content_library_index_docs, + upsert_library_block_index_doc, + upsert_xblock_index_doc +) + +log = logging.getLogger(__name__) + # Using post_delete here because there is no COURSE_DELETED event defined. @receiver(post_delete, sender=CourseOverview) @@ -21,3 +43,93 @@ def delete_course_search_access(sender, instance, **kwargs): # pylint: disable= def delete_library_search_access(content_library: ContentLibraryData, **kwargs): """Deletes the SearchAccess instance for deleted content libraries""" SearchAccess.objects.filter(context_key=content_library.library_key).delete() + + +@receiver(XBLOCK_CREATED) +@only_if_meilisearch_enabled +def xblock_created_handler(**kwargs) -> None: + """ + Create the index for the XBlock + """ + xblock_info = kwargs.get("xblock_info", None) + if not xblock_info or not isinstance(xblock_info, XBlockData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + upsert_xblock_index_doc.delay( + str(xblock_info.usage_key), + recursive=False, + ) + + +@receiver(XBLOCK_UPDATED) +@only_if_meilisearch_enabled +def xblock_updated_handler(**kwargs) -> None: + """ + Update the index for the XBlock and its children + """ + xblock_info = kwargs.get("xblock_info", None) + if not xblock_info or not isinstance(xblock_info, XBlockData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + upsert_xblock_index_doc.delay( + str(xblock_info.usage_key), + recursive=True, # Update all children because the breadcrumb may have changed + ) + + +@receiver(XBLOCK_DELETED) +@only_if_meilisearch_enabled +def xblock_deleted_handler(**kwargs) -> None: + """ + Delete the index for the XBlock + """ + xblock_info = kwargs.get("xblock_info", None) + if not xblock_info or not isinstance(xblock_info, XBlockData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + delete_xblock_index_doc.delay(str(xblock_info.usage_key)) + + +@receiver(LIBRARY_BLOCK_CREATED) +@only_if_meilisearch_enabled +def library_block_updated_handler(**kwargs) -> None: + """ + Create or update the index for the content library block + """ + library_block_data = kwargs.get("library_block", None) + if not library_block_data or not isinstance(library_block_data, LibraryBlockData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + upsert_library_block_index_doc.delay(str(library_block_data.usage_key)) + + +@receiver(LIBRARY_BLOCK_DELETED) +@only_if_meilisearch_enabled +def library_block_deleted(**kwargs) -> None: + """ + Delete the index for the content library block + """ + library_block_data = kwargs.get("library_block", None) + if not library_block_data or not isinstance(library_block_data, LibraryBlockData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + delete_library_block_index_doc.delay(str(library_block_data.usage_key)) + + +@receiver(CONTENT_LIBRARY_UPDATED) +@only_if_meilisearch_enabled +def content_library_updated_handler(**kwargs) -> None: + """ + Update the index for the content library + """ + content_library_data = kwargs.get("content_library", None) + if not content_library_data or not isinstance(content_library_data, ContentLibraryData): # pragma: no cover + log.error("Received null or incorrect data for event") + return + + update_content_library_index_docs.delay(str(content_library_data.library_key)) diff --git a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py deleted file mode 100644 index 18c7681879cb..000000000000 --- a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Mixin for Django management commands that interact with Meilisearch -""" -from contextlib import contextmanager -import time - -from django.conf import settings -from django.core.management import CommandError -import meilisearch -from meilisearch.errors import MeilisearchError -from meilisearch.models.task import TaskInfo - - -class MeiliCommandMixin: - """ - Mixin for Django management commands that interact with Meilisearch - """ - def get_meilisearch_client(self): - """ - Get the Meiliesearch client - """ - if hasattr(self, "_meili_client"): - return self._meili_client - # Connect to Meilisearch - if not settings.MEILISEARCH_ENABLED: - raise CommandError("MEILISEARCH_ENABLED is not set - search functionality disabled.") - - self._meili_client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) - try: - self._meili_client.health() - except MeilisearchError as err: - self.stderr.write(err.message) # print this because 'raise...from...' doesn't print the details - raise CommandError("Unable to connect to Meilisearch") from err - return self._meili_client - - def wait_for_meili_task(self, info: TaskInfo): - """ - Simple helper method to wait for a Meilisearch task to complete - """ - client = self.get_meilisearch_client() - current_status = client.get_task(info.task_uid) - while current_status.status in ("enqueued", "processing"): - self.stdout.write("...") - time.sleep(1) - current_status = client.get_task(info.task_uid) - if current_status.status != "succeeded": - self.stderr.write(f"Task has status: {current_status.status}") - self.stderr.write(str(current_status.error)) - try: - err_reason = current_status.error['message'] - except (TypeError, KeyError): - err_reason = "Unknown error" - raise MeilisearchError(err_reason) - - def index_exists(self, index_name: str) -> bool: - """ - Check if an index exists - """ - client = self.get_meilisearch_client() - try: - client.get_index(index_name) - except MeilisearchError as err: - if err.code == "index_not_found": - return False - else: - raise err - return True - - @contextmanager - def using_temp_index(self, target_index): - """ - Create a new temporary Meilisearch index, populate it, then swap it to - become the active index. - """ - client = self.get_meilisearch_client() - self.stdout.write("Checking index...") - temp_index_name = target_index + "_new" - if self.index_exists(temp_index_name): - self.stdout.write("Temporary index already exists. Deleting it...") - self.wait_for_meili_task(client.delete_index(temp_index_name)) - - self.stdout.write("Creating new index...") - self.wait_for_meili_task( - client.create_index(temp_index_name, {'primaryKey': 'id'}) - ) - new_index_created = client.get_index(temp_index_name).created_at - - yield temp_index_name - - if not self.index_exists(target_index): - # We have to create the "target" index before we can successfully swap the new one into it: - self.stdout.write("Preparing to swap into index (first time)...") - self.wait_for_meili_task(client.create_index(target_index)) - self.stdout.write("Swapping index...") - client.swap_indexes([{'indexes': [temp_index_name, target_index]}]) - # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the status - # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103 - while True: - time.sleep(1) - if client.get_index(target_index).created_at != new_index_created: - self.stdout.write("Waiting for swap completion...") - else: - break - self.stdout.write("Deleting old index...") - self.wait_for_meili_task(client.delete_index(temp_index_name)) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 8d39ef3746eb..3767ebcba6c9 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -5,28 +5,12 @@ See also cms/djangoapps/contentstore/management/commands/reindex_course.py which indexes LMS (published) courses in ElasticSearch. """ -import logging -import time - -from django.conf import settings from django.core.management import BaseCommand, CommandError -from openedx.core.djangoapps.content_libraries import api as lib_api -from openedx.core.djangoapps.content.search.documents import ( - Fields, - searchable_doc_for_course_block, - searchable_doc_for_library_block, - STUDIO_INDEX_NAME, -) -from xmodule.modulestore import ModuleStoreEnum -from xmodule.modulestore.django import modulestore -from .meili_mixin import MeiliCommandMixin - - -log = logging.getLogger(__name__) +from ... import api -class Command(MeiliCommandMixin, BaseCommand): +class Command(BaseCommand): """ Build or re-build the search index for courses and libraries (in Studio, i.e. Draft mode) @@ -41,120 +25,13 @@ def handle(self, *args, **options): """ Build a new search index for Studio, containing content from courses and libraries """ + if not api.is_meilisearch_enabled(): + raise CommandError("Meilisearch is not enabled. Please set MEILISEARCH_ENABLED to True in your settings.") + if not options["experimental"]: raise CommandError( "This command is experimental and not recommended for production. " "Use the --experimental argument to acknowledge and run it." ) - start_time = time.perf_counter() - client = self.get_meilisearch_client() - store = modulestore() - - # Get the lists of libraries - self.stdout.write("Counting libraries...") - lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] - num_libraries = len(lib_keys) - - # Get the list of courses - self.stdout.write("Counting courses...") - with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): - all_courses = store.get_courses() - num_courses = len(all_courses) - - # Some counters so we can track our progress as indexing progresses: - num_contexts = num_courses + num_libraries - num_contexts_done = 0 # How many courses/libraries we've indexed - num_blocks_done = 0 # How many individual components/XBlocks we've indexed - - self.stdout.write(f"Found {num_courses} courses and {num_libraries} libraries.") - index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME - with self.using_temp_index(index_name) as temp_index_name: - ############## Configure the index ############## - - # The following index settings are best changed on an empty index. - # Changing them on a populated index will "re-index all documents in the index, which can take some time" - # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. - - # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) - # Mark which attributes can be used for filtering/faceted search: - client.index(temp_index_name).update_filterable_attributes([ - Fields.block_type, - Fields.context_key, - Fields.org, - Fields.tags, - Fields.type, - Fields.access_id, - ]) - # Mark which attributes are used for keyword search, in order of importance: - client.index(temp_index_name).update_searchable_attributes([ - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.tags, - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - ]) - ############## Libraries ############## - self.stdout.write("Indexing libraries...") - for lib_key in lib_keys: - self.stdout.write(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}") - docs = [] - for component in lib_api.get_library_components(lib_key): - metadata = lib_api.LibraryXBlockMetadata.from_component(lib_key, component) - doc = searchable_doc_for_library_block(metadata) - docs.append(doc) - num_blocks_done += 1 - if docs: - # Add all the docs in this library at once (usually faster than adding one at a time): - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_contexts_done += 1 - - ############## Courses ############## - self.stdout.write("Indexing courses...") - for course in all_courses: - self.stdout.write( - f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" - ) - docs = [] - - # Pre-fetch the course with all of its children: - course = store.get_course(course.id, depth=None) - - def add_with_children(block): - """ Recursively index the given XBlock/component """ - doc = searchable_doc_for_course_block(block) - docs.append(doc) # pylint: disable=cell-var-from-loop - self.recurse_children(block, add_with_children) # pylint: disable=cell-var-from-loop - - self.recurse_children(course, add_with_children) - - if docs: - # Add all the docs in this course at once (usually faster than adding one at a time): - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_contexts_done += 1 - num_blocks_done += len(docs) - - elapsed_time = time.perf_counter() - start_time - self.stdout.write( - f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses " - f"and libraries in {elapsed_time:.0f}s." - ) - - def recurse_children(self, block, fn): - """ - Recurse the children of an XBlock and call the given function for each - - The main purpose of this is just to wrap the loading of each child in - try...except. Otherwise block.get_children() would do what we need. - """ - if block.has_children: - for child_id in block.children: - try: - child = block.get_child(child_id) - except Exception as err: # pylint: disable=broad-except - log.exception(err) - self.stderr.write(f"Unable to load block {child_id}") - else: - fn(child) + api.rebuild_index(self.stdout.write) diff --git a/openedx/core/djangoapps/content/search/tasks.py b/openedx/core/djangoapps/content/search/tasks.py new file mode 100644 index 000000000000..06ea3e777c61 --- /dev/null +++ b/openedx/core/djangoapps/content/search/tasks.py @@ -0,0 +1,83 @@ +""" +Defines asynchronous celery task for content indexing +""" + +from __future__ import annotations + +import logging + +from celery import shared_task +from celery_utils.logged_task import LoggedTask +from edx_django_utils.monitoring import set_code_owner_attribute +from opaque_keys.edx.keys import UsageKey +from opaque_keys.edx.locator import LibraryLocatorV2, LibraryUsageLocatorV2 +from meilisearch.errors import MeilisearchError + +from . import api + +log = logging.getLogger(__name__) + + +@shared_task(base=LoggedTask, autoretry_for=(MeilisearchError, ConnectionError)) +@set_code_owner_attribute +def upsert_xblock_index_doc(usage_key_str: str, recursive: bool) -> None: + """ + Celery task to update the content index document for an XBlock + """ + usage_key = UsageKey.from_string(usage_key_str) + + log.info("Updating content index document for XBlock with id: %s", usage_key) + + api.upsert_xblock_index_doc(usage_key, recursive) + + +@shared_task(base=LoggedTask, autoretry_for=(MeilisearchError, ConnectionError)) +@set_code_owner_attribute +def delete_xblock_index_doc(usage_key_str: str) -> None: + """ + Celery task to delete the content index document for an XBlock + """ + usage_key = UsageKey.from_string(usage_key_str) + + log.info("Updating content index document for XBlock with id: %s", usage_key) + + api.delete_index_doc(usage_key) + + +@shared_task(base=LoggedTask, autoretry_for=(MeilisearchError, ConnectionError)) +@set_code_owner_attribute +def upsert_library_block_index_doc(usage_key_str: str) -> None: + """ + Celery task to update the content index document for a library block + """ + usage_key = LibraryUsageLocatorV2.from_string(usage_key_str) + + log.info("Updating content index document for library block with id: %s", usage_key) + + api.upsert_library_block_index_doc(usage_key) + + +@shared_task(base=LoggedTask, autoretry_for=(MeilisearchError, ConnectionError)) +@set_code_owner_attribute +def delete_library_block_index_doc(usage_key_str: str) -> None: + """ + Celery task to delete the content index document for a library block + """ + usage_key = LibraryUsageLocatorV2.from_string(usage_key_str) + + log.info("Deleting content index document for library block with id: %s", usage_key) + + api.delete_index_doc(usage_key) + + +@shared_task(base=LoggedTask, autoretry_for=(MeilisearchError, ConnectionError)) +@set_code_owner_attribute +def update_content_library_index_docs(library_key_str: str) -> None: + """ + Celery task to update the content index documents for all library blocks in a library + """ + library_key = LibraryLocatorV2.from_string(library_key_str) + + log.info("Updating content index documents for library with id: %s", library_key) + + api.upsert_content_library_index_docs(library_key) diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py new file mode 100644 index 000000000000..43e1eb20e68e --- /dev/null +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -0,0 +1,195 @@ +""" +Tests for the Studio content search API. +""" +from __future__ import annotations + +from unittest.mock import MagicMock, call, patch + +import ddt +from django.test import override_settings +from organizations.tests.factories import OrganizationFactory + +from common.djangoapps.student.tests.factories import UserFactory +from openedx.core.djangoapps.content_libraries import api as library_api +from openedx.core.djangolib.testing.utils import skip_unless_cms +from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, ModuleStoreTestCase + + +try: + # This import errors in the lms because content.search is not an installed app there. + from .. import api + from ..models import SearchAccess +except RuntimeError: + SearchAccess = {} + +STUDIO_SEARCH_ENDPOINT_URL = "/api/content_search/v2/studio/" + + +@ddt.ddt +@skip_unless_cms +@patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", new=MagicMock(return_value=None)) +@patch("openedx.core.djangoapps.content.search.api.MeilisearchClient") +class TestSearchApi(ModuleStoreTestCase): + """ + Tests for the Studio content search and index API. + """ + + MODULESTORE = TEST_DATA_SPLIT_MODULESTORE + + def setUp(self): + super().setUp() + self.user = UserFactory.create() + self.user_id = self.user.id + + self.modulestore_patcher = patch( + "openedx.core.djangoapps.content.search.api.modulestore", return_value=self.store + ) + self.addCleanup(self.modulestore_patcher.stop) + self.modulestore_patcher.start() + + # Clear the Meilisearch client to avoid side effects from other tests + api.clear_meilisearch_client() + + # Create course + self.course = self.store.create_course( + "org1", + "test_course", + "test_run", + self.user_id, + fields={"display_name": "Test Course"}, + ) + course_access, _ = SearchAccess.objects.get_or_create(context_key=self.course.id) + + # Create XBlocks + self.sequential = self.store.create_child(self.user_id, self.course.location, "sequential", "test_sequential") + self.doc_sequential = { + 'id': 'block-v1org1test_coursetest_runtypesequentialblocktest_sequential-f702c144', + 'type': 'course_block', + 'usage_key': 'block-v1:org1+test_course+test_run+type@sequential+block@test_sequential', + 'block_id': 'test_sequential', + 'display_name': 'sequential', + 'block_type': 'sequential', + 'context_key': 'course-v1:org1+test_course+test_run', + 'org': 'org1', + 'breadcrumbs': [{'display_name': 'Test Course'}], + 'content': {}, + 'access_id': course_access.id, + } + self.store.create_child(self.user_id, self.sequential.location, "vertical", "test_vertical") + self.doc_vertical = { + 'id': 'block-v1org1test_coursetest_runtypeverticalblocktest_vertical-e76a10a4', + 'type': 'course_block', + 'usage_key': 'block-v1:org1+test_course+test_run+type@vertical+block@test_vertical', + 'block_id': 'test_vertical', + 'display_name': 'vertical', + 'block_type': 'vertical', + 'context_key': 'course-v1:org1+test_course+test_run', + 'org': 'org1', + 'breadcrumbs': [ + {'display_name': 'Test Course'}, + {'display_name': 'sequential'} + ], + 'content': {}, + 'access_id': course_access.id, + } + + # Create a content library: + self.library = library_api.create_library( + library_type=library_api.COMPLEX, + org=OrganizationFactory.create(short_name="org1"), + slug="lib", + title="Library", + ) + lib_access, _ = SearchAccess.objects.get_or_create(context_key=self.library.key) + # Populate it with a problem: + self.problem = library_api.create_library_block(self.library.key, "problem", "p1") + self.doc_problem = { + "id": "lborg1libproblemp1-a698218e", + "usage_key": "lb:org1:lib:problem:p1", + "block_id": "p1", + "display_name": "Blank Problem", + "block_type": "problem", + "context_key": "lib:org1:lib", + "org": "org1", + "breadcrumbs": [{"display_name": "Library"}], + "content": {"problem_types": [], "capa_content": " "}, + "type": "library_block", + "access_id": lib_access.id, + } + + @override_settings(MEILISEARCH_ENABLED=False) + def test_reindex_meilisearch_disabled(self, mock_meilisearch): + with self.assertRaises(RuntimeError): + api.rebuild_index() + + mock_meilisearch.return_value.swap_indexes.assert_not_called() + + @override_settings(MEILISEARCH_ENABLED=True) + def test_reindex_meilisearch(self, mock_meilisearch): + + api.rebuild_index() + mock_meilisearch.return_value.index.return_value.add_documents.assert_has_calls( + [ + call([self.doc_sequential, self.doc_vertical]), + call([self.doc_problem]), + ], + any_order=True, + ) + + @ddt.data( + True, + False + ) + @override_settings(MEILISEARCH_ENABLED=True) + def test_index_xblock_metadata(self, recursive, mock_meilisearch): + """ + Test indexing an XBlock. + """ + api.upsert_xblock_index_doc(self.sequential.usage_key, recursive=recursive) + + if recursive: + expected_docs = [self.doc_sequential, self.doc_vertical] + else: + expected_docs = [self.doc_sequential] + + mock_meilisearch.return_value.index.return_value.update_documents.assert_called_once_with(expected_docs) + + @override_settings(MEILISEARCH_ENABLED=True) + def test_delete_index_xblock(self, mock_meilisearch): + """ + Test deleting an XBlock doc from the index. + """ + api.delete_index_doc(self.sequential.usage_key) + + mock_meilisearch.return_value.index.return_value.delete_document.assert_called_once_with( + self.doc_sequential['id'] + ) + + @override_settings(MEILISEARCH_ENABLED=True) + def test_index_library_block_metadata(self, mock_meilisearch): + """ + Test indexing a Library Block. + """ + api.upsert_library_block_index_doc(self.problem.usage_key) + + mock_meilisearch.return_value.index.return_value.update_documents.assert_called_once_with([self.doc_problem]) + + @override_settings(MEILISEARCH_ENABLED=True) + def test_delete_index_library_block(self, mock_meilisearch): + """ + Test deleting a Library Block doc from the index. + """ + api.delete_index_doc(self.problem.usage_key) + + mock_meilisearch.return_value.index.return_value.delete_document.assert_called_once_with( + self.doc_problem['id'] + ) + + @override_settings(MEILISEARCH_ENABLED=True) + def test_index_content_library_metadata(self, mock_meilisearch): + """ + Test indexing a whole content library. + """ + api.upsert_content_library_index_docs(self.library.key) + + mock_meilisearch.return_value.index.return_value.update_documents.assert_called_once_with([self.doc_problem]) diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 914cae880a41..79efda11177b 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -2,6 +2,7 @@ Tests for the Studio content search documents (what gets stored in the index) """ from organizations.models import Organization + from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangolib.testing.utils import skip_unless_cms from xmodule.modulestore.django import modulestore @@ -10,7 +11,7 @@ try: # This import errors in the lms because content.search is not an installed app there. - from ..documents import searchable_doc_for_course_block + from ..documents import searchable_doc_for_course_block, searchable_doc_tags from ..models import SearchAccess except RuntimeError: searchable_doc_for_course_block = lambda x: x @@ -78,7 +79,10 @@ def test_problem_block(self): Test how a problem block gets represented in the search index """ block = self.store.get_item(self.problem_block.usage_key) - doc = searchable_doc_for_course_block(block) + doc = {} + doc.update(searchable_doc_for_course_block(block)) + doc.update(searchable_doc_tags(block.usage_key)) + assert doc == { # Note the 'id' has been stripped of special characters to meet Meilisearch requirements. # The '-8516ed8' suffix is deterministic based on the original usage key. @@ -115,7 +119,9 @@ def test_html_block(self): Test how an HTML block gets represented in the search index """ block = self.store.get_item(self.html_block_key) - doc = searchable_doc_for_course_block(block) + doc = {} + doc.update(searchable_doc_for_course_block(block)) + doc.update(searchable_doc_tags(block.usage_key)) assert doc == { "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-efb9c601", "type": "course_block", diff --git a/openedx/core/djangoapps/content/search/tests/test_handlers.py b/openedx/core/djangoapps/content/search/tests/test_handlers.py new file mode 100644 index 000000000000..1a1134547666 --- /dev/null +++ b/openedx/core/djangoapps/content/search/tests/test_handlers.py @@ -0,0 +1,156 @@ +""" +Tests for the search index update handlers +""" +from unittest.mock import MagicMock, patch + +from django.test import LiveServerTestCase, override_settings +from organizations.tests.factories import OrganizationFactory + +from common.djangoapps.student.tests.factories import UserFactory +from openedx.core.djangoapps.content_libraries import api as library_api +from openedx.core.djangolib.testing.utils import skip_unless_cms +from openedx.core.lib.blockstore_api.tests.base import BlockstoreAppTestMixin +from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, ModuleStoreTestCase + + +try: + # This import errors in the lms because content.search is not an installed app there. + from .. import api + from ..models import SearchAccess +except RuntimeError: + SearchAccess = {} + + +@patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", new=MagicMock(return_value=None)) +@patch("openedx.core.djangoapps.content.search.api.MeilisearchClient") +@override_settings(MEILISEARCH_ENABLED=True) +@skip_unless_cms +class TestUpdateIndexHandlers( + ModuleStoreTestCase, + BlockstoreAppTestMixin, + LiveServerTestCase, +): + """ + Test that the search index is updated when XBlocks and Library Blocks are modified + """ + + MODULESTORE = TEST_DATA_SPLIT_MODULESTORE + + def setUp(self): + super().setUp() + # Create user + self.user = UserFactory.create() + self.user_id = self.user.id + + self.orgA = OrganizationFactory.create(short_name="orgA") + + self.patcher = patch("openedx.core.djangoapps.content_tagging.tasks.modulestore", return_value=self.store) + self.addCleanup(self.patcher.stop) + self.patcher.start() + + api.clear_meilisearch_client() # Clear the Meilisearch client to avoid leaking state from other tests + + def test_create_delete_xblock(self, meilisearch_client): + # Create course + course = self.store.create_course( + self.orgA.short_name, + "test_course", + "test_run", + self.user_id, + fields={"display_name": "Test Course"}, + ) + course_access, _ = SearchAccess.objects.get_or_create(context_key=course.id) + + # Create XBlocks + sequential = self.store.create_child(self.user_id, course.location, "sequential", "test_sequential") + doc_sequential = { + "id": "block-v1orgatest_coursetest_runtypesequentialblocktest_sequential-0cdb9395", + "type": "course_block", + "usage_key": "block-v1:orgA+test_course+test_run+type@sequential+block@test_sequential", + "block_id": "test_sequential", + "display_name": "sequential", + "block_type": "sequential", + "context_key": "course-v1:orgA+test_course+test_run", + "org": "orgA", + "breadcrumbs": [{"display_name": "Test Course"}], "content": {}, + "access_id": course_access.id, + + } + meilisearch_client.return_value.index.return_value.update_documents.assert_called_with([doc_sequential]) + vertical = self.store.create_child(self.user_id, sequential.location, "vertical", "test_vertical") + doc_vertical = { + "id": "block-v1orgatest_coursetest_runtypeverticalblocktest_vertical-011f143b", + "type": "course_block", + "usage_key": "block-v1:orgA+test_course+test_run+type@vertical+block@test_vertical", + "block_id": "test_vertical", + "display_name": "vertical", + "block_type": "vertical", + "context_key": "course-v1:orgA+test_course+test_run", + "org": "orgA", + "breadcrumbs": [{"display_name": "Test Course"}, {"display_name": "sequential"}], + "content": {}, + "access_id": course_access.id, + } + + meilisearch_client.return_value.index.return_value.update_documents.assert_called_with([doc_vertical]) + + # Update the XBlock + sequential = self.store.get_item(sequential.location, self.user_id) # Refresh the XBlock + sequential.display_name = "Updated Sequential" + self.store.update_item(sequential, self.user_id) + + # The display name and the child's breadcrumbs should be updated + doc_sequential["display_name"] = "Updated Sequential" + doc_vertical["breadcrumbs"][1]["display_name"] = "Updated Sequential" + meilisearch_client.return_value.index.return_value.update_documents.assert_called_with([ + doc_sequential, + doc_vertical, + ]) + + # Delete the XBlock + self.store.delete_item(vertical.location, self.user_id) + + meilisearch_client.return_value.index.return_value.delete_document.assert_called_with( + "block-v1orgatest_coursetest_runtypeverticalblocktest_vertical-011f143b" + ) + + def test_create_delete_library_block(self, meilisearch_client): + # Create library + library = library_api.create_library( + org=self.orgA, + slug="lib_a", + title="Library Org A", + description="This is a library from Org A", + ) + lib_access, _ = SearchAccess.objects.get_or_create(context_key=library.key) + + problem = library_api.create_library_block(library.key, "problem", "Problem1") + doc_problem = { + "id": "lborgalib_aproblemproblem1-ca3186e9", + "type": "library_block", + "usage_key": "lb:orgA:lib_a:problem:Problem1", + "block_id": "Problem1", + "display_name": "Blank Problem", + "block_type": "problem", + "context_key": "lib:orgA:lib_a", + "org": "orgA", + "breadcrumbs": [{"display_name": "Library Org A"}], + "content": {"problem_types": [], "capa_content": " "}, + "access_id": lib_access.id, + } + + meilisearch_client.return_value.index.return_value.update_documents.assert_called_with([doc_problem]) + + # Rename the content library + library_api.update_library(library.key, title="Updated Library Org A") + + # The breadcrumbs should be updated + doc_problem["breadcrumbs"][0]["display_name"] = "Updated Library Org A" + meilisearch_client.return_value.index.return_value.update_documents.assert_called_with([doc_problem]) + + # Delete the Library Block + library_api.delete_library_block(problem.usage_key) + + meilisearch_client.return_value.index.return_value.delete_document.assert_called_with( + "lborgalib_aproblemproblem1-ca3186e9" + ) diff --git a/openedx/core/djangoapps/content/search/tests/test_views.py b/openedx/core/djangoapps/content/search/tests/test_views.py index 853f5b5149a6..7c1b8e99850a 100644 --- a/openedx/core/djangoapps/content/search/tests/test_views.py +++ b/openedx/core/djangoapps/content/search/tests/test_views.py @@ -1,8 +1,10 @@ """ Tests for the Studio content search REST API. """ +from __future__ import annotations + import functools -from unittest import mock +from unittest.mock import ANY, MagicMock, Mock, patch import ddt from django.test import override_settings @@ -17,7 +19,8 @@ try: # This import errors in the lms because content.search is not an installed app there. - from openedx.core.djangoapps.content.search.models import SearchAccess + from .. import api + from ..models import SearchAccess except RuntimeError: SearchAccess = {} @@ -28,7 +31,7 @@ def mock_meilisearch(enabled=True): """ - Decorator that mocks the required parts of content.search.views to simulate a running Meilisearch instance. + Decorator that mocks the required parts of content.search.api to simulate a running Meilisearch instance. """ def decorator(func): """ @@ -40,17 +43,19 @@ def wrapper(*args, **kwargs): MEILISEARCH_ENABLED=enabled, MEILISEARCH_PUBLIC_URL="http://meilisearch.url", ): - with mock.patch( - 'openedx.core.djangoapps.content.search.views._get_meili_api_key_uid', + with patch( + 'openedx.core.djangoapps.content.search.api._get_meili_api_key_uid', return_value=MOCK_API_KEY_UID, ): return func(*args, **kwargs) + return wrapper return decorator @ddt.ddt @skip_unless_cms +@patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", new=MagicMock(return_value=None)) class StudioSearchViewTest(StudioSearchTestMixin, SharedModuleStoreTestCase): """ General tests for the Studio search REST API. @@ -59,6 +64,9 @@ def setUp(self): super().setUp() self.client = APIClient() + # Clear the Meilisearch client to avoid side effects from other tests + api.clear_meilisearch_client() + @mock_meilisearch(enabled=False) def test_studio_search_unathenticated_disabled(self): """ @@ -84,31 +92,33 @@ def test_studio_search_disabled(self): result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) assert result.status_code == 404 + def _mock_generate_tenant_token(self, mock_search_client): + """ + Return a mocked meilisearch.Client.generate_tenant_token method. + """ + mock_generate_tenant_token = Mock(return_value='restricted_api_key') + mock_search_client.return_value = Mock( + generate_tenant_token=mock_generate_tenant_token, + ) + return mock_generate_tenant_token + @mock_meilisearch(enabled=True) - def test_studio_search_enabled(self): + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') + def test_studio_search_enabled(self, mock_search_client): """ We've implement fine-grained permissions on the meilisearch content, so any logged-in user can get a restricted API key for Meilisearch using the REST API. """ self.client.login(username='student', password='student_pass') + mock_generate_tenant_token = self._mock_generate_tenant_token(mock_search_client) result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) assert result.status_code == 200 assert result.data["index_name"] == "studio_content" assert result.data["url"] == "http://meilisearch.url" assert result.data["api_key"] and isinstance(result.data["api_key"], str) - def _mock_generate_tenant_token(self, mock_search_client): - """ - Return a mocked meilisearch.Client.generate_tenant_token method. - """ - mock_generate_tenant_token = mock.Mock(return_value='restricted_api_key') - mock_search_client.return_value = mock.Mock( - generate_tenant_token=mock_generate_tenant_token, - ) - return mock_generate_tenant_token - @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_student_no_access(self, mock_search_client): """ Users without access to any courses or libraries will have all documents filtered out. @@ -124,11 +134,11 @@ def test_studio_search_student_no_access(self, mock_search_client): "filter": "org IN [] OR access_id IN []", } }, - expires_at=mock.ANY, + expires_at=ANY, ) @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_staff(self, mock_search_client): """ Users with global staff access can search any document. @@ -142,11 +152,11 @@ def test_studio_search_staff(self, mock_search_client): search_rules={ "studio_content": {} }, - expires_at=mock.ANY, + expires_at=ANY, ) @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_course_staff_access(self, mock_search_client): """ Users with staff or instructor access to a course or library will be limited to these courses/libraries. @@ -168,7 +178,7 @@ def test_studio_search_course_staff_access(self, mock_search_client): "filter": f"org IN [] OR access_id IN {expected_access_ids}", } }, - expires_at=mock.ANY, + expires_at=ANY, ) @ddt.data( @@ -176,7 +186,7 @@ def test_studio_search_course_staff_access(self, mock_search_client): 'org_instr', ) @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_org_access(self, username, mock_search_client): """ Users with org access to any courses or libraries will use the org filter. @@ -192,11 +202,11 @@ def test_studio_search_org_access(self, username, mock_search_client): "filter": "org IN ['org1'] OR access_id IN []", } }, - expires_at=mock.ANY, + expires_at=ANY, ) @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_omit_orgs(self, mock_search_client): """ Grant org access to our staff user to ensure that org's access_ids are omitted from the search filter. @@ -219,13 +229,13 @@ def test_studio_search_omit_orgs(self, mock_search_client): "filter": f"org IN ['org1'] OR access_id IN {expected_access_ids}", } }, - expires_at=mock.ANY, + expires_at=ANY, ) @mock_meilisearch(enabled=True) - @mock.patch('openedx.core.djangoapps.content.search.views._get_user_orgs') - @mock.patch('openedx.core.djangoapps.content.search.views.get_access_ids_for_request') - @mock.patch('openedx.core.djangoapps.content.search.views.meilisearch.Client') + @patch('openedx.core.djangoapps.content.search.api._get_user_orgs') + @patch('openedx.core.djangoapps.content.search.api.get_access_ids_for_request') + @patch('openedx.core.djangoapps.content.search.api.MeilisearchClient') def test_studio_search_limits(self, mock_search_client, mock_get_access_ids, mock_get_user_orgs): """ Users with access to many courses/libraries or orgs will only be able to search content @@ -254,5 +264,5 @@ def test_studio_search_limits(self, mock_search_client, mock_get_access_ids, moc "filter": f"org IN {expected_user_orgs} OR access_id IN {expected_access_ids}", } }, - expires_at=mock.ANY, + expires_at=ANY, ) diff --git a/openedx/core/djangoapps/content/search/views.py b/openedx/core/djangoapps/content/search/views.py index 5b29a8db1b94..f747fe77e8b5 100644 --- a/openedx/core/djangoapps/content/search/views.py +++ b/openedx/core/djangoapps/content/search/views.py @@ -3,69 +3,19 @@ """ from __future__ import annotations -from datetime import datetime, timedelta, timezone import logging -from django.conf import settings from django.contrib.auth import get_user_model -import meilisearch from rest_framework.exceptions import NotFound -from rest_framework.request import Request from rest_framework.response import Response from rest_framework.views import APIView -from common.djangoapps.student.role_helpers import get_course_roles -from common.djangoapps.student.roles import GlobalStaff from openedx.core.lib.api.view_utils import view_auth_classes -from openedx.core.djangoapps.content.search.documents import STUDIO_INDEX_NAME -from openedx.core.djangoapps.content.search.models import get_access_ids_for_request + +from . import api User = get_user_model() log = logging.getLogger(__name__) -MAX_ACCESS_IDS_IN_FILTER = 1_000 -MAX_ORGS_IN_FILTER = 1_000 - - -def _get_meili_api_key_uid(): - """ - Helper method to get the UID of the API key we're using for Meilisearch - """ - if not hasattr(_get_meili_api_key_uid, "uid"): - client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) - _get_meili_api_key_uid.uid = client.get_key(settings.MEILISEARCH_API_KEY).uid - return _get_meili_api_key_uid.uid - - -def _get_meili_access_filter(request: Request) -> dict: - """ - Return meilisearch filter based on the requesting user's permissions. - """ - # Global staff can see anything, so no filters required. - if GlobalStaff().has_user(request.user): - return {} - - # Everyone else is limited to their org staff roles... - user_orgs = _get_user_orgs(request)[:MAX_ORGS_IN_FILTER] - - # ...or the N most recent courses and libraries they can access. - access_ids = get_access_ids_for_request(request, omit_orgs=user_orgs)[:MAX_ACCESS_IDS_IN_FILTER] - return { - "filter": f"org IN {user_orgs} OR access_id IN {access_ids}", - } - - -def _get_user_orgs(request: Request) -> list[str]: - """ - Get the org.short_names for the organizations that the requesting user has OrgStaffRole or OrgInstructorRole. - - Note: org-level roles have course_id=None to distinguish them from course-level roles. - """ - course_roles = get_course_roles(request.user) - return list(set( - role.org - for role in course_roles - if role.course_id is None and role.role in ['staff', 'instructor'] - )) @view_auth_classes(is_authenticated=True) @@ -78,25 +28,9 @@ def get(self, request): """ Give user details on how they can search studio content """ - if not settings.MEILISEARCH_ENABLED: + if not api.is_meilisearch_enabled(): raise NotFound("Meilisearch features are not enabled.") - client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) - index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME - # Create an API key that only allows the user to search content that they have permission to view: - expires_at = datetime.now(tz=timezone.utc) + timedelta(days=7) - search_rules = { - index_name: _get_meili_access_filter(request), - } - # Note: the following is just generating a JWT. It doesn't actually make an API call to Meilisearch. - restricted_api_key = client.generate_tenant_token( - api_key_uid=_get_meili_api_key_uid(), - search_rules=search_rules, - expires_at=expires_at, - ) + response_data = api.generate_user_token_for_studio_search(request) - return Response({ - "url": settings.MEILISEARCH_PUBLIC_URL, - "index_name": index_name, - "api_key": restricted_api_key, - }) + return Response(response_data)