From 463549f9711e66180189d15aeb441174f79b2902 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Tue, 27 Feb 2024 14:36:38 -0800 Subject: [PATCH 01/28] feat: management command to index content libraries using Meilisearch --- cms/envs/common.py | 11 ++ .../management/commands/reindex_libraries.py | 132 ++++++++++++++++++ .../djangoapps/content_libraries/search.py | 56 ++++++++ requirements/edx/base.txt | 14 ++ requirements/edx/development.txt | 18 ++- requirements/edx/doc.txt | 22 +++ requirements/edx/kernel.in | 1 + requirements/edx/testing.txt | 24 +++- 8 files changed, 271 insertions(+), 7 deletions(-) create mode 100644 openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py create mode 100644 openedx/core/djangoapps/content_libraries/search.py diff --git a/cms/envs/common.py b/cms/envs/common.py index 754e6e9755a5..98b28b45ab24 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -2869,3 +2869,14 @@ def _should_send_xblock_events(settings): REST_FRAMEWORK['DEFAULT_SCHEMA_CLASS'] = 'drf_spectacular.openapi.AutoSchema' BEAMER_PRODUCT_ID = "" + +################### Search ################### + +# To support multi-tenancy, you can prefix all indexes with a common key like "sandbox7-" +# and use a restricted tenant token in place of an API key, so that this Open edX instance +# can only use the index(es) that start with this prefix. 
+# See https://www.meilisearch.com/docs/learn/security/tenant_tokens +MEILISEARCH_INDEX_PREFIX = "" +# Set this to None to disable search functionality +MEILISEARCH_URL = "http://meilisearch" +MEILISEARCH_API_KEY = "devkey" diff --git a/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py b/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py new file mode 100644 index 000000000000..fd982665f793 --- /dev/null +++ b/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py @@ -0,0 +1,132 @@ +""" +Command to build or re-build the search index for content libraries. +""" +import logging +import time + +from django.conf import settings +from django.core.management import BaseCommand, CommandError +import meilisearch +from meilisearch.errors import MeilisearchError +from meilisearch.models.task import TaskInfo + +from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.content_libraries.search import searchable_doc_for_library_block +from openedx.core.djangoapps.content_libraries.models import ContentLibrary + + +log = logging.getLogger(__name__) + +LIBRARIES_INDEX_NAME = "content_libraries" + + +class Command(BaseCommand): + """ + Build or re-build the search index for content libraries. + """ + + def handle(self, *args, **options): + """ + Build a new search index + """ + + # Connect to Meilisearch + if not settings.MEILISEARCH_URL: + raise CommandError("MEILISEARCH_URL is not set - search functionality disabled.") + # TODO: put this into a helper lib? + client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) + try: + client.health() + except MeilisearchError as err: + self.stderr.write(err.message) # print this because 'raise...from...' 
doesn't print the details + raise CommandError("Unable to connect to Meilisearch") from err + + # Get the list of libraries + self.stdout.write("Counting libraries...") + lib_keys = [lib.library_key for lib in ContentLibrary.objects.select_related('org').only('org', 'slug')] + blocks_by_lib_key = {} + num_blocks = 0 + for lib_key in lib_keys: + blocks_by_lib_key[lib_key] = [] + for component in lib_api.get_library_components(lib_key): + blocks_by_lib_key[lib_key].append(lib_api.LibraryXBlockMetadata.from_component(lib_key, component)) + num_blocks += 1 + + self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.") + + # Check if the index exists already: + self.stdout.write("Checking index...") + index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME + temp_index_name = index_name + "_new" + try: + client.get_index(temp_index_name) + except MeilisearchError as err: + pass + else: + self.stdout.write("Index already exists. Deleting it...") + self._wait_for_meili_task(client, client.delete_index(temp_index_name)) + + self.stdout.write("Creating new index...") + self._wait_for_meili_task( + client, + client.create_index(temp_index_name, {'primaryKey': 'id'}) + ) + + self.stdout.write("Indexing documents...") + num_done = 0 + for lib_key in lib_keys: + self.stdout.write(f"{num_done}/{num_blocks}. 
Now indexing {lib_key}") + docs = [] + for metadata in blocks_by_lib_key[lib_key]: + doc = searchable_doc_for_library_block(metadata) + docs.append(doc) + # Add all the docs in this library at once (usually faster than adding one at a time): + self._wait_for_meili_task(client, client.index(temp_index_name).add_documents(docs)) + num_done += len(docs) + + new_index_created = client.get_index(temp_index_name).created_at + if not self._index_exists(index_name, client): + # We have to create the "target" index before we can successfully swap the new one into it: + self.stdout.write(f"Preparing to swap into index (first time)...") + self._wait_for_meili_task(client, client.create_index(index_name)) + self.stdout.write(f"Swapping index...") + client.swap_indexes([{'indexes': [temp_index_name, index_name]}]) + # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the status + # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103 + while True: + time.sleep(1) + if client.get_index(index_name).created_at != new_index_created: + self.stdout.write("Waiting for swap completion...") + else: + break + self.stdout.write("Deleting old index...") + self._wait_for_meili_task(client, client.delete_index(temp_index_name)) + + self.stdout.write(f"Done! 
{num_blocks} blocks indexed.") + + def _wait_for_meili_task(self, client: meilisearch.Client, info: TaskInfo): + """ + Simple helper method to wait for a Meilisearch task to complete + """ + current_status = client.get_task(info.task_uid) + while current_status.status in ("enqueued", "processing"): + self.stdout.write("...") + time.sleep(1) + current_status = client.get_task(info.task_uid) + if current_status.status != "succeeded": + self.stderr.write(f"Task has status: {current_status.status}") + self.stderr.write(str(current_status.error)) + try: + raise MeilisearchError(current_status.error['message']) + except (TypeError, KeyError): + raise MeilisearchError("Unknown error") + + def _index_exists(self, index_name: str, client: meilisearch.Client) -> bool: + """ + Check if an index exists + """ + try: + client.get_index(index_name) + except MeilisearchError as err: + return False + return True diff --git a/openedx/core/djangoapps/content_libraries/search.py b/openedx/core/djangoapps/content_libraries/search.py new file mode 100644 index 000000000000..e5931c5c4c34 --- /dev/null +++ b/openedx/core/djangoapps/content_libraries/search.py @@ -0,0 +1,56 @@ +import logging + +from django.utils.text import slugify + +from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.xblock import api as xblock_api + +log = logging.getLogger(__name__) + + +def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict: + """ + Generate a dictionary document suitable for ingestion into a search engine + like Meilisearch or Elasticsearch, so that the given library block can be + found using faceted search. 
+ """ + doc = {} + try: + block = xblock_api.load_block(metadata.usage_key, user=None) + block_data = block.index_dictionary() + # Will be something like: + # { + # 'content': {'display_name': '...', 'capa_content': '...'}, + # 'content_type': 'CAPA', + # 'problem_types': ['multiplechoiceresponse'] + # } + # Which we need to flatten: + if "content_type" in block_data: + del block_data["content_type"] # Redundant with our "type" field + if "content" in block_data and isinstance(block_data["content"], dict): + content = block_data["content"] + if "display_name" in content: + del content["display_name"] + del block_data["content"] + block_data.update(content) + # Now we have + # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] } + doc.update(block_data) + except Exception as err: + log.exception(f"Failed to get index_dictionary for {metadata.usage_key}: {err}") + # The data below must always override any values from index_dictionary: + doc.update({ + # A Meilisearch document identifier can be of type integer or string, + # only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_). + # So our usage keys with ":" characters cannot be used as primary keys. + "id": slugify(str(metadata.usage_key)) + "-" + str(hash(str(metadata.usage_key)) % 1_000), + "usage_key": str(metadata.usage_key), + "block_id": str(metadata.usage_key.block_id), + "display_name": metadata.display_name, + "type": metadata.usage_key.block_type, + # This is called contextKey not libKey so we can use the same keys with courses, and maybe search + # both courses and libraries together in the future? 
+ "context_key": str(metadata.usage_key.context_key), # same as lib_key + "org": str(metadata.usage_key.context_key.org), + }) + return doc diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 4fc911d74c43..2b0d648ec5ac 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -22,6 +22,8 @@ analytics-python==1.4.post1 # via -r requirements/edx/kernel.in aniso8601==9.0.1 # via edx-tincan-py35 +annotated-types==0.6.0 + # via pydantic appdirs==1.4.4 # via fs asgiref==3.7.2 @@ -87,6 +89,8 @@ botocore==1.34.45 # s3transfer bridgekeeper==0.9 # via -r requirements/edx/kernel.in +camel-converter[pydantic]==3.1.1 + # via meilisearch celery==5.3.6 # via # -c requirements/edx/../constraints.txt @@ -716,6 +720,8 @@ markupsafe==2.1.5 # xblock maxminddb==2.5.2 # via geoip2 +meilisearch==0.30.0 + # via -r requirements/edx/kernel.in mock==5.1.0 # via -r requirements/edx/paver.txt mongoengine==0.27.0 @@ -860,6 +866,10 @@ pycryptodomex==3.20.0 # edx-proctoring # lti-consumer-xblock # pyjwkest +pydantic==2.6.3 + # via camel-converter +pydantic-core==2.16.3 + # via pydantic pygments==2.17.2 # via # -r requirements/edx/bundled.in @@ -999,6 +1009,7 @@ requests==2.31.0 # edx-rest-api-client # geoip2 # mailsnake + # meilisearch # openai # optimizely-sdk # pyjwkest @@ -1141,12 +1152,15 @@ tqdm==4.66.2 typing-extensions==4.9.0 # via # -r requirements/edx/paver.txt + # annotated-types # asgiref # django-countries # drf-spectacular # edx-opaque-keys # jwcrypto # kombu + # pydantic + # pydantic-core # pylti1p3 # snowflake-connector-python tzdata==2024.1 diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt index 78a4bfb532ea..8dc79ee4fe03 100644 --- a/requirements/edx/development.txt +++ b/requirements/edx/development.txt @@ -51,6 +51,7 @@ aniso8601==9.0.1 # edx-tincan-py35 annotated-types==0.6.0 # via + # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # pydantic anyio==4.3.0 @@ -168,6 +169,11 @@ cachetools==5.3.2 # 
via # -r requirements/edx/testing.txt # tox +camel-converter[pydantic]==3.1.1 + # via + # -r requirements/edx/doc.txt + # -r requirements/edx/testing.txt + # meilisearch celery==5.3.6 # via # -c requirements/edx/../constraints.txt @@ -1187,6 +1193,10 @@ mccabe==0.7.0 # via # -r requirements/edx/testing.txt # pylint +meilisearch==0.30.0 + # via + # -r requirements/edx/doc.txt + # -r requirements/edx/testing.txt mistune==2.0.5 # via # -r requirements/edx/doc.txt @@ -1459,12 +1469,15 @@ pycryptodomex==3.20.0 # edx-proctoring # lti-consumer-xblock # pyjwkest -pydantic==2.6.1 +pydantic==2.6.3 # via + # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt + # camel-converter # fastapi -pydantic-core==2.16.2 +pydantic-core==2.16.3 # via + # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # pydantic pydata-sphinx-theme==0.14.4 @@ -1736,6 +1749,7 @@ requests==2.31.0 # edx-rest-api-client # geoip2 # mailsnake + # meilisearch # openai # optimizely-sdk # pact-python diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt index c06a83d0b185..790b7c26f51f 100644 --- a/requirements/edx/doc.txt +++ b/requirements/edx/doc.txt @@ -33,6 +33,10 @@ aniso8601==9.0.1 # via # -r requirements/edx/base.txt # edx-tincan-py35 +annotated-types==0.6.0 + # via + # -r requirements/edx/base.txt + # pydantic appdirs==1.4.4 # via # -r requirements/edx/base.txt @@ -114,6 +118,10 @@ botocore==1.34.45 # s3transfer bridgekeeper==0.9 # via -r requirements/edx/base.txt +camel-converter[pydantic]==3.1.1 + # via + # -r requirements/edx/base.txt + # meilisearch celery==5.3.6 # via # -c requirements/edx/../constraints.txt @@ -840,6 +848,8 @@ maxminddb==2.5.2 # via # -r requirements/edx/base.txt # geoip2 +meilisearch==0.30.0 + # via -r requirements/edx/base.txt mistune==2.0.5 # via sphinx-mdinclude mock==5.1.0 @@ -1016,6 +1026,14 @@ pycryptodomex==3.20.0 # edx-proctoring # lti-consumer-xblock # pyjwkest +pydantic==2.6.3 + # via + # -r requirements/edx/base.txt + # 
camel-converter +pydantic-core==2.16.3 + # via + # -r requirements/edx/base.txt + # pydantic pydata-sphinx-theme==0.14.4 # via sphinx-book-theme pygments==2.17.2 @@ -1178,6 +1196,7 @@ requests==2.31.0 # edx-rest-api-client # geoip2 # mailsnake + # meilisearch # openai # optimizely-sdk # pyjwkest @@ -1389,12 +1408,15 @@ tqdm==4.66.2 typing-extensions==4.9.0 # via # -r requirements/edx/base.txt + # annotated-types # asgiref # django-countries # drf-spectacular # edx-opaque-keys # jwcrypto # kombu + # pydantic + # pydantic-core # pydata-sphinx-theme # pylti1p3 # snowflake-connector-python diff --git a/requirements/edx/kernel.in b/requirements/edx/kernel.in index 1f2b7a6c5bfb..0c2f110c84b9 100644 --- a/requirements/edx/kernel.in +++ b/requirements/edx/kernel.in @@ -107,6 +107,7 @@ lxml # XML parser lti-consumer-xblock>=7.3.0 mako # Primary template language used for server-side page rendering Markdown # Convert text markup to HTML; used in capa problems, forums, and course wikis +meilisearch # Library to access Meilisearch search engine (will replace ElasticSearch) mongoengine # Object-document mapper for MongoDB, used in the LMS dashboard mysqlclient # Driver for the default production relational database nodeenv # Utility for managing Node.js environments; we use this for deployments and testing diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index 6fcc7f7adab3..b622680b27a6 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -30,7 +30,9 @@ aniso8601==9.0.1 # -r requirements/edx/base.txt # edx-tincan-py35 annotated-types==0.6.0 - # via pydantic + # via + # -r requirements/edx/base.txt + # pydantic anyio==4.3.0 # via starlette appdirs==1.4.4 @@ -118,6 +120,10 @@ bridgekeeper==0.9 # via -r requirements/edx/base.txt cachetools==5.3.2 # via tox +camel-converter[pydantic]==3.1.1 + # via + # -r requirements/edx/base.txt + # meilisearch celery==5.3.6 # via # -c requirements/edx/../constraints.txt @@ -899,6 +905,8 @@ 
maxminddb==2.5.2 # geoip2 mccabe==0.7.0 # via pylint +meilisearch==0.30.0 + # via -r requirements/edx/base.txt mock==5.1.0 # via -r requirements/edx/base.txt mongoengine==0.27.0 @@ -1092,10 +1100,15 @@ pycryptodomex==3.20.0 # edx-proctoring # lti-consumer-xblock # pyjwkest -pydantic==2.6.1 - # via fastapi -pydantic-core==2.16.2 - # via pydantic +pydantic==2.6.3 + # via + # -r requirements/edx/base.txt + # camel-converter + # fastapi +pydantic-core==2.16.3 + # via + # -r requirements/edx/base.txt + # pydantic pygments==2.17.2 # via # -r requirements/edx/base.txt @@ -1304,6 +1317,7 @@ requests==2.31.0 # edx-rest-api-client # geoip2 # mailsnake + # meilisearch # openai # optimizely-sdk # pact-python From 81daab2b60a31720d31a63d3132e9afe31fe9bba Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Wed, 28 Feb 2024 17:55:17 -0800 Subject: [PATCH 02/28] feat: include tags in library content index --- .../core/djangoapps/content_libraries/search.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content_libraries/search.py b/openedx/core/djangoapps/content_libraries/search.py index e5931c5c4c34..74624b4e7f94 100644 --- a/openedx/core/djangoapps/content_libraries/search.py +++ b/openedx/core/djangoapps/content_libraries/search.py @@ -1,8 +1,12 @@ +""" +Utilities related to searching content libraries +""" import logging from django.utils.text import slugify from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.xblock import api as xblock_api log = logging.getLogger(__name__) @@ -36,7 +40,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> # Now we have # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] } doc.update(block_data) - except Exception as err: + except Exception as err: # pylint: disable=broad-except log.exception(f"Failed to get 
index_dictionary for {metadata.usage_key}: {err}") # The data below must always override any values from index_dictionary: doc.update({ @@ -53,4 +57,15 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> "context_key": str(metadata.usage_key.context_key), # same as lib_key "org": str(metadata.usage_key.context_key.org), }) + # Add tags. Note that we could improve performance for indexing many components from the same library, + # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the + # tags for each component separately. + for obj_tag in tagging_api.get_object_tags(metadata.usage_key).all(): + key = f"tags:{obj_tag.name}" # Taxonomy name + if key not in doc: + doc[key] = [] + # Add the tag and all its parent tags, which are implied + for tag_value in obj_tag.get_lineage(): + if tag_value not in doc[key]: + doc[key].append(tag_value) return doc From ea01c7084968c1678f748f6b75df2e58ac19015d Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Wed, 28 Feb 2024 17:55:23 -0800 Subject: [PATCH 03/28] refactor: cleanup --- .../management/commands/reindex_libraries.py | 110 ++++++++++-------- 1 file changed, 61 insertions(+), 49 deletions(-) diff --git a/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py b/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py index fd982665f793..d8a8139d4f0e 100644 --- a/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py +++ b/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py @@ -20,26 +20,70 @@ LIBRARIES_INDEX_NAME = "content_libraries" -class Command(BaseCommand): +class MeiliCommandMixin: """ - Build or re-build the search index for content libraries. 
+ Mixin for Django management commands that interact with Meilisearch """ - def handle(self, *args, **options): + def get_meilisearch_client(self): """ - Build a new search index + Get the Meiliesearch client """ - + if hasattr(self, "_meili_client"): + return self._meili_client # Connect to Meilisearch if not settings.MEILISEARCH_URL: raise CommandError("MEILISEARCH_URL is not set - search functionality disabled.") - # TODO: put this into a helper lib? - client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) + + self._meili_client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) try: - client.health() + self._meili_client.health() except MeilisearchError as err: self.stderr.write(err.message) # print this because 'raise...from...' doesn't print the details raise CommandError("Unable to connect to Meilisearch") from err + return self._meili_client + + def wait_for_meili_task(self, info: TaskInfo): + """ + Simple helper method to wait for a Meilisearch task to complete + """ + client = self.get_meilisearch_client() + current_status = client.get_task(info.task_uid) + while current_status.status in ("enqueued", "processing"): + self.stdout.write("...") + time.sleep(1) + current_status = client.get_task(info.task_uid) + if current_status.status != "succeeded": + self.stderr.write(f"Task has status: {current_status.status}") + self.stderr.write(str(current_status.error)) + try: + err_reason = current_status.error['message'] + except (TypeError, KeyError): + err_reason = "Unknown error" + raise MeilisearchError(err_reason) + + def index_exists(self, index_name: str) -> bool: + """ + Check if an index exists + """ + client = self.get_meilisearch_client() + try: + client.get_index(index_name) + except MeilisearchError as err: + return False + return True + + +class Command(MeiliCommandMixin, BaseCommand): + """ + Build or re-build the search index for content libraries. 
+ """ + + def handle(self, *args, **options): + """ + Build a new search index + """ + client = self.get_meilisearch_client() # Get the list of libraries self.stdout.write("Counting libraries...") @@ -58,17 +102,12 @@ def handle(self, *args, **options): self.stdout.write("Checking index...") index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME temp_index_name = index_name + "_new" - try: - client.get_index(temp_index_name) - except MeilisearchError as err: - pass - else: + if self.index_exists(temp_index_name): self.stdout.write("Index already exists. Deleting it...") - self._wait_for_meili_task(client, client.delete_index(temp_index_name)) + self.wait_for_meili_task(client.delete_index(temp_index_name)) self.stdout.write("Creating new index...") - self._wait_for_meili_task( - client, + self.wait_for_meili_task( client.create_index(temp_index_name, {'primaryKey': 'id'}) ) @@ -81,15 +120,15 @@ def handle(self, *args, **options): doc = searchable_doc_for_library_block(metadata) docs.append(doc) # Add all the docs in this library at once (usually faster than adding one at a time): - self._wait_for_meili_task(client, client.index(temp_index_name).add_documents(docs)) + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) num_done += len(docs) new_index_created = client.get_index(temp_index_name).created_at - if not self._index_exists(index_name, client): + if not self.index_exists(index_name): # We have to create the "target" index before we can successfully swap the new one into it: - self.stdout.write(f"Preparing to swap into index (first time)...") - self._wait_for_meili_task(client, client.create_index(index_name)) - self.stdout.write(f"Swapping index...") + self.stdout.write("Preparing to swap into index (first time)...") + self.wait_for_meili_task(client.create_index(index_name)) + self.stdout.write("Swapping index...") client.swap_indexes([{'indexes': [temp_index_name, index_name]}]) # If we're using an API key that's 
restricted to certain index prefix(es), we won't be able to get the status # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103 @@ -100,33 +139,6 @@ def handle(self, *args, **options): else: break self.stdout.write("Deleting old index...") - self._wait_for_meili_task(client, client.delete_index(temp_index_name)) + self.wait_for_meili_task(client.delete_index(temp_index_name)) self.stdout.write(f"Done! {num_blocks} blocks indexed.") - - def _wait_for_meili_task(self, client: meilisearch.Client, info: TaskInfo): - """ - Simple helper method to wait for a Meilisearch task to complete - """ - current_status = client.get_task(info.task_uid) - while current_status.status in ("enqueued", "processing"): - self.stdout.write("...") - time.sleep(1) - current_status = client.get_task(info.task_uid) - if current_status.status != "succeeded": - self.stderr.write(f"Task has status: {current_status.status}") - self.stderr.write(str(current_status.error)) - try: - raise MeilisearchError(current_status.error['message']) - except (TypeError, KeyError): - raise MeilisearchError("Unknown error") - - def _index_exists(self, index_name: str, client: meilisearch.Client) -> bool: - """ - Check if an index exists - """ - try: - client.get_index(index_name) - except MeilisearchError as err: - return False - return True From 9f75b4c53c494eaf3e2bc53d364b55e3c48507c1 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 29 Feb 2024 13:25:01 -0800 Subject: [PATCH 04/28] feat: index courseware too --- cms/envs/common.py | 3 + .../djangoapps/content/search/__init__.py | 0 .../core/djangoapps/content/search/apps.py | 16 +++ .../djangoapps/content/search/documents.py | 126 ++++++++++++++++++ .../djangoapps/content/search/handlers.py | 0 .../content/search/management/__init__.py | 0 .../search/management/commands/__init__.py | 0 .../management/commands/meili_mixin.py} | 72 +++------- .../commands/reindex_courses_studio.py | 82 ++++++++++++ 
.../management/commands/reindex_libraries.py | 57 ++++++++ .../djangoapps/content_libraries/search.py | 71 ---------- 11 files changed, 299 insertions(+), 128 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/__init__.py create mode 100644 openedx/core/djangoapps/content/search/apps.py create mode 100644 openedx/core/djangoapps/content/search/documents.py create mode 100644 openedx/core/djangoapps/content/search/handlers.py create mode 100644 openedx/core/djangoapps/content/search/management/__init__.py create mode 100644 openedx/core/djangoapps/content/search/management/commands/__init__.py rename openedx/core/djangoapps/{content_libraries/management/commands/reindex_libraries.py => content/search/management/commands/meili_mixin.py} (57%) create mode 100644 openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py create mode 100644 openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py delete mode 100644 openedx/core/djangoapps/content_libraries/search.py diff --git a/cms/envs/common.py b/cms/envs/common.py index 98b28b45ab24..d721535785c6 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -1776,6 +1776,9 @@ 'openedx_tagging.core.tagging.apps.TaggingConfig', 'openedx.core.djangoapps.content_tagging', + # Search + 'openedx.core.djangoapps.content.search.apps.ContentSearchConfig', + 'openedx.features.course_duration_limits', 'openedx.features.content_type_gating', 'openedx.features.discounts', diff --git a/openedx/core/djangoapps/content/search/__init__.py b/openedx/core/djangoapps/content/search/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openedx/core/djangoapps/content/search/apps.py b/openedx/core/djangoapps/content/search/apps.py new file mode 100644 index 000000000000..13a83a11a6a6 --- /dev/null +++ b/openedx/core/djangoapps/content/search/apps.py @@ -0,0 +1,16 @@ +""" +Define the content search Django App. 
+""" + +from django.apps import AppConfig + + +class ContentSearchConfig(AppConfig): + """App config for the content search feature""" + + default_auto_field = "django.db.models.BigAutoField" + name = "openedx.core.djangoapps.content.search" + + def ready(self): + # Connect signal handlers + from . import handlers # pylint: disable=unused-import diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py new file mode 100644 index 000000000000..4bccd406f7aa --- /dev/null +++ b/openedx/core/djangoapps/content/search/documents.py @@ -0,0 +1,126 @@ +""" +Utilities related to searching content libraries +""" +import logging + +from django.utils.text import slugify +from opaque_keys.edx.keys import UsageKey + +from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.content_tagging import api as tagging_api +from openedx.core.djangoapps.xblock import api as xblock_api + +log = logging.getLogger(__name__) + + +def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: + """ + Meilisearch requires each document to have a primary key that's either an + integer or a string composed of alphanumeric characters (a-z A-Z 0-9), + hyphens (-) and underscores (_). Since our opaque keys don't meet this + requirement, we transform them to a similar slug ID string that does. + """ + # The slugified key _may_ not be unique so we append a hashed number too + return slugify(str(usage_key)) + "-" + str(hash(str(usage_key)) % 1_000) + + +def _fields_from_block(block) -> dict: + """ + Given an XBlock instance, call its index_dictionary() method to load any + data that it wants included in the search index. Format into a flat dict. + + Note: the format of index_dictionary() depends on the block type. 
The base + class implementation returns only: + {"content": {"display_name": "..."}, "content_type": "..."} + """ + try: + block_data = block.index_dictionary() + # Will be something like: + # { + # 'content': {'display_name': '...', 'capa_content': '...'}, + # 'content_type': 'CAPA', + # 'problem_types': ['multiplechoiceresponse'] + # } + # Which we need to flatten: + if "content_type" in block_data: + del block_data["content_type"] # Redundant with our "type" field that we add later + if "content" in block_data and isinstance(block_data["content"], dict): + content = block_data["content"] + if "display_name" in content: + del content["display_name"] + del block_data["content"] + block_data.update(content) + # Now we have something like: + # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] } + except Exception as err: # pylint: disable=broad-except + log.exception(f"Failed to process index_dictionary for {block.usage_key}: {err}") + block_data = {} + block_data.update({ + "id": _meili_id_from_opaque_key(block.usage_key), + "usage_key": str(block.usage_key), + "block_id": str(block.usage_key.block_id), + "display_name": block.display_name, # TODO: there is some function to get the fallback display_name + "type": block.scope_ids.block_type, + # This is called context_key so it's the same for courses and libraries + "context_key": str(block.usage_key.context_key), # same as lib_key + "org": str(block.usage_key.context_key.org), + }) + return block_data + + +def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict: + """ + Generate a dictionary document suitable for ingestion into a search engine + like Meilisearch or Elasticsearch, so that the given library block can be + found using faceted search. 
+ """ + doc = {} + try: + block = xblock_api.load_block(metadata.usage_key, user=None) + except Exception as err: # pylint: disable=broad-except + log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}") + # Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata' + doc.update({ + "id": _meili_id_from_opaque_key(metadata.usage_key), + "usage_key": str(metadata.usage_key), + "block_id": str(metadata.usage_key.block_id), + "display_name": metadata.display_name, + "type": metadata.usage_key.block_type, + "context_key": str(metadata.usage_key.context_key), + "org": str(metadata.usage_key.context_key.org), + }) + else: + doc.update(_fields_from_block(block)) + # Add tags. Note that we could improve performance for indexing many components from the same library, + # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the + # tags for each component separately. + for obj_tag in tagging_api.get_object_tags(metadata.usage_key).all(): + key = f"tags:{obj_tag.name}" # Taxonomy name + if key not in doc: + doc[key] = [] + # Add the tag and all its parent tags, which are implied + for tag_value in obj_tag.get_lineage(): + if tag_value not in doc[key]: + doc[key].append(tag_value) + return doc + + +def searchable_doc_for_course_block(block) -> dict: + """ + Generate a dictionary document suitable for ingestion into a search engine + like Meilisearch or Elasticsearch, so that the given library block can be + found using faceted search. + """ + doc = _fields_from_block(block) + # Add tags. Note that we could improve performance for indexing many components from the same library, + # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the + # tags for each component separately. 
+ for obj_tag in tagging_api.get_object_tags(block.usage_key).all(): + key = f"tags:{obj_tag.name}" # Taxonomy name + if key not in doc: + doc[key] = [] + # Add the tag and all its parent tags, which are implied + for tag_value in obj_tag.get_lineage(): + if tag_value not in doc[key]: + doc[key].append(tag_value) + return doc diff --git a/openedx/core/djangoapps/content/search/handlers.py b/openedx/core/djangoapps/content/search/handlers.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openedx/core/djangoapps/content/search/management/__init__.py b/openedx/core/djangoapps/content/search/management/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openedx/core/djangoapps/content/search/management/commands/__init__.py b/openedx/core/djangoapps/content/search/management/commands/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py similarity index 57% rename from openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py rename to openedx/core/djangoapps/content/search/management/commands/meili_mixin.py index d8a8139d4f0e..8fb85a598ef2 100644 --- a/openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py +++ b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py @@ -1,30 +1,20 @@ """ -Command to build or re-build the search index for content libraries. 
+Mixin for Django management commands that interact with Meilisearch """ -import logging +from contextlib import contextmanager import time from django.conf import settings -from django.core.management import BaseCommand, CommandError +from django.core.management import CommandError import meilisearch from meilisearch.errors import MeilisearchError from meilisearch.models.task import TaskInfo -from openedx.core.djangoapps.content_libraries import api as lib_api -from openedx.core.djangoapps.content_libraries.search import searchable_doc_for_library_block -from openedx.core.djangoapps.content_libraries.models import ContentLibrary - - -log = logging.getLogger(__name__) - -LIBRARIES_INDEX_NAME = "content_libraries" - class MeiliCommandMixin: """ Mixin for Django management commands that interact with Meilisearch """ - def get_meilisearch_client(self): """ Get the Meiliesearch client @@ -73,72 +63,40 @@ def index_exists(self, index_name: str) -> bool: return False return True - -class Command(MeiliCommandMixin, BaseCommand): - """ - Build or re-build the search index for content libraries. - """ - - def handle(self, *args, **options): + @contextmanager + def using_temp_index(self, target_index): """ - Build a new search index + Create a new temporary Meilisearch index, populate it, then swap it to + become the active index. 
""" client = self.get_meilisearch_client() - - # Get the list of libraries - self.stdout.write("Counting libraries...") - lib_keys = [lib.library_key for lib in ContentLibrary.objects.select_related('org').only('org', 'slug')] - blocks_by_lib_key = {} - num_blocks = 0 - for lib_key in lib_keys: - blocks_by_lib_key[lib_key] = [] - for component in lib_api.get_library_components(lib_key): - blocks_by_lib_key[lib_key].append(lib_api.LibraryXBlockMetadata.from_component(lib_key, component)) - num_blocks += 1 - - self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.") - - # Check if the index exists already: self.stdout.write("Checking index...") - index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME - temp_index_name = index_name + "_new" + temp_index_name = target_index + "_new" if self.index_exists(temp_index_name): - self.stdout.write("Index already exists. Deleting it...") + self.stdout.write("Temporary index already exists. Deleting it...") self.wait_for_meili_task(client.delete_index(temp_index_name)) self.stdout.write("Creating new index...") self.wait_for_meili_task( client.create_index(temp_index_name, {'primaryKey': 'id'}) ) + new_index_created = client.get_index(temp_index_name).created_at - self.stdout.write("Indexing documents...") - num_done = 0 - for lib_key in lib_keys: - self.stdout.write(f"{num_done}/{num_blocks}. 
Now indexing {lib_key}") - docs = [] - for metadata in blocks_by_lib_key[lib_key]: - doc = searchable_doc_for_library_block(metadata) - docs.append(doc) - # Add all the docs in this library at once (usually faster than adding one at a time): - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_done += len(docs) + yield temp_index_name - new_index_created = client.get_index(temp_index_name).created_at - if not self.index_exists(index_name): + if not self.index_exists(target_index): # We have to create the "target" index before we can successfully swap the new one into it: self.stdout.write("Preparing to swap into index (first time)...") - self.wait_for_meili_task(client.create_index(index_name)) + self.wait_for_meili_task(client.create_index(target_index)) self.stdout.write("Swapping index...") - client.swap_indexes([{'indexes': [temp_index_name, index_name]}]) + client.swap_indexes([{'indexes': [temp_index_name, target_index]}]) # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the status # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103 while True: time.sleep(1) - if client.get_index(index_name).created_at != new_index_created: + if client.get_index(target_index).created_at != new_index_created: self.stdout.write("Waiting for swap completion...") else: break self.stdout.write("Deleting old index...") self.wait_for_meili_task(client.delete_index(temp_index_name)) - - self.stdout.write(f"Done! 
{num_blocks} blocks indexed.") diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py new file mode 100644 index 000000000000..d05975257acd --- /dev/null +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py @@ -0,0 +1,82 @@ +""" +Command to build or re-build the search index for courses (in Studio, i.e. Draft +mode), in Meilisearch. + +See also cms/djangoapps/contentstore/management/commands/reindex_course.py which +indexes LMS (published) courses in ElasticSearch. +""" +import logging + +from django.conf import settings +from django.core.management import BaseCommand + +from openedx.core.djangoapps.content.search.documents import searchable_doc_for_course_block +from xmodule.modulestore import ModuleStoreEnum +from xmodule.modulestore.django import modulestore +from .meili_mixin import MeiliCommandMixin + + +log = logging.getLogger(__name__) + +STUDIO_COURSES_INDEX_NAME = "courseware_draft" + + +class Command(MeiliCommandMixin, BaseCommand): + """ + Build or re-build the search index for courses (in Studio, i.e. Draft mode) + """ + + def handle(self, *args, **options): + """ + Build a new search index + """ + client = self.get_meilisearch_client() + store = modulestore() + + # Get the list of libraries + self.stdout.write("Counting courses...") + with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): + all_courses = store.get_courses() + num_courses = len(all_courses) + self.stdout.write(f"Found {num_courses} courses.") + + index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_COURSES_INDEX_NAME + with self.using_temp_index(index_name) as temp_index_name: + self.stdout.write("Indexing documents...") + num_courses_done = 0 + num_blocks_done = 0 + for course in all_courses: + self.stdout.write(f"{num_courses_done}/{num_courses}. 
Now indexing {course.display_name} ({course.id})") + docs = [] + + def add_with_children(block): + """ Recursively index the given XBlock/component """ + doc = searchable_doc_for_course_block(block) + docs.append(doc) # pylint: disable=cell-var-from-loop + self.recurse_children(block, add_with_children) # pylint: disable=cell-var-from-loop + + self.recurse_children(course, add_with_children) + + # Add all the docs in this library at once (usually faster than adding one at a time): + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + num_courses_done += 1 + num_blocks_done += len(docs) + + self.stdout.write(f"Done! {num_blocks_done} blocks indexed across {num_courses_done} courses.") + + def recurse_children(self, block, fn): + """ + Recurse the children of an XBlock and call the given function for each + + The main purpose of this is just to wrap the loading of each child in + try...except. Otherwise block.get_children() would do what we need. + """ + if block.has_children: + for child_id in block.children: + try: + child = block.get_child(child_id) + except Exception as err: # pylint: disable=broad-except + log.exception(err) + self.stderr.write(f"Unable to load block {child_id}") + else: + fn(child) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py b/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py new file mode 100644 index 000000000000..61677eb13939 --- /dev/null +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py @@ -0,0 +1,57 @@ +""" +Command to build or re-build the search index for content libraries. 
+""" +import logging + +from django.conf import settings +from django.core.management import BaseCommand + +from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.content.search.documents import searchable_doc_for_library_block +from .meili_mixin import MeiliCommandMixin + + +log = logging.getLogger(__name__) + +LIBRARIES_INDEX_NAME = "content_libraries" + + +class Command(MeiliCommandMixin, BaseCommand): + """ + Build or re-build the search index for content libraries. + """ + + def handle(self, *args, **options): + """ + Build a new search index + """ + client = self.get_meilisearch_client() + + # Get the list of libraries + self.stdout.write("Counting libraries...") + lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + blocks_by_lib_key = {} + num_blocks = 0 + for lib_key in lib_keys: + blocks_by_lib_key[lib_key] = [] + for component in lib_api.get_library_components(lib_key): + blocks_by_lib_key[lib_key].append(lib_api.LibraryXBlockMetadata.from_component(lib_key, component)) + num_blocks += 1 + + self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.") + + index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME + with self.using_temp_index(index_name) as temp_index_name: + self.stdout.write("Indexing documents...") + num_done = 0 + for lib_key in lib_keys: + self.stdout.write(f"{num_done}/{num_blocks}. Now indexing {lib_key}") + docs = [] + for metadata in blocks_by_lib_key[lib_key]: + doc = searchable_doc_for_library_block(metadata) + docs.append(doc) + # Add all the docs in this library at once (usually faster than adding one at a time): + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + num_done += len(docs) + + self.stdout.write(f"Done! 
{num_done} blocks indexed.") diff --git a/openedx/core/djangoapps/content_libraries/search.py b/openedx/core/djangoapps/content_libraries/search.py deleted file mode 100644 index 74624b4e7f94..000000000000 --- a/openedx/core/djangoapps/content_libraries/search.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Utilities related to searching content libraries -""" -import logging - -from django.utils.text import slugify - -from openedx.core.djangoapps.content_libraries import api as lib_api -from openedx.core.djangoapps.content_tagging import api as tagging_api -from openedx.core.djangoapps.xblock import api as xblock_api - -log = logging.getLogger(__name__) - - -def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict: - """ - Generate a dictionary document suitable for ingestion into a search engine - like Meilisearch or Elasticsearch, so that the given library block can be - found using faceted search. - """ - doc = {} - try: - block = xblock_api.load_block(metadata.usage_key, user=None) - block_data = block.index_dictionary() - # Will be something like: - # { - # 'content': {'display_name': '...', 'capa_content': '...'}, - # 'content_type': 'CAPA', - # 'problem_types': ['multiplechoiceresponse'] - # } - # Which we need to flatten: - if "content_type" in block_data: - del block_data["content_type"] # Redundant with our "type" field - if "content" in block_data and isinstance(block_data["content"], dict): - content = block_data["content"] - if "display_name" in content: - del content["display_name"] - del block_data["content"] - block_data.update(content) - # Now we have - # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] } - doc.update(block_data) - except Exception as err: # pylint: disable=broad-except - log.exception(f"Failed to get index_dictionary for {metadata.usage_key}: {err}") - # The data below must always override any values from index_dictionary: - doc.update({ - # A Meilisearch document identifier can be of type 
integer or string, - # only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_). - # So our usage keys with ":" characters cannot be used as primary keys. - "id": slugify(str(metadata.usage_key)) + "-" + str(hash(str(metadata.usage_key)) % 1_000), - "usage_key": str(metadata.usage_key), - "block_id": str(metadata.usage_key.block_id), - "display_name": metadata.display_name, - "type": metadata.usage_key.block_type, - # This is called contextKey not libKey so we can use the same keys with courses, and maybe search - # both courses and libraries together in the future? - "context_key": str(metadata.usage_key.context_key), # same as lib_key - "org": str(metadata.usage_key.context_key.org), - }) - # Add tags. Note that we could improve performance for indexing many components from the same library, - # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the - # tags for each component separately. - for obj_tag in tagging_api.get_object_tags(metadata.usage_key).all(): - key = f"tags:{obj_tag.name}" # Taxonomy name - if key not in doc: - doc[key] = [] - # Add the tag and all its parent tags, which are implied - for tag_value in obj_tag.get_lineage(): - if tag_value not in doc[key]: - doc[key].append(tag_value) - return doc From b9eddb8e654e280d03e3545f0ad70f48f5702049 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 29 Feb 2024 14:12:47 -0800 Subject: [PATCH 05/28] refactor: combine indexes into one --- .../djangoapps/content/search/documents.py | 54 +++++++++++----- .../management/commands/reindex_libraries.py | 57 ----------------- ...ex_courses_studio.py => reindex_studio.py} | 63 +++++++++++++++---- 3 files changed, 91 insertions(+), 83 deletions(-) delete mode 100644 openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py rename openedx/core/djangoapps/content/search/management/commands/{reindex_courses_studio.py => reindex_studio.py} (50%) diff 
--git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 4bccd406f7aa..af53d1b43613 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -13,6 +13,30 @@ log = logging.getLogger(__name__) +class Fields: + """ + Fields that exist on the documents in our search index + """ + # Meilisearch primary key. String. + id = "id" + usage_key = "usage_key" + block_id = "block_id" + display_name = "display_name" + block_type = "block_type" + context_key = "context_key" + org = "org" + # type: "course_block", "library_block" + type = "type" + + +class DocType: + """ + Values for the 'type' field on each doc in the search index + """ + course_block = "course_block" + library_block = "library_block" + + def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: """ Meilisearch requires each document to have a primary key that's either an @@ -56,14 +80,14 @@ class implementation returns only: log.exception(f"Failed to process index_dictionary for {block.usage_key}: {err}") block_data = {} block_data.update({ - "id": _meili_id_from_opaque_key(block.usage_key), - "usage_key": str(block.usage_key), - "block_id": str(block.usage_key.block_id), - "display_name": block.display_name, # TODO: there is some function to get the fallback display_name - "type": block.scope_ids.block_type, + Fields.id: _meili_id_from_opaque_key(block.usage_key), + Fields.usage_key: str(block.usage_key), + Fields.block_id: str(block.usage_key.block_id), + Fields.display_name: block.display_name, # TODO: there is some function to get the fallback display_name + Fields.block_type: block.scope_ids.block_type, # This is called context_key so it's the same for courses and libraries - "context_key": str(block.usage_key.context_key), # same as lib_key - "org": str(block.usage_key.context_key.org), + Fields.context_key: str(block.usage_key.context_key), # same as lib_key + Fields.org: 
str(block.usage_key.context_key.org), }) return block_data @@ -81,16 +105,17 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}") # Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata' doc.update({ - "id": _meili_id_from_opaque_key(metadata.usage_key), - "usage_key": str(metadata.usage_key), - "block_id": str(metadata.usage_key.block_id), - "display_name": metadata.display_name, - "type": metadata.usage_key.block_type, - "context_key": str(metadata.usage_key.context_key), - "org": str(metadata.usage_key.context_key.org), + Fields.id: _meili_id_from_opaque_key(metadata.usage_key), + Fields.usage_key: str(metadata.usage_key), + Fields.block_id: str(metadata.usage_key.block_id), + Fields.display_name: metadata.display_name, + Fields.type: metadata.usage_key.block_type, + Fields.context_key: str(metadata.usage_key.context_key), + Fields.org: str(metadata.usage_key.context_key.org), }) else: doc.update(_fields_from_block(block)) + doc[Fields.type] = DocType.library_block # Add tags. Note that we could improve performance for indexing many components from the same library, # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the # tags for each component separately. @@ -112,6 +137,7 @@ def searchable_doc_for_course_block(block) -> dict: found using faceted search. """ doc = _fields_from_block(block) + doc[Fields.type] = DocType.course_block # Add tags. Note that we could improve performance for indexing many components from the same library, # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the # tags for each component separately. 
diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py b/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py deleted file mode 100644 index 61677eb13939..000000000000 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_libraries.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Command to build or re-build the search index for content libraries. -""" -import logging - -from django.conf import settings -from django.core.management import BaseCommand - -from openedx.core.djangoapps.content_libraries import api as lib_api -from openedx.core.djangoapps.content.search.documents import searchable_doc_for_library_block -from .meili_mixin import MeiliCommandMixin - - -log = logging.getLogger(__name__) - -LIBRARIES_INDEX_NAME = "content_libraries" - - -class Command(MeiliCommandMixin, BaseCommand): - """ - Build or re-build the search index for content libraries. - """ - - def handle(self, *args, **options): - """ - Build a new search index - """ - client = self.get_meilisearch_client() - - # Get the list of libraries - self.stdout.write("Counting libraries...") - lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] - blocks_by_lib_key = {} - num_blocks = 0 - for lib_key in lib_keys: - blocks_by_lib_key[lib_key] = [] - for component in lib_api.get_library_components(lib_key): - blocks_by_lib_key[lib_key].append(lib_api.LibraryXBlockMetadata.from_component(lib_key, component)) - num_blocks += 1 - - self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.") - - index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME - with self.using_temp_index(index_name) as temp_index_name: - self.stdout.write("Indexing documents...") - num_done = 0 - for lib_key in lib_keys: - self.stdout.write(f"{num_done}/{num_blocks}. 
Now indexing {lib_key}") - docs = [] - for metadata in blocks_by_lib_key[lib_key]: - doc = searchable_doc_for_library_block(metadata) - docs.append(doc) - # Add all the docs in this library at once (usually faster than adding one at a time): - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_done += len(docs) - - self.stdout.write(f"Done! {num_done} blocks indexed.") diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py similarity index 50% rename from openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py rename to openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index d05975257acd..4af2aa5540e9 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_courses_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -10,7 +10,12 @@ from django.conf import settings from django.core.management import BaseCommand -from openedx.core.djangoapps.content.search.documents import searchable_doc_for_course_block +from openedx.core.djangoapps.content_libraries import api as lib_api +from openedx.core.djangoapps.content.search.documents import ( + Fields, + searchable_doc_for_course_block, + searchable_doc_for_library_block, +) from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore from .meili_mixin import MeiliCommandMixin @@ -18,7 +23,7 @@ log = logging.getLogger(__name__) -STUDIO_COURSES_INDEX_NAME = "courseware_draft" +STUDIO_INDEX_NAME = "studio_content" class Command(MeiliCommandMixin, BaseCommand): @@ -28,25 +33,59 @@ class Command(MeiliCommandMixin, BaseCommand): def handle(self, *args, **options): """ - Build a new search index + Build a new search index for Studio, containing content from courses and libraries """ client = self.get_meilisearch_client() 
store = modulestore() - # Get the list of libraries + # Get the lists of libraries + self.stdout.write("Counting libraries...") + lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + num_libraries = len(lib_keys) + + # Get the list of courses self.stdout.write("Counting courses...") with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): all_courses = store.get_courses() num_courses = len(all_courses) - self.stdout.write(f"Found {num_courses} courses.") + num_contexts = num_courses + num_libraries + num_contexts_done = 0 # How many courses/libraries we've done + num_blocks_done = 0 - index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_COURSES_INDEX_NAME + self.stdout.write(f"Found {num_courses} courses and {num_libraries} libraries.") + index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME with self.using_temp_index(index_name) as temp_index_name: - self.stdout.write("Indexing documents...") - num_courses_done = 0 - num_blocks_done = 0 + ############## Configure the index ############## + # usage_key is not the primary key but nevertheless must be unique: + client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) + client.index(temp_index_name).update_filterable_attributes([ + Fields.block_type, + Fields.context_key, + Fields.org, + Fields.type, + # TODO: tags + ]) + + ############## Libraries ############## + self.stdout.write("Indexing libraries...") + for lib_key in lib_keys: + self.stdout.write(f"{num_contexts_done + 1}/{num_contexts}. 
Now indexing library {lib_key}") + docs = [] + for component in lib_api.get_library_components(lib_key): + metadata = lib_api.LibraryXBlockMetadata.from_component(lib_key, component) + doc = searchable_doc_for_library_block(metadata) + docs.append(doc) + num_blocks_done += 1 + # Add all the docs in this library at once (usually faster than adding one at a time): + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + num_contexts_done += 1 + + ############## Courses ############## + self.stdout.write("Indexing courses...") for course in all_courses: - self.stdout.write(f"{num_courses_done}/{num_courses}. Now indexing {course.display_name} ({course.id})") + self.stdout.write( + f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" + ) docs = [] def add_with_children(block): @@ -59,10 +98,10 @@ def add_with_children(block): # Add all the docs in this library at once (usually faster than adding one at a time): self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_courses_done += 1 + num_contexts_done += 1 num_blocks_done += len(docs) - self.stdout.write(f"Done! {num_blocks_done} blocks indexed across {num_courses_done} courses.") + self.stdout.write(f"Done! 
{num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.") def recurse_children(self, block, fn): """ From adbf3576f3a59e51598119236253c3acf871070c Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 29 Feb 2024 14:51:34 -0800 Subject: [PATCH 06/28] feat: index tags (in a way compatible with InstantSearch) --- .../djangoapps/content/search/documents.py | 84 ++++++++++++++----- .../management/commands/reindex_studio.py | 2 +- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index af53d1b43613..12034428fdf5 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -1,10 +1,11 @@ """ Utilities related to searching content libraries """ +from __future__ import annotations import logging from django.utils.text import slugify -from opaque_keys.edx.keys import UsageKey +from opaque_keys.edx.keys import UsageKey, LearningContextKey from openedx.core.djangoapps.content_libraries import api as lib_api from openedx.core.djangoapps.content_tagging import api as tagging_api @@ -27,6 +28,16 @@ class Fields: org = "org" # type: "course_block", "library_block" type = "type" + # tags (dictionary) + # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ + # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ + # For details on the format of the hierarchical tag data. + tags = "tags" + tags_taxonomy = "taxonomy" + tags_level0 = "level0" + tags_level1 = "level1" + tags_level2 = "level2" + tags_level3 = "level3" class DocType: @@ -92,6 +103,53 @@ class implementation returns only: return block_data +def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: + """ + Given an XBlock, course, library, etc., get the tag data for its index doc. 
+ + See the comments above on "Field.tags" for an explanation of the format. + + e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver" this + would return: + { + "tags": { + "taxonomy": ["Location", "Difficulty"], + "level0": ["Location\tNorth America", "Difficulty\tHard"], + "level1": ["Location\tNorth America\tCanada"], + "level2": ["Location\tNorth America\tCanada\tVancouver"], + } + } + """ + # Note that we could improve performance for indexing many components from the same library/course, + # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the + # tags for each component separately. + all_tags = tagging_api.get_object_tags(object_id).all() + if not all_tags: + return {} + result = { + Fields.tags_taxonomy: [], + Fields.tags_level0: [], + # ... other levels added as needed + } + for obj_tag in all_tags: + # Add the taxonomy name: + if obj_tag.name not in result[Fields.tags_taxonomy]: + result[Fields.tags_taxonomy].append(obj_tag.name) + # Taxonomy name plus each level of tags, in a list: + parts = [obj_tag.name] + obj_tag.get_lineage() # e.g. 
["Location", "North America", "Canada", "Vancouver"] + parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator + for level in range(0, 4): + new_value = " > ".join(parts[0:level + 2]) + if not f"level{level}" in result: + result[f"level{level}"] = [new_value] + elif new_value not in result[f"level{level}"]: + result[f"level{level}"].append(new_value) + if len(parts) == level + 2: + break + + return {Fields.tags: result} + + def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict: """ Generate a dictionary document suitable for ingestion into a search engine @@ -115,18 +173,8 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> }) else: doc.update(_fields_from_block(block)) + doc.update(_tags_for_content_object(metadata.usage_key)) doc[Fields.type] = DocType.library_block - # Add tags. Note that we could improve performance for indexing many components from the same library, - # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the - # tags for each component separately. - for obj_tag in tagging_api.get_object_tags(metadata.usage_key).all(): - key = f"tags:{obj_tag.name}" # Taxonomy name - if key not in doc: - doc[key] = [] - # Add the tag and all its parent tags, which are implied - for tag_value in obj_tag.get_lineage(): - if tag_value not in doc[key]: - doc[key].append(tag_value) return doc @@ -137,16 +185,6 @@ def searchable_doc_for_course_block(block) -> dict: found using faceted search. """ doc = _fields_from_block(block) + doc.update(_tags_for_content_object(block.usage_key)) doc[Fields.type] = DocType.course_block - # Add tags. Note that we could improve performance for indexing many components from the same library, - # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the - # tags for each component separately. 
- for obj_tag in tagging_api.get_object_tags(block.usage_key).all(): - key = f"tags:{obj_tag.name}" # Taxonomy name - if key not in doc: - doc[key] = [] - # Add the tag and all its parent tags, which are implied - for tag_value in obj_tag.get_lineage(): - if tag_value not in doc[key]: - doc[key].append(tag_value) return doc diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 4af2aa5540e9..7bddccd7ae97 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -62,8 +62,8 @@ def handle(self, *args, **options): Fields.block_type, Fields.context_key, Fields.org, + Fields.tags, Fields.type, - # TODO: tags ]) ############## Libraries ############## From 47dc3c1b8c00607e2fde3a361cec6a8c2e73a4ec Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Tue, 5 Mar 2024 11:00:44 -0800 Subject: [PATCH 07/28] feat: REST API to retrieve a user-specific API key --- cms/envs/common.py | 4 +- cms/urls.py | 4 ++ .../djangoapps/content/search/documents.py | 1 + .../management/commands/reindex_studio.py | 3 +- .../core/djangoapps/content/search/urls.py | 11 +++ .../core/djangoapps/content/search/views.py | 70 +++++++++++++++++++ 6 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/urls.py create mode 100644 openedx/core/djangoapps/content/search/views.py diff --git a/cms/envs/common.py b/cms/envs/common.py index d721535785c6..a85c85787f8a 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -2880,6 +2880,8 @@ def _should_send_xblock_events(settings): # can only use the index(es) that start with this prefix. 
# See https://www.meilisearch.com/docs/learn/security/tenant_tokens MEILISEARCH_INDEX_PREFIX = "" -# Set this to None to disable search functionality +# Meilisearch URL that the python backend can use. Often points to another docker container or k8s service. MEILISEARCH_URL = "http://meilisearch" +# URL that browsers (end users) can user to reach Meilisearch. Should be HTTPS in production. +MEILISEARCH_PUBLIC_URL = "http://meilisearch.example.com" MEILISEARCH_API_KEY = "devkey" diff --git a/cms/urls.py b/cms/urls.py index 8c3588b04960..9828e9d0fbf0 100644 --- a/cms/urls.py +++ b/cms/urls.py @@ -307,6 +307,10 @@ ), ) +urlpatterns.append( + path('', include(('openedx.core.djangoapps.content.search.urls', 'content_search'), namespace='content_search')), +) + # display error page templates, for testing purposes urlpatterns += [ path('404', handler404), diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 12034428fdf5..7cb61e5dca06 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -12,6 +12,7 @@ from openedx.core.djangoapps.xblock import api as xblock_api log = logging.getLogger(__name__) +STUDIO_INDEX_NAME = "studio_content" class Fields: diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 7bddccd7ae97..3b46370ebf21 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -15,6 +15,7 @@ Fields, searchable_doc_for_course_block, searchable_doc_for_library_block, + STUDIO_INDEX_NAME, ) from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore @@ -23,8 +24,6 @@ log = logging.getLogger(__name__) -STUDIO_INDEX_NAME = "studio_content" - class 
Command(MeiliCommandMixin, BaseCommand): """ diff --git a/openedx/core/djangoapps/content/search/urls.py b/openedx/core/djangoapps/content/search/urls.py new file mode 100644 index 000000000000..c69be33d5bb0 --- /dev/null +++ b/openedx/core/djangoapps/content/search/urls.py @@ -0,0 +1,11 @@ +""" +URLs for content search +""" +from django.urls import path + +from .views import StudioSearchView + + +urlpatterns = [ + path('api/content_search/v2/studio/', StudioSearchView.as_view(), name='studio_content_search') +] diff --git a/openedx/core/djangoapps/content/search/views.py b/openedx/core/djangoapps/content/search/views.py new file mode 100644 index 000000000000..807e716ccff7 --- /dev/null +++ b/openedx/core/djangoapps/content/search/views.py @@ -0,0 +1,70 @@ +""" +REST API for content search +""" +from datetime import datetime, timedelta, timezone +import logging + +from django.conf import settings +from django.contrib.auth import get_user_model +from django.contrib.auth.models import AnonymousUser +from opaque_keys.edx.keys import CourseKey +import meilisearch +from rest_framework import serializers +from rest_framework.exceptions import NotAuthenticated, NotFound, PermissionDenied +from rest_framework.response import Response +from rest_framework.views import APIView + +from lms.djangoapps.courseware.access import has_access +from lms.djangoapps.courseware.masquerade import setup_masquerade +from openedx.core import types +from openedx.core.lib.api.view_utils import validate_course_key, view_auth_classes +from openedx.core.djangoapps.content.search.documents import STUDIO_INDEX_NAME + +User = get_user_model() +log = logging.getLogger(__name__) + + +def _get_meili_api_key_uid(): + """ + Helper method to get the UID of the API key we're using for Meilisearch + """ + if not hasattr(_get_meili_api_key_uid, "uid"): + client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) + _get_meili_api_key_uid.uid = 
client.get_key(settings.MEILISEARCH_API_KEY).uid + return _get_meili_api_key_uid.uid + + +@view_auth_classes(is_authenticated=True) +class StudioSearchView(APIView): + """ + Give user details on how they can search studio content + """ + + def get(self, request): + """ + Give user details on how they can search studio content + """ + client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) + index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME + + # Create an API key that only allows the user to search content that they have permission to view: + expires_at = datetime.now(tz=timezone.utc) + timedelta(days=7) + search_rules = { + index_name: { + # TODO: Apply filters here based on the user's permissions, so they can only search for content + # that they have permission to view. Example: + # 'filter': 'org = BradenX' + } + } + # Note: the following is just generating a JWT. It doesn't actually make an API call to Meilisearch. + restricted_api_key = client.generate_tenant_token( + api_key_uid=_get_meili_api_key_uid(), + search_rules=search_rules, + expires_at=expires_at, + ) + + return Response({ + "url": settings.MEILISEARCH_PUBLIC_URL, + "index_name": index_name, + "api_key": restricted_api_key, + }) From 6cf9e04f10f1d1cad810529dab3e48a45c2e42c7 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Tue, 5 Mar 2024 14:38:19 -0800 Subject: [PATCH 08/28] fix: better handling index existence check, better comments --- .../content/search/management/commands/meili_mixin.py | 5 ++++- .../search/management/commands/reindex_studio.py | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py index 8fb85a598ef2..798dfd8a7e47 100644 --- a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py +++ 
b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py @@ -60,7 +60,10 @@ def index_exists(self, index_name: str) -> bool: try: client.get_index(index_name) except MeilisearchError as err: - return False + if err.code == "index_not_found": + return False + else: + raise err return True @contextmanager diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 3b46370ebf21..fe0981e24ca4 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -47,16 +47,19 @@ def handle(self, *args, **options): with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): all_courses = store.get_courses() num_courses = len(all_courses) + + # Some counters so we can track our progress as indexing progresses: num_contexts = num_courses + num_libraries - num_contexts_done = 0 # How many courses/libraries we've done - num_blocks_done = 0 + num_contexts_done = 0 # How many courses/libraries we've indexed + num_blocks_done = 0 # How many individual components/XBlocks we've indexed self.stdout.write(f"Found {num_courses} courses and {num_libraries} libraries.") index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME with self.using_temp_index(index_name) as temp_index_name: ############## Configure the index ############## - # usage_key is not the primary key but nevertheless must be unique: + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) + # Mark which attributes can be used for filtering/faceted search: client.index(temp_index_name).update_filterable_attributes([ Fields.block_type, Fields.context_key, @@ -95,7 +98,7 @@ def add_with_children(block): self.recurse_children(course, 
add_with_children) - # Add all the docs in this library at once (usually faster than adding one at a time): + # Add all the docs in this course at once (usually faster than adding one at a time): self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) num_contexts_done += 1 num_blocks_done += len(docs) From 8637e16328865be25d35461dc77725305a483dba Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 14 Mar 2024 19:12:33 -0700 Subject: [PATCH 09/28] feat: disable studio search (w/ Meilisearch) by default --- cms/envs/common.py | 13 ++++++++----- .../search/management/commands/meili_mixin.py | 4 ++-- openedx/core/djangoapps/content/search/views.py | 2 ++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cms/envs/common.py b/cms/envs/common.py index a85c85787f8a..b8c892145d34 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -2873,15 +2873,18 @@ def _should_send_xblock_events(settings): BEAMER_PRODUCT_ID = "" -################### Search ################### +################### Studio Search (alpha, using Meilisearch) ################### +# Enable Studio search features (powered by Meilisearch) (beta, off by default) +MEILISEARCH_ENABLED = False +# Meilisearch URL that the python backend can use. Often points to another docker container or k8s service. +# Set this to None (default) to disable +MEILISEARCH_URL = "http://meilisearch" +# URL that browsers (end users) can user to reach Meilisearch. Should be HTTPS in production. +MEILISEARCH_PUBLIC_URL = "http://meilisearch.example.com" # To support multi-tenancy, you can prefix all indexes with a common key like "sandbox7-" # and use a restricted tenant token in place of an API key, so that this Open edX instance # can only use the index(es) that start with this prefix. # See https://www.meilisearch.com/docs/learn/security/tenant_tokens MEILISEARCH_INDEX_PREFIX = "" -# Meilisearch URL that the python backend can use. Often points to another docker container or k8s service. 
-MEILISEARCH_URL = "http://meilisearch" -# URL that browsers (end users) can user to reach Meilisearch. Should be HTTPS in production. -MEILISEARCH_PUBLIC_URL = "http://meilisearch.example.com" MEILISEARCH_API_KEY = "devkey" diff --git a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py index 798dfd8a7e47..18c7681879cb 100644 --- a/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py +++ b/openedx/core/djangoapps/content/search/management/commands/meili_mixin.py @@ -22,8 +22,8 @@ def get_meilisearch_client(self): if hasattr(self, "_meili_client"): return self._meili_client # Connect to Meilisearch - if not settings.MEILISEARCH_URL: - raise CommandError("MEILISEARCH_URL is not set - search functionality disabled.") + if not settings.MEILISEARCH_ENABLED: + raise CommandError("MEILISEARCH_ENABLED is not set - search functionality disabled.") self._meili_client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) try: diff --git a/openedx/core/djangoapps/content/search/views.py b/openedx/core/djangoapps/content/search/views.py index 807e716ccff7..76e5eb91781e 100644 --- a/openedx/core/djangoapps/content/search/views.py +++ b/openedx/core/djangoapps/content/search/views.py @@ -44,6 +44,8 @@ def get(self, request): """ Give user details on how they can search studio content """ + if not settings.MEILISEARCH_ENABLED: + raise NotFound("Meilisearch features are not enabled.") client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME From 64319488c4e587a62b1705c0d0144bd70aaf31a5 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 14 Mar 2024 19:12:44 -0700 Subject: [PATCH 10/28] feat: limit search endpoint usage to global staff for now --- openedx/core/djangoapps/content/search/views.py | 10 ++++++---- 1 file changed, 6 insertions(+), 
4 deletions(-) diff --git a/openedx/core/djangoapps/content/search/views.py b/openedx/core/djangoapps/content/search/views.py index 76e5eb91781e..4a19a023ad63 100644 --- a/openedx/core/djangoapps/content/search/views.py +++ b/openedx/core/djangoapps/content/search/views.py @@ -6,14 +6,12 @@ from django.conf import settings from django.contrib.auth import get_user_model -from django.contrib.auth.models import AnonymousUser -from opaque_keys.edx.keys import CourseKey import meilisearch -from rest_framework import serializers -from rest_framework.exceptions import NotAuthenticated, NotFound, PermissionDenied +from rest_framework.exceptions import NotFound, PermissionDenied from rest_framework.response import Response from rest_framework.views import APIView +from common.djangoapps.student.roles import GlobalStaff from lms.djangoapps.courseware.access import has_access from lms.djangoapps.courseware.masquerade import setup_masquerade from openedx.core import types @@ -46,6 +44,10 @@ def get(self, request): """ if not settings.MEILISEARCH_ENABLED: raise NotFound("Meilisearch features are not enabled.") + if not GlobalStaff().has_user(request.user): + # Until we enforce permissions properly (see below), this endpoint is restricted to global staff, + # because it lets you search data from any course/library. 
+ raise PermissionDenied("For the moment, use of this search preview is restricted to global staff.") client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY) index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME From 9c058062c3350fd792a4de999ea133105fd389d0 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 11:13:04 -0700 Subject: [PATCH 11/28] chore: fix quality issues --- cms/envs/common.py | 1 - openedx/core/djangoapps/content/search/documents.py | 5 ++--- .../content/search/management/commands/reindex_studio.py | 2 +- openedx/core/djangoapps/content/search/views.py | 5 +---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cms/envs/common.py b/cms/envs/common.py index b8c892145d34..d38b156a0691 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -2878,7 +2878,6 @@ def _should_send_xblock_events(settings): # Enable Studio search features (powered by Meilisearch) (beta, off by default) MEILISEARCH_ENABLED = False # Meilisearch URL that the python backend can use. Often points to another docker container or k8s service. -# Set this to None (default) to disable MEILISEARCH_URL = "http://meilisearch" # URL that browsers (end users) can user to reach Meilisearch. Should be HTTPS in production. 
MEILISEARCH_PUBLIC_URL = "http://meilisearch.example.com" diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 7cb61e5dca06..d09892981c96 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -27,8 +27,7 @@ class Fields: block_type = "block_type" context_key = "context_key" org = "org" - # type: "course_block", "library_block" - type = "type" + type = "type" # "course_block" or "library_block" # tags (dictionary) # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ @@ -141,7 +140,7 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator for level in range(0, 4): new_value = " > ".join(parts[0:level + 2]) - if not f"level{level}" in result: + if f"level{level}" not in result: result[f"level{level}"] = [new_value] elif new_value not in result[f"level{level}"]: result[f"level{level}"].append(new_value) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index fe0981e24ca4..e672fcc60203 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -41,7 +41,7 @@ def handle(self, *args, **options): self.stdout.write("Counting libraries...") lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] num_libraries = len(lib_keys) - + # Get the list of courses self.stdout.write("Counting courses...") with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred): diff --git a/openedx/core/djangoapps/content/search/views.py 
b/openedx/core/djangoapps/content/search/views.py index 4a19a023ad63..69b686a3434f 100644 --- a/openedx/core/djangoapps/content/search/views.py +++ b/openedx/core/djangoapps/content/search/views.py @@ -12,10 +12,7 @@ from rest_framework.views import APIView from common.djangoapps.student.roles import GlobalStaff -from lms.djangoapps.courseware.access import has_access -from lms.djangoapps.courseware.masquerade import setup_masquerade -from openedx.core import types -from openedx.core.lib.api.view_utils import validate_course_key, view_auth_classes +from openedx.core.lib.api.view_utils import view_auth_classes from openedx.core.djangoapps.content.search.documents import STUDIO_INDEX_NAME User = get_user_model() From af1402faac50040ab2b26d486d6dc6d737413cc7 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 11:32:46 -0700 Subject: [PATCH 12/28] refactor: put all "content" data into the "content" field --- .../djangoapps/content/search/documents.py | 42 ++++++++++--------- .../management/commands/reindex_studio.py | 8 ++++ 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index d09892981c96..b2ec0907fa4c 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -27,7 +27,7 @@ class Fields: block_type = "block_type" context_key = "context_key" org = "org" - type = "type" # "course_block" or "library_block" + type = "type" # DocType.course_block or DocType.library_block (see below) # tags (dictionary) # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ @@ -38,6 +38,10 @@ class Fields: tags_level1 = "level1" tags_level2 = "level2" tags_level3 = "level3" + # The "content" field is a dictionary of arbitrary data, depending on the block_type. 
+ # It comes from each XBlock's index_dictionary() method (if present) plus some processing. + # Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on. + content = "content" class DocType: @@ -68,8 +72,18 @@ def _fields_from_block(block) -> dict: class implementation returns only: {"content": {"display_name": "..."}, "content_type": "..."} """ + block_data = { + Fields.id: _meili_id_from_opaque_key(block.usage_key), + Fields.usage_key: str(block.usage_key), + Fields.block_id: str(block.usage_key.block_id), + Fields.display_name: block.display_name, # TODO: there is some function to get the fallback display_name + Fields.block_type: block.scope_ids.block_type, + # This is called context_key so it's the same for courses and libraries + Fields.context_key: str(block.usage_key.context_key), # same as lib_key + Fields.org: str(block.usage_key.context_key.org), + } try: - block_data = block.index_dictionary() + content_data = block.index_dictionary() # Will be something like: # { # 'content': {'display_name': '...', 'capa_content': '...'}, @@ -77,29 +91,19 @@ class implementation returns only: # 'problem_types': ['multiplechoiceresponse'] # } # Which we need to flatten: - if "content_type" in block_data: - del block_data["content_type"] # Redundant with our "type" field that we add later - if "content" in block_data and isinstance(block_data["content"], dict): - content = block_data["content"] + if "content_type" in content_data: + del content_data["content_type"] # Redundant with our standard Fields.block_type field. 
+ if "content" in content_data and isinstance(content_data["content"], dict): + content = content_data["content"] if "display_name" in content: del content["display_name"] - del block_data["content"] - block_data.update(content) + del content_data["content"] + content_data.update(content) # Now we have something like: # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] } + block_data[Fields.content] = content_data except Exception as err: # pylint: disable=broad-except log.exception(f"Failed to process index_dictionary for {block.usage_key}: {err}") - block_data = {} - block_data.update({ - Fields.id: _meili_id_from_opaque_key(block.usage_key), - Fields.usage_key: str(block.usage_key), - Fields.block_id: str(block.usage_key.block_id), - Fields.display_name: block.display_name, # TODO: there is some function to get the fallback display_name - Fields.block_type: block.scope_ids.block_type, - # This is called context_key so it's the same for courses and libraries - Fields.context_key: str(block.usage_key.context_key), # same as lib_key - Fields.org: str(block.usage_key.context_key.org), - }) return block_data diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index e672fcc60203..623f01bc05cc 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -67,6 +67,14 @@ def handle(self, *args, **options): Fields.tags, Fields.type, ]) + # Mark which attributes are used for keyword search, in order of importance: + client.index(temp_index_name).update_searchable_attributes([ + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.tags, + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. 
+ ]) ############## Libraries ############## self.stdout.write("Indexing libraries...") From 7c5cd7fa40823347320e2c1c39f0c389620f8d0a Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 11:35:43 -0700 Subject: [PATCH 13/28] chore: fix lint issue and comment --- openedx/core/djangoapps/content/search/documents.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index b2ec0907fa4c..fc608688d9f5 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -118,9 +118,9 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: { "tags": { "taxonomy": ["Location", "Difficulty"], - "level0": ["Location\tNorth America", "Difficulty\tHard"], - "level1": ["Location\tNorth America\tCanada"], - "level2": ["Location\tNorth America\tCanada\tVancouver"], + "level0": ["Location > North America", "Difficulty > Hard"], + "level1": ["Location > North America > Canada"], + "level2": ["Location > North America > Canada > Vancouver"], } } """ @@ -142,7 +142,7 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: # Taxonomy name plus each level of tags, in a list: parts = [obj_tag.name] + obj_tag.get_lineage() # e.g. 
["Location", "North America", "Canada", "Vancouver"] parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator - for level in range(0, 4): + for level in range(4): new_value = " > ".join(parts[0:level + 2]) if f"level{level}" not in result: result[f"level{level}"] = [new_value] From 4c66d068a180e2724e5e9757624e6d6b96d22c32 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 12:13:43 -0700 Subject: [PATCH 14/28] feat: add breadcrumbs to index --- .../djangoapps/content/search/documents.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index fc608688d9f5..8e69ee08a763 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -22,12 +22,16 @@ class Fields: # Meilisearch primary key. String. id = "id" usage_key = "usage_key" - block_id = "block_id" + type = "type" # DocType.course_block or DocType.library_block (see below) + block_id = "block_id" # The block_id part of the usage key. Sometimes human-readable, sometimes a random hex ID display_name = "display_name" block_type = "block_type" context_key = "context_key" org = "org" - type = "type" # DocType.course_block or DocType.library_block (see below) + # breadcrumbs: an array of {"display_name": "..."} entries. First one is the name of the course/library itself. + # After that is the name of any parent Section/Subsection/Unit/etc. + # It's a list of dictionaries because for now we just include the name of each but in future we may add their IDs. 
+ breadcrumbs = "breadcrumbs" # tags (dictionary) # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ @@ -81,7 +85,18 @@ class implementation returns only: # This is called context_key so it's the same for courses and libraries Fields.context_key: str(block.usage_key.context_key), # same as lib_key Fields.org: str(block.usage_key.context_key.org), + Fields.breadcrumbs: [] } + # Get the breadcrumbs (course, section, subsection, etc.): + if block.usage_key.context_key.is_course: # Getting parent is not yet implemented in Learning Core (for libraries). + cur_block = block + while cur_block.parent: + if not cur_block.has_cached_parent: + # This is not a big deal, but if you're updating many blocks in the same course at once, + # this would be very inefficient. Better to recurse the tree top-down with the parent blocks loaded. + log.warning(f"Updating Studio search index for XBlock {block.usage_key} but ancestors weren't cached.") + cur_block = cur_block.get_parent() + block_data[Fields.breadcrumbs].insert(0, {"display_name": cur_block.display_name}) try: content_data = block.index_dictionary() # Will be something like: @@ -160,6 +175,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> like Meilisearch or Elasticsearch, so that the given library block can be found using faceted search. """ + library_name = lib_api.get_library(metadata.usage_key.context_key).title doc = {} try: block = xblock_api.load_block(metadata.usage_key, user=None) @@ -179,6 +195,8 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> doc.update(_fields_from_block(block)) doc.update(_tags_for_content_object(metadata.usage_key)) doc[Fields.type] = DocType.library_block + # Add the breadcrumbs. 
In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here: + doc[Fields.breadcrumbs] = [{"display_name": library_name}] return doc From 67cf0ccbc6a3670e993946f19a5d77912100d88f Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 12:13:55 -0700 Subject: [PATCH 15/28] feat: use fallback display_name if needed --- openedx/core/djangoapps/content/search/documents.py | 2 +- openedx/core/djangoapps/xblock/api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 8e69ee08a763..aee51a53e65e 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -80,7 +80,7 @@ class implementation returns only: Fields.id: _meili_id_from_opaque_key(block.usage_key), Fields.usage_key: str(block.usage_key), Fields.block_id: str(block.usage_key.block_id), - Fields.display_name: block.display_name, # TODO: there is some function to get the fallback display_name + Fields.display_name: xblock_api.get_block_display_name(block), Fields.block_type: block.scope_ids.block_type, # This is called context_key so it's the same for courses and libraries Fields.context_key: str(block.usage_key.context_key), # same as lib_key diff --git a/openedx/core/djangoapps/xblock/api.py b/openedx/core/djangoapps/xblock/api.py index 98ba9e222dd0..f54464de9f4e 100644 --- a/openedx/core/djangoapps/xblock/api.py +++ b/openedx/core/djangoapps/xblock/api.py @@ -180,7 +180,7 @@ def get_block_display_name(block: XBlock) -> str: if display_name is not None: return display_name else: - return xblock_type_display_name(block.block_type) + return xblock_type_display_name(block.scope_ids.block_type) def get_component_from_usage_key(usage_key: UsageKeyV2) -> Component: From c53963ed470ac94a60951ba0576831fe3d7d8c37 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Fri, 15 Mar 2024 
12:14:07 -0700 Subject: [PATCH 16/28] feat: state how long the reindex_studio command took --- .../content/search/management/commands/reindex_studio.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 623f01bc05cc..462c0a732e5f 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -6,6 +6,7 @@ indexes LMS (published) courses in ElasticSearch. """ import logging +import time from django.conf import settings from django.core.management import BaseCommand @@ -34,6 +35,7 @@ def handle(self, *args, **options): """ Build a new search index for Studio, containing content from courses and libraries """ + start_time = time.perf_counter() client = self.get_meilisearch_client() store = modulestore() @@ -111,7 +113,11 @@ def add_with_children(block): num_contexts_done += 1 num_blocks_done += len(docs) - self.stdout.write(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.") + elapsed_time = time.perf_counter() - start_time + self.stdout.write( + f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses " + f"and libraries in {elapsed_time:.0f}s." 
+ ) def recurse_children(self, block, fn): """ From 3884fa2fe89bff4f9012fe2f552e051a706e338f Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Sun, 17 Mar 2024 18:57:31 -0700 Subject: [PATCH 17/28] test: add tests for the Studio search REST API --- .../content/search/tests/__init__.py | 0 .../content/search/tests/test_views.py | 81 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 openedx/core/djangoapps/content/search/tests/__init__.py create mode 100644 openedx/core/djangoapps/content/search/tests/test_views.py diff --git a/openedx/core/djangoapps/content/search/tests/__init__.py b/openedx/core/djangoapps/content/search/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openedx/core/djangoapps/content/search/tests/test_views.py b/openedx/core/djangoapps/content/search/tests/test_views.py new file mode 100644 index 000000000000..825d1e2924d5 --- /dev/null +++ b/openedx/core/djangoapps/content/search/tests/test_views.py @@ -0,0 +1,81 @@ +""" +Tests for the Studio content search REST API. +""" +from django.test import override_settings +from rest_framework.test import APITestCase, APIClient +from unittest import mock + +from common.djangoapps.student.tests.factories import UserFactory +from openedx.core.djangolib.testing.utils import CacheIsolationTestCase, skip_unless_cms + +STUDIO_SEARCH_ENDPOINT_URL = "/api/content_search/v2/studio/" + + +@skip_unless_cms +class StudioSearchViewTest(CacheIsolationTestCase, APITestCase): + """ + General tests for the Studio search REST API. 
+ """ + + @classmethod + def setUpTestData(cls): # lint-amnesty, pylint: disable=super-method-not-called + cls.staff = UserFactory.create( + username='staff', email='staff@example.com', is_staff=True, password='staff_pass' + ) + cls.student = UserFactory.create( + username='student', email='student@example.com', is_staff=False, password='student_pass' + ) + + def setUp(self): + super().setUp() + self.client = APIClient() + + @override_settings(MEILISEARCH_ENABLED=False) + def test_studio_search_unauthenticated_disabled(self): + """ + Whether or not Meilisearch is enabled, the API endpoint requires authentication. + """ + result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) + assert result.status_code == 401 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_studio_search_unauthenticated_enabled(self): + """ + Whether or not Meilisearch is enabled, the API endpoint requires authentication. + """ + result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) + assert result.status_code == 401 + + @override_settings(MEILISEARCH_ENABLED=False) + def test_studio_search_disabled(self): + """ + When Meilisearch is disabled, the Studio search endpoint gives a 404 + """ + self.client.login(username='student', password='student_pass') + result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) + assert result.status_code == 404 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_studio_search_student_forbidden(self): + """ + Until we implement fine-grained permissions, only global staff can use + the Studio search endpoint. + """ + self.client.login(username='student', password='student_pass') + result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) + assert result.status_code == 403 + + @override_settings(MEILISEARCH_ENABLED=True) + @mock.patch('openedx.core.djangoapps.content.search.views._get_meili_api_key_uid') + def test_studio_search_staff(self, mock_get_api_key_uid): + """ + Global staff can get a restricted API key for Meilisearch using the REST + API. 
+ """ + self.client.login(username='staff', password='staff_pass') + mock_get_api_key_uid.return_value = "3203d764-370f-4e99-a917-d47ab7f29739" + result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) + assert result.status_code == 200 + assert result.data["index_name"] == "studio_content" + assert result.data["url"].startswith("http") + assert result.data["api_key"] and isinstance(result.data["api_key"], str) From 6be622e3a8563070e553bbfe61c5adf01a35176f Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Sun, 17 Mar 2024 19:47:37 -0700 Subject: [PATCH 18/28] docs: added ADR --- .../docs/decisions/0001-meilisearch.rst | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst diff --git a/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst b/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst new file mode 100644 index 000000000000..1f355409da0e --- /dev/null +++ b/openedx/core/djangoapps/content/search/docs/decisions/0001-meilisearch.rst @@ -0,0 +1,151 @@ +Studio Content Search Powered by Meilisearch +############################################ + +Status +****** + +Draft + + +Context +******* + +Existing search functionality +============================= + +The Open edX platform currently implements many different forms of search. For +example, users can search for course content, library content, forum posts, and +more. Most of the search functionality in the core platform is powered by the +Elasticsearch search engine (though other functionality developed by 2U, such as +in edx-enterprise, is powered by Algolia). + +Most uses of Elasticsearch in Open edX use +`edx-search `_ which provides a partial +abstraction over Elasticsearch. 
The edx-search library formerly used +`django-haystack `_ as an abstraction +layer across search engines, but "that was ripped out after the package was +abandoned upstream and it became an obstacle to upgrades and efficiently +utilizing Elasticsearch (the abstraction layer imposed significant limits)" +(thanks to Jeremy Bowman for this context). Due to these changes, the current +edx-search API is a mix of abstractions and direct usage of the Elasticsearch +API, which makes it confusing and difficult to work with. In addition, each +usage of edx-search has been implemented fairly differently. See +`State of edx-search `_ +for details (thanks to Andy Shultz). + +Other platform components use Elasticsearch more directly: + +* ``course-discovery`` and ``edx-notes-api`` do not use ``edx-search``, but are + very tied to Elasticsearch via the use of ``django-elasticsearch-dsl`` and + ``django-elasticsearch-drf``. +* ``cs_comments_service`` uses Elasticsearch via the official ruby gems. + +Problems with Elasticsearch +=========================== + +At the same time, there are many problems with the current reliance on +Elasticsearch: + +1. In 2021, the license of Elasticsearch changed from Apache 2.0 to a more + restrictive license that prohibits providing "the products to others as a + managed service". Consequently, AWS forked the search engine to create + OpenSearch and no longer offers Elasticsearch as a service. This is + problematic for many Open edX operators that use AWS and prefer to avoid + any third-party services. +2. Elasticsearch is very resource-intensive and often uses more than a gigabyte + of memory just for small search use cases. +3. Elasticsearch has poor support for multi-tenancy, which multiplies the + problem of resource usage for organizations with many small Open edX sites. +4. 
The existing usage of edx-search/Elasticsearch routes all search requests and + result processing through edxapp (the LMS) or other IDAs, increasing the + load on those applications. + +Need for Studio Search +====================== + +At the time of this ADR, we have a goal to implement new search functionality in +Studio, to support various course authoring workflows. + +Meilisearch +=========== + +Meilisearch ("MAY-lee search") is a new, promising search engine that offers a +compelling alternative to Elasticsearch. It is open source, feature rich, and +very fast and memory efficient (written in Rust, uses orders of magnitude less +memory than Elasticsearch for small datasets). It has a simple API with an +official python driver, and has official integrations with the popular +Instantsearch frontend library from Algolia. It has strong support for +multi-tenancy, and allows creating restricted API keys that incorporate a user's +permissions, so that search requests can be made directly from the user to +Meilisearch, rather than routing them through Django. Initial testing has shown +it to be much more developer friendly than Elasticsearch/OpenSearch. + +At the time of writing, there are only two known concerns with Meilisearch: + +1. It doesn't (yet) support High Availability via replication, although this is + planned and under development. It does have other features to support high + availability, such as very low restart time (in ms). +2. It doesn't support boolean operators in keyword search ("red AND panda"), + though it does of course support boolean operators in filters. This is a + product decision aimed at keeping the user experience simple, and is unlikely + to change. + + +Decision +******** + +1. We will implement the new Studio search functionality using Meilisearch, + as an experiment and to evaluate it more thoroughly. +2. 
The Studio search functionality will be disabled by default in the next + Open edX release (Redwood), so that Meilisearch will not be a requirement + for any default nor existing features. This will also allow us to evaluate it + before deciding to embrace it or replace it. +3. We will keep the Meilisearch-specific code isolated to the + new ``content/search`` Django app, so it's relatively easy to swap out later + if this experiment doesn't pan out. +4. We will not use ``edx-search`` for the new search functionality. + + +Consequences +************ + +1. Organizations that wish to try out the new Studio Search functionality in + the Redwood release will have to install and configure Meilisearch. +2. Building both the backend and frontend components of the Studio search + project will be much faster and simpler than if we used ElasticSearch, + edx-search, OpenSearch, django-haystack, etc. +3. Keyword search with boolean operators will not be supported in any of the new + search features. + + +Alternatives Considered +*********************** + +OpenSearch Only +=============== + +Moving existing search functionality to OpenSearch is a possibility, but though +it mostly addresses the licensing issue, it doesn't solve the problems of +resource usage, API complexity, edx-search API complexity, lack of Instantsearch +integration, and poor multi-tenancy. + +OpenSearch and Elasticsearch +============================ + +When OpenSearch was originally forked from Elasticsearch, it was completely API +compatible, but over time they have developed along divergent paths. Regardless +of whether ElasticSearch and OpenSearch are actually wire-compatible, recent +versions of all the official ElasticSearch clients have been made to actively +reject connections to OpenSearch, which is why you generally won't find client +libraries that work with both engines, and why there are OpenSearch forks of +everything on the client side as well as the server side. 
+ +As there is no ready-to-use abstraction layer that would allow us to comfortably +support both, and no interest in maintaining one ourselves, this is not an +appealing option. + +Algolia +======= + +Algolia is a great search engine service, but as it is a proprietary product, it +is not suitable as a requirement for an open source platform like Open edX. From 9b77442587ac23bbbdcc37f69a54288770f04bbc Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 09:40:05 -0700 Subject: [PATCH 19/28] chore: fix quality issue --- openedx/core/djangoapps/content/search/tests/test_views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/tests/test_views.py b/openedx/core/djangoapps/content/search/tests/test_views.py index 825d1e2924d5..198971609e97 100644 --- a/openedx/core/djangoapps/content/search/tests/test_views.py +++ b/openedx/core/djangoapps/content/search/tests/test_views.py @@ -54,7 +54,7 @@ def test_studio_search_disabled(self): self.client.login(username='student', password='student_pass') result = self.client.get(STUDIO_SEARCH_ENDPOINT_URL) assert result.status_code == 404 - + @override_settings(MEILISEARCH_ENABLED=True) def test_studio_search_student_forbidden(self): """ From 91d0358f7b420f523055e072e9c634ccdd082386 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 14:17:25 -0700 Subject: [PATCH 20/28] fix: handle case where there are no library/course blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rômulo Penido --- .../content/search/management/commands/reindex_studio.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 462c0a732e5f..34f182b211a3 100644 --- 
a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -88,7 +88,9 @@ def handle(self, *args, **options): doc = searchable_doc_for_library_block(metadata) docs.append(doc) num_blocks_done += 1 - # Add all the docs in this library at once (usually faster than adding one at a time): + if docs: + # Add all the docs in this library at once (usually faster than adding one at a time): + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) num_contexts_done += 1 @@ -108,8 +110,9 @@ def add_with_children(block): self.recurse_children(course, add_with_children) - # Add all the docs in this course at once (usually faster than adding one at a time): - self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + if docs: + # Add all the docs in this course at once (usually faster than adding one at a time): + self.wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) num_contexts_done += 1 num_blocks_done += len(docs) From 0af6939608941235c44ef5e481b3f47f4d322400 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 15:01:17 -0700 Subject: [PATCH 21/28] fix: hash used for ID wasn't stable --- openedx/core/djangoapps/content/search/documents.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index aee51a53e65e..3e8f42848cfd 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -1,7 +1,8 @@ """ -Utilities related to searching content libraries +Utilities related to indexing content for search """ from __future__ import annotations +import hashlib import logging from django.utils.text import slugify @@ -64,7 +65,13 @@ 
def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: requirement, we transform them to a similar slug ID string that does. """ # The slugified key _may_ not be unique so we append a hashed number too - return slugify(str(usage_key)) + "-" + str(hash(str(usage_key)) % 1_000) + key_bin = str(usage_key).encode() + try: + suffix = hashlib.sha1(key_bin, usedforsecurity=False).hexdigest()[:7] + except TypeError: + # Remove this when we don't need to support Python 3.8 and older + suffix = hashlib.sha1(key_bin).hexdigest()[:7] + return slugify(str(usage_key)) + "-" + suffix def _fields_from_block(block) -> dict: From 106cfd071c874b2d08efaa6d9533f0cb2cd8500a Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 15:01:28 -0700 Subject: [PATCH 22/28] fix: breadcrumbs weren't using fallback display_names --- openedx/core/djangoapps/content/search/documents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 3e8f42848cfd..7c299e05ea2d 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -103,7 +103,7 @@ class implementation returns only: # this would be very inefficient. Better to recurse the tree top-down with the parent blocks loaded. 
log.warning(f"Updating Studio search index for XBlock {block.usage_key} but ancestors weren't cached.") cur_block = cur_block.get_parent() - block_data[Fields.breadcrumbs].insert(0, {"display_name": cur_block.display_name}) + block_data[Fields.breadcrumbs].insert(0, {"display_name": xblock_api.get_block_display_name(cur_block)}) try: content_data = block.index_dictionary() # Will be something like: From 1393d4139f4ee18692cbf242df8b32d1a303909c Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 15:01:39 -0700 Subject: [PATCH 23/28] test: expand tests --- .../content/search/tests/test_documents.py | 66 +++++++++++++++++++ .../content/search/tests/test_views.py | 7 +- 2 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/tests/test_documents.py diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py new file mode 100644 index 000000000000..3f31111f7280 --- /dev/null +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -0,0 +1,66 @@ +""" +Tests for the Studio content search documents (what gets stored in the index) +""" +from openedx.core.djangolib.testing.utils import skip_unless_cms +from xmodule.modulestore.django import modulestore +from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase +from xmodule.modulestore.tests.factories import BlockFactory, ToyCourseFactory + +from ..documents import searchable_doc_for_course_block + +STUDIO_SEARCH_ENDPOINT_URL = "/api/content_search/v2/studio/" + + +@skip_unless_cms +class StudioDocumentsTest(SharedModuleStoreTestCase): + """ + Tests for the Studio content search documents (what gets stored in the + search index) + """ + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.store = modulestore() + cls.toy_course = ToyCourseFactory.create() # See xmodule/modulestore/tests/sample_courses.py + cls.toy_course_key = 
cls.toy_course.id + # Create a problem in library + cls.problem_block = BlockFactory.create( + category="problem", + parent_location=cls.toy_course_key.make_usage_key("vertical", "vertical_test"), + display_name='Test Problem', + data="What is a test?", + ) + + def setUp(self): + super().setUp() + + def test_problem_block(self): + """ + Test how a problem block gets represented in the search index + """ + # block_usage_key = self.toy_course_key.make_usage_key("problem", "test_problem") + block = self.store.get_item(self.problem_block.usage_key) + doc = searchable_doc_for_course_block(block) + assert doc == { + # Note the 'id' has been stripped of special characters to meet Meilisearch requirements. + # The '-8516ed8' suffix is deterministic based on the original usage key. + "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-8516ed8", + "type": "course_block", + "block_type": "problem", + "usage_key": "block-v1:edX+toy+2012_Fall+type@problem+block@Test_Problem", + "block_id": "Test_Problem", + "context_key": "course-v1:edX+toy+2012_Fall", + "org": "edX", + "display_name": "Test Problem", + "breadcrumbs": [ + {"display_name": "Toy Course"}, + {"display_name": "chapter"}, + {"display_name": "sequential"}, + {"display_name": "vertical"}, + ], + "content": { + "capa_content": "What is a test?", + "problem_types": ["multiplechoiceresponse"], + }, + } diff --git a/openedx/core/djangoapps/content/search/tests/test_views.py b/openedx/core/djangoapps/content/search/tests/test_views.py index 198971609e97..e117cf1b969a 100644 --- a/openedx/core/djangoapps/content/search/tests/test_views.py +++ b/openedx/core/djangoapps/content/search/tests/test_views.py @@ -6,19 +6,20 @@ from unittest import mock from common.djangoapps.student.tests.factories import UserFactory -from openedx.core.djangolib.testing.utils import CacheIsolationTestCase, skip_unless_cms +from openedx.core.djangolib.testing.utils import skip_unless_cms STUDIO_SEARCH_ENDPOINT_URL = 
"/api/content_search/v2/studio/" @skip_unless_cms -class StudioSearchViewTest(CacheIsolationTestCase, APITestCase): +class StudioSearchViewTest(APITestCase): """ General tests for the Studio search REST API. """ @classmethod - def setUpTestData(cls): # lint-amnesty, pylint: disable=super-method-not-called + def setUpClass(cls): + super().setUpClass() cls.staff = UserFactory.create( username='staff', email='staff@example.com', is_staff=True, password='staff_pass' ) From 5a1429e713ddb61b3fabd3e1503b1aafdbc0a9e2 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 18:34:16 -0700 Subject: [PATCH 24/28] fix: lint issues --- openedx/core/djangoapps/content/search/documents.py | 8 ++------ .../djangoapps/content/search/tests/test_documents.py | 3 --- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 7c299e05ea2d..68eebcfa0d76 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -64,13 +64,9 @@ def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: hyphens (-) and underscores (_). Since our opaque keys don't meet this requirement, we transform them to a similar slug ID string that does. """ - # The slugified key _may_ not be unique so we append a hashed number too + # The slugified key _may_ not be unique so we append a hashed string to make it unique: key_bin = str(usage_key).encode() - try: - suffix = hashlib.sha1(key_bin, usedforsecurity=False).hexdigest()[:7] - except TypeError: - # Remove this when we don't need to support Python 3.8 and older - suffix = hashlib.sha1(key_bin).hexdigest()[:7] + suffix = hashlib.sha1(key_bin).hexdigest()[:7] # When we use Python 3.9+, should add usedforsecurity=False here. 
return slugify(str(usage_key)) + "-" + suffix diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 3f31111f7280..9394ca254bb2 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -32,9 +32,6 @@ def setUpClass(cls): data="What is a test?", ) - def setUp(self): - super().setUp() - def test_problem_block(self): """ Test how a problem block gets represented in the search index From 69879e090c78ddcd9f842d90d0e45bd255264fc3 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Mar 2024 19:21:59 -0700 Subject: [PATCH 25/28] test: expand tests --- .../content/search/tests/test_documents.py | 88 ++++++++++++++++++- .../core/djangoapps/content_tagging/api.py | 3 + 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 9394ca254bb2..78ed68039c51 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -1,6 +1,8 @@ """ Tests for the Studio content search documents (what gets stored in the index) """ +from organizations.models import Organization +from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangolib.testing.utils import skip_unless_cms from xmodule.modulestore.django import modulestore from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase @@ -24,6 +26,8 @@ def setUpClass(cls): cls.store = modulestore() cls.toy_course = ToyCourseFactory.create() # See xmodule/modulestore/tests/sample_courses.py cls.toy_course_key = cls.toy_course.id + # Get references to some blocks in the toy course + cls.html_block_key = cls.toy_course_key.make_usage_key("html", "toyjumpto") # Create a problem in library 
cls.problem_block = BlockFactory.create( category="problem", @@ -32,11 +36,29 @@ def setUpClass(cls): data="What is a test?", ) + # Create a couple taxonomies and some tags + cls.org = Organization.objects.create(name="edX", short_name="edX") + cls.difficulty_tags = tagging_api.create_taxonomy(name="Difficulty", orgs=[cls.org], allow_multiple=False) + tagging_api.add_tag_to_taxonomy(cls.difficulty_tags, tag="Easy") + tagging_api.add_tag_to_taxonomy(cls.difficulty_tags, tag="Normal") + tagging_api.add_tag_to_taxonomy(cls.difficulty_tags, tag="Difficult") + + cls.subject_tags = tagging_api.create_taxonomy(name="Subject", orgs=[cls.org], allow_multiple=True) + tagging_api.add_tag_to_taxonomy(cls.subject_tags, tag="Linguistics") + tagging_api.add_tag_to_taxonomy(cls.subject_tags, tag="Asian Languages", parent_tag_value="Linguistics") + tagging_api.add_tag_to_taxonomy(cls.subject_tags, tag="Chinese", parent_tag_value="Asian Languages") + tagging_api.add_tag_to_taxonomy(cls.subject_tags, tag="Hypertext") + tagging_api.add_tag_to_taxonomy(cls.subject_tags, tag="Jump Links", parent_tag_value="Hypertext") + + # Tag stuff: + tagging_api.tag_object(cls.problem_block.usage_key, cls.difficulty_tags, tags=["Easy"]) + tagging_api.tag_object(cls.html_block_key, cls.subject_tags, tags=["Chinese", "Jump Links"]) + tagging_api.tag_object(cls.html_block_key, cls.difficulty_tags, tags=["Normal"]) + def test_problem_block(self): """ Test how a problem block gets represented in the search index """ - # block_usage_key = self.toy_course_key.make_usage_key("problem", "test_problem") block = self.store.get_item(self.problem_block.usage_key) doc = searchable_doc_for_course_block(block) assert doc == { @@ -60,4 +82,68 @@ def test_problem_block(self): "capa_content": "What is a test?", "problem_types": ["multiplechoiceresponse"], }, + # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ + # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ + # For 
details on why the hierarchical tag data is in this format. + "tags": { + "taxonomy": ["Difficulty"], + "level0": ["Difficulty > Easy"], + }, + } + + def test_html_block(self): + """ + Test how an HTML block gets represented in the search index + """ + block = self.store.get_item(self.html_block_key) + doc = searchable_doc_for_course_block(block) + assert doc == { + "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-b0b4a10", + "type": "course_block", + "block_type": "html", + "usage_key": "block-v1:edX+toy+2012_Fall+type@html+block@toyjumpto", + "block_id": "toyjumpto", + "context_key": "course-v1:edX+toy+2012_Fall", + "org": "edX", + "display_name": "Text", + "breadcrumbs": [ + {"display_name": "Toy Course"}, + {"display_name": "Overview"}, + {"display_name": "Toy Videos"}, + ], + "content": { + "html_content": ( + "This is a link to another page and some Chinese 四節比分和七年前 Some more Chinese 四節比分和七年前 " + ), + }, + "tags": { + "taxonomy": ["Difficulty", "Subject"], + "level0": ["Difficulty > Normal", "Subject > Hypertext", "Subject > Linguistics"], + "level1": ["Subject > Hypertext > Jump Links", "Subject > Linguistics > Asian Languages"], + "level2": ["Subject > Linguistics > Asian Languages > Chinese"], + }, + } + + def test_video_block_untagged(self): + """ + Test how a video block gets represented in the search index. + """ + block_usage_key = self.toy_course_key.make_usage_key("video", "Welcome") + block = self.store.get_item(block_usage_key) + doc = searchable_doc_for_course_block(block) + assert doc == { + "id": "block-v1edxtoy2012_falltypevideoblockwelcome-b47fb14", + "type": "course_block", + "block_type": "video", + "usage_key": "block-v1:edX+toy+2012_Fall+type@video+block@Welcome", + "block_id": "Welcome", + "context_key": "course-v1:edX+toy+2012_Fall", + "org": "edX", + "display_name": "Welcome", + "breadcrumbs": [ + {"display_name": "Toy Course"}, + {"display_name": "Overview"}, + ], + "content": {}, + # This video has no tags. 
} diff --git a/openedx/core/djangoapps/content_tagging/api.py b/openedx/core/djangoapps/content_tagging/api.py index c167eba3a629..e50d96fe2ce2 100644 --- a/openedx/core/djangoapps/content_tagging/api.py +++ b/openedx/core/djangoapps/content_tagging/api.py @@ -200,6 +200,9 @@ def set_object_tags( # Expose the oel_tagging APIs +add_tag_to_taxonomy = oel_tagging.add_tag_to_taxonomy +update_tag_in_taxonomy = oel_tagging.update_tag_in_taxonomy +delete_tags_from_taxonomy = oel_tagging.delete_tags_from_taxonomy get_taxonomy = oel_tagging.get_taxonomy get_taxonomies = oel_tagging.get_taxonomies get_tags = oel_tagging.get_tags From 1b5dfdf3f89058e542fdb6de8be1cd8bbe78b074 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Tue, 19 Mar 2024 10:58:09 -0700 Subject: [PATCH 26/28] docs: fix typo pointed out in review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rômulo Penido --- openedx/core/djangoapps/content/search/documents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 68eebcfa0d76..e149854dde57 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -206,7 +206,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> def searchable_doc_for_course_block(block) -> dict: """ Generate a dictionary document suitable for ingestion into a search engine - like Meilisearch or Elasticsearch, so that the given library block can be + like Meilisearch or Elasticsearch, so that the given course block can be found using faceted search. 
""" doc = _fields_from_block(block) From ec0c4a875d7bc4766ff02c50a5e36d8ec6cd3237 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 21 Mar 2024 11:17:56 -0700 Subject: [PATCH 27/28] docs: clarify code with comments, indicate command is experimental --- cms/envs/common.py | 2 +- .../djangoapps/content/search/documents.py | 49 ++++++++++++------- .../management/commands/reindex_studio.py | 20 +++++++- .../content/search/tests/test_documents.py | 6 +-- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/cms/envs/common.py b/cms/envs/common.py index 1459d3bda4a8..e3e68df36d3c 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -1777,7 +1777,7 @@ 'openedx.core.djangoapps.content_tagging', # Search - 'openedx.core.djangoapps.content.search.apps.ContentSearchConfig', + 'openedx.core.djangoapps.content.search', 'openedx.features.course_duration_limits', 'openedx.features.content_type_gating', diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index e149854dde57..fc1388f82455 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -2,7 +2,7 @@ Utilities related to indexing content for search """ from __future__ import annotations -import hashlib +from hashlib import blake2b import logging from django.utils.text import slugify @@ -37,9 +37,10 @@ class Fields: # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ # For details on the format of the hierarchical tag data. + # We currently have a hard-coded limit of 4 levels of tags in the search index (level0..level3). tags = "tags" - tags_taxonomy = "taxonomy" - tags_level0 = "level0" + tags_taxonomy = "taxonomy" # subfield of tags, i.e. tags.taxonomy + tags_level0 = "level0" # subfield of tags, i.e. 
tags.level0 tags_level1 = "level1" tags_level2 = "level2" tags_level3 = "level3" @@ -48,6 +49,10 @@ class Fields: # Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on. content = "content" + # Note: new fields or values can be added at any time, but if they need to be indexed for filtering or keyword + # search, the index configuration will need to be changed, which is only done as part of the 'reindex_studio' + # command (changing those settings on an large active index is not recommended). + class DocType: """ @@ -63,10 +68,13 @@ def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: integer or a string composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_). Since our opaque keys don't meet this requirement, we transform them to a similar slug ID string that does. + + In the future, with Learning Core's data models in place for courseware, + we could use PublishableEntity's primary key / UUID instead. """ # The slugified key _may_ not be unique so we append a hashed string to make it unique: key_bin = str(usage_key).encode() - suffix = hashlib.sha1(key_bin).hexdigest()[:7] # When we use Python 3.9+, should add usedforsecurity=False here. + suffix = blake2b(key_bin, digest_size=4).hexdigest() # When we use Python 3.9+, should add usedforsecurity=False return slugify(str(usage_key)) + "-" + suffix @@ -141,6 +149,13 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: "level2": ["Location > North America > Canada > Vancouver"], } } + + Note: despite what you might expect, because this is only used for the + filtering/refinement UI, it's fine if this is a one-way transformation. + It's not necessary to be able to re-construct the exact tag IDs nor taxonomy + IDs from this data that's stored in the search index. It's just a bunch of + strings in a particular format that the frontend knows how to render to + support hierarchical refinement by tag. 
""" # Note that we could improve performance for indexing many components from the same library/course, # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the @@ -159,15 +174,24 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: result[Fields.tags_taxonomy].append(obj_tag.name) # Taxonomy name plus each level of tags, in a list: parts = [obj_tag.name] + obj_tag.get_lineage() # e.g. ["Location", "North America", "Canada", "Vancouver"] - parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator + parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator. + # Now we build each level (tags.level0, tags.level1, etc.) as applicable. + # We have a hard-coded limit of 4 levels of tags for now (see Fields.tags above). + # A tag like "Difficulty: Hard" will only result in one level (tags.level0) + # But a tag like "Location: North America > Canada > Vancouver" would result in three levels (tags.level0: + # "North America", tags.level1: "North America > Canada", tags.level2: "North America > Canada > Vancouver") + # See the comments above on "Field.tags" for an explanation of why we use this format (basically it's the format + # required by the Instantsearch frontend). for level in range(4): + # We use '>' as a separator because it's the default for the Instantsearch frontend library, and our + # preferred separator (\t) used in the database is ignored by Meilisearch since it's whitespace. new_value = " > ".join(parts[0:level + 2]) if f"level{level}" not in result: result[f"level{level}"] = [new_value] elif new_value not in result[f"level{level}"]: result[f"level{level}"].append(new_value) if len(parts) == level + 2: - break + break # We have all the levels for this tag now (e.g. 
parts=["Difficulty", "Hard"] -> need level0 only) return {Fields.tags: result} @@ -184,18 +208,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> block = xblock_api.load_block(metadata.usage_key, user=None) except Exception as err: # pylint: disable=broad-except log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}") - # Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata' - doc.update({ - Fields.id: _meili_id_from_opaque_key(metadata.usage_key), - Fields.usage_key: str(metadata.usage_key), - Fields.block_id: str(metadata.usage_key.block_id), - Fields.display_name: metadata.display_name, - Fields.type: metadata.usage_key.block_type, - Fields.context_key: str(metadata.usage_key.context_key), - Fields.org: str(metadata.usage_key.context_key.org), - }) - else: - doc.update(_fields_from_block(block)) + doc.update(_fields_from_block(block)) doc.update(_tags_for_content_object(metadata.usage_key)) doc[Fields.type] = DocType.library_block # Add the breadcrumbs. In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here: diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 34f182b211a3..d2df30b2a016 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -9,7 +9,7 @@ import time from django.conf import settings -from django.core.management import BaseCommand +from django.core.management import BaseCommand, CommandError from openedx.core.djangoapps.content_libraries import api as lib_api from openedx.core.djangoapps.content.search.documents import ( @@ -28,13 +28,24 @@ class Command(MeiliCommandMixin, BaseCommand): """ - Build or re-build the search index for courses (in Studio, i.e. 
Draft mode) + Build or re-build the search index for courses and libraries (in Studio, i.e. Draft mode) + + This is experimental and not recommended for production use. """ + def add_arguments(self, parser): + parser.add_argument('--experimental', action='store_true') + parser.set_defaults(experimental=False) + def handle(self, *args, **options): """ Build a new search index for Studio, containing content from courses and libraries """ + if not options["experimental"]: + raise CommandError( + "This command is experimental and not recommended for production. " + "Use the --experimental argument to acknowledge and run it." + ) start_time = time.perf_counter() client = self.get_meilisearch_client() store = modulestore() @@ -59,6 +70,11 @@ def handle(self, *args, **options): index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME with self.using_temp_index(index_name) as temp_index_name: ############## Configure the index ############## + + # The following index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) # Mark which attributes can be used for filtering/faceted search: diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 78ed68039c51..97b2fb5d033f 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -64,7 +64,7 @@ def test_problem_block(self): assert doc == { # Note the 'id' has been stripped of special characters to meet Meilisearch requirements. 
# The '-8516ed8' suffix is deterministic based on the original usage key. - "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-8516ed8", + "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-f46b6f1e", "type": "course_block", "block_type": "problem", "usage_key": "block-v1:edX+toy+2012_Fall+type@problem+block@Test_Problem", @@ -98,7 +98,7 @@ def test_html_block(self): block = self.store.get_item(self.html_block_key) doc = searchable_doc_for_course_block(block) assert doc == { - "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-b0b4a10", + "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-efb9c601", "type": "course_block", "block_type": "html", "usage_key": "block-v1:edX+toy+2012_Fall+type@html+block@toyjumpto", @@ -132,7 +132,7 @@ def test_video_block_untagged(self): block = self.store.get_item(block_usage_key) doc = searchable_doc_for_course_block(block) assert doc == { - "id": "block-v1edxtoy2012_falltypevideoblockwelcome-b47fb14", + "id": "block-v1edxtoy2012_falltypevideoblockwelcome-0c9fd626", "type": "course_block", "block_type": "video", "usage_key": "block-v1:edX+toy+2012_Fall+type@video+block@Welcome", From 1ff4b756074de333e31bb2b041c17f7c6612d52a Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 21 Mar 2024 12:08:50 -0700 Subject: [PATCH 28/28] perf: pre-fetch all of the blocks in the course --- .../content/search/management/commands/reindex_studio.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index d2df30b2a016..0f52eea51d4d 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -118,6 +118,9 @@ def handle(self, *args, **options): ) docs = [] + # Pre-fetch the course with all of its children: + course = store.get_course(course.id, depth=None) + 
def add_with_children(block): """ Recursively index the given XBlock/component """ doc = searchable_doc_for_course_block(block)