From 4f117bceca60402ed0c18755b57e85517783585c Mon Sep 17 00:00:00 2001
From: Braden MacDonald
Date: Mon, 3 Jun 2024 13:47:08 -0700
Subject: [PATCH] fix: reindex_studio was crashing if instance had too many courses

---
Review notes (ignored by `git am`, not part of the commit message):
- The CourseOverview queryset handed to Paginator is now explicitly ordered
  by primary key. Django's Paginator emits UnorderedObjectListWarning for an
  unordered queryset because each page is fetched with its own LIMIT/OFFSET
  query; without a stable ordering a course may be skipped or indexed twice.
- The line structure of this patch was restored from a whitespace-collapsed
  copy. Hunk line counts were re-derived and match the headers, but the
  indentation of context lines is reconstructed; confirm against the target
  tree (or apply with `git apply --ignore-whitespace`).

 openedx/core/djangoapps/content/search/api.py | 60 ++++++++++---------
 .../content/search/tests/test_api.py          |  3 +
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py
index bbde4fc98230..f658155eb812 100644
--- a/openedx/core/djangoapps/content/search/api.py
+++ b/openedx/core/djangoapps/content/search/api.py
@@ -13,6 +13,7 @@
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.core.cache import cache
+from django.core.paginator import Paginator
 from meilisearch import Client as MeilisearchClient
 from meilisearch.errors import MeilisearchError
 from meilisearch.models.task import TaskInfo
@@ -21,10 +22,9 @@
 from common.djangoapps.student.roles import GlobalStaff
 from rest_framework.request import Request
 from common.djangoapps.student.role_helpers import get_course_roles
+from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
 from openedx.core.djangoapps.content.search.models import get_access_ids_for_request
-
 from openedx.core.djangoapps.content_libraries import api as lib_api
-from xmodule.modulestore import ModuleStoreEnum
 from xmodule.modulestore.django import modulestore
 
 from .documents import (
@@ -292,9 +292,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
 
     # Get the list of courses
     status_cb("Counting courses...")
-    with store.branch_setting(ModuleStoreEnum.Branch.draft_preferred):
-        all_courses = store.get_courses()
-    num_courses = len(all_courses)
+    num_courses = CourseOverview.objects.count()
 
     # Some counters so we can track our progress as indexing progresses:
     num_contexts = num_courses + num_libraries
@@ -358,30 +356,34 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
 
         ############## Courses ##############
         status_cb("Indexing courses...")
-        for course in all_courses:
-            status_cb(
-                f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
-            )
-            docs = []
-
-            # Pre-fetch the course with all of its children:
-            course = store.get_course(course.id, depth=None)
-
-            def add_with_children(block):
-                """ Recursively index the given XBlock/component """
-                doc = searchable_doc_for_course_block(block)
-                doc.update(searchable_doc_tags(block.usage_key))
-                docs.append(doc)  # pylint: disable=cell-var-from-loop
-                _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
-
-            # Index course children
-            _recurse_children(course, add_with_children)
-
-            if docs:
-                # Add all the docs in this course at once (usually faster than adding one at a time):
-                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-            num_contexts_done += 1
-            num_blocks_done += len(docs)
+        # To reduce memory usage on large instances, split up the CourseOverviews into pages of 1,000 courses.
+        # Paginator needs a deterministic ordering so that no course is skipped or repeated across pages:
+        paginator = Paginator(CourseOverview.objects.only('id', 'display_name').order_by('id'), 1000)
+        for p in paginator.page_range:
+            for course in paginator.page(p).object_list:
+                status_cb(
+                    f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
+                )
+                docs = []
+
+                # Pre-fetch the course with all of its children:
+                course = store.get_course(course.id, depth=None)
+
+                def add_with_children(block):
+                    """ Recursively index the given XBlock/component """
+                    doc = searchable_doc_for_course_block(block)
+                    doc.update(searchable_doc_tags(block.usage_key))
+                    docs.append(doc)  # pylint: disable=cell-var-from-loop
+                    _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
+
+                # Index course children
+                _recurse_children(course, add_with_children)
+
+                if docs:
+                    # Add all the docs in this course at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                num_contexts_done += 1
+                num_blocks_done += len(docs)
 
     status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.")
 
diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py
index cd1acc31b7d9..1c78b28506fe 100644
--- a/openedx/core/djangoapps/content/search/tests/test_api.py
+++ b/openedx/core/djangoapps/content/search/tests/test_api.py
@@ -15,6 +15,7 @@
 from common.djangoapps.student.tests.factories import UserFactory
 from openedx.core.djangoapps.content_libraries import api as library_api
 from openedx.core.djangoapps.content_tagging import api as tagging_api
+from openedx.core.djangoapps.content.course_overviews.api import CourseOverview
 from openedx.core.djangolib.testing.utils import skip_unless_cms
 from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, ModuleStoreTestCase
 
@@ -106,6 +107,8 @@ def setUp(self):
             "content": {},
             "access_id": course_access.id,
         }
+        # Make sure the CourseOverview for the course is created:
+        CourseOverview.get_from_id(self.course.id)
 
         # Create a content library:
         self.library = library_api.create_library(