feat: incremental reindex_studio management command #35864

Open · wants to merge 5 commits into master
Changes from 2 commits
224 changes: 146 additions & 78 deletions openedx/core/djangoapps/content/search/api.py
@@ -5,7 +5,7 @@

import logging
import time
from contextlib import contextmanager
from contextlib import contextmanager, nullcontext
from datetime import datetime, timedelta, timezone
from functools import wraps
from typing import Callable, Generator
@@ -24,7 +24,7 @@
from rest_framework.request import Request
from common.djangoapps.student.role_helpers import get_course_roles
from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
from openedx.core.djangoapps.content.search.models import get_access_ids_for_request
from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted
from openedx.core.djangoapps.content_libraries import api as lib_api
from xmodule.modulestore.django import modulestore

@@ -217,6 +217,91 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat
_wait_for_meili_task(client.delete_index(temp_index_name))


def _configure_index(index_name):
    """
    Configure the index. The following index settings are best changed on an empty index.
    Changing them on a populated index will "re-index all documents in the index", which can take some time.

    Args:
        index_name (str): The name of the index to configure
    """
    client = _get_meilisearch_client()

    # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
    client.index(index_name).update_distinct_attribute(Fields.usage_key)
    # Mark which attributes can be used for filtering/faceted search:
    client.index(index_name).update_filterable_attributes(
        [
            # Get specific block/collection using combination of block_id and context_key
            Fields.block_id,
            Fields.block_type,
            Fields.context_key,
            Fields.usage_key,
            Fields.org,
            Fields.tags,
            Fields.tags + "." + Fields.tags_taxonomy,
            Fields.tags + "." + Fields.tags_level0,
            Fields.tags + "." + Fields.tags_level1,
            Fields.tags + "." + Fields.tags_level2,
            Fields.tags + "." + Fields.tags_level3,
            Fields.collections,
            Fields.collections + "." + Fields.collections_display_name,
            Fields.collections + "." + Fields.collections_key,
            Fields.type,
            Fields.access_id,
            Fields.last_published,
            Fields.content + "." + Fields.problem_types,
        ]
    )
    # Mark which attributes are used for keyword search, in order of importance:
    client.index(index_name).update_searchable_attributes(
        [
            # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
            Fields.display_name,
            Fields.block_id,
            Fields.content,
            Fields.description,
            Fields.tags,
            Fields.collections,
            # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
            # are searchable only if at least one document in the index has a value. If we didn't list them here and,
            # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
            # these sub-fields: "Attribute `tags.level3` is not searchable."
            Fields.tags + "." + Fields.tags_taxonomy,
            Fields.tags + "." + Fields.tags_level0,
            Fields.tags + "." + Fields.tags_level1,
            Fields.tags + "." + Fields.tags_level2,
            Fields.tags + "." + Fields.tags_level3,
            Fields.collections + "." + Fields.collections_display_name,
            Fields.collections + "." + Fields.collections_key,
            Fields.published + "." + Fields.display_name,
            Fields.published + "." + Fields.published_description,
        ]
    )
    # Mark which attributes can be used for sorting search results:
    client.index(index_name).update_sortable_attributes(
        [
            Fields.display_name,
            Fields.created,
            Fields.modified,
            Fields.last_published,
        ]
    )

    # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
    # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
    client.index(index_name).update_ranking_rules(
        [
            "sort",
            "words",
            "typo",
            "proximity",
            "attribute",
            "exactness",
        ]
    )


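Worth noting: each `update_*` call above is asynchronous on the Meilisearch side and returns a task reference rather than taking effect immediately. A caller that needs the settings applied before indexing begins could wait on it, along these lines (a sketch reusing this module's `_wait_for_meili_task` helper; the code in this PR does not wait here):

# Sketch: settings updates are queued as Meilisearch tasks; wait when ordering matters.
task_info = client.index(index_name).update_sortable_attributes([Fields.display_name])
_wait_for_meili_task(task_info)  # helper defined earlier in api.py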
def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None:
"""
Recurse the children of an XBlock and call the given function for each
@@ -279,8 +364,40 @@ def is_meilisearch_enabled() -> bool:
return False


# pylint: disable=too-many-statements
def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
def reset_index(status_cb: Callable[[str], None] | None = None) -> None:
    """
    Reset the Meilisearch index, deleting all documents and reconfiguring it
    """
    if status_cb is None:
        status_cb = log.info

    status_cb("Creating new empty index...")
    with _using_temp_index(status_cb) as temp_index_name:
Review comment (Contributor):
@DanielVZ96 @bradenmacdonald @pomegranited I am concerned about the functionality of this reset.

If we need to create and populate a new index on a large instance, we have to run --reset, which removes the old index, and then run --incremental. However, search results will be broken while the new index is being populated.

Wouldn't it be better to add an option to --incremental so that it creates a temporary index and then swaps it in, so as not to break the search results, the way the non-incremental form does?

Reply (Contributor):
@ChrisChV I'm assuming that most times an administrator is running this incremental build, they either have no existing index or their existing index is from an old version and is missing some configuration/columns/etc. (so it actually is broken). In that case, removing the old index and starting an incremental build will result in incomplete search results for a while, but the search will be working without errors. And, the incremental index rebuilds the newest courses first, so the results should fill in relatively quickly.

I think for large instances, (where the reindex can take several days) it's better to have a working search with incomplete results, than to have a totally broken search (that displays errors because the old index does not exist or has the wrong configuration).

For Teak I would like to find a way to simplify this, maybe by only having the incremental option and allowing it to be either using a temporary index or not. But I think we'll need to see how this works first and hear from people testing it out.

Reply (Contributor):
OK that's fine for me 👍
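For concreteness, a rough sketch of the swap-based variant discussed in this thread (hypothetical, not part of this PR; it assumes the index-swap endpoint that meilisearch-python exposes as swap_indexes, and that the index's primary key is "id"):

# Hypothetical: build incrementally into a side index, then swap, so the
# live index keeps serving results until the rebuild finishes.
def incremental_rebuild_with_swap(status_cb):
    client = _get_meilisearch_client()
    side_index = STUDIO_INDEX_NAME + "_rebuild"
    if not _index_exists(side_index):
        _wait_for_meili_task(client.create_index(side_index, {"primaryKey": "id"}))
        _configure_index(side_index)
    # ... populate side_index context-by-context, recording progress in
    # IncrementalIndexCompleted so an interrupted run can resume ...
    # Swap atomically, then drop the now-obsolete side index:
    _wait_for_meili_task(client.swap_indexes([{"indexes": [side_index, STUDIO_INDEX_NAME]}]))
    _wait_for_meili_task(client.delete_index(side_index))
    status_cb("Swap complete; search stayed available throughout.")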

        _configure_index(temp_index_name)
        status_cb("Index recreated!")
    status_cb("Index reset complete.")


def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None:
    """
    Initialize the Meilisearch index, creating it and configuring it if it doesn't exist
    """
    if status_cb is None:
        status_cb = log.info
    if warn_cb is None:
        warn_cb = log.warning

    if _index_exists(STUDIO_INDEX_NAME):
        warn_cb(
            "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
            " --experimental [--incremental]"
        )
        return
Review comment (Contributor):
Hmm, I realize that we shouldn't actually print this every time if the index exists. What we really want is to print it only if the index is empty or the index configuration (schema) is outdated. Sorry for not realizing this sooner.

Suggested change (the proposed replacement for the block above):

    if _index_exists(STUDIO_INDEX_NAME):
        if _index_is_empty(STUDIO_INDEX_NAME):
            warn_cb(
                "The studio search index is empty. Please run ./manage.py cms reindex_studio"
                " --experimental [--incremental]"
            )
            return
        elif get_index_version(STUDIO_INDEX_NAME) < CURRENT_INDEX_SCHEMA_VERSION:
            warn_cb(
                "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
                " --experimental [--incremental]"
            )
            return

and something like this:

CURRENT_INDEX_SCHEMA_VERSION = 20241021

def get_index_version(index_name: str) -> int:
    # use the get index settings endpoint to get the current settings: 
    # https://www.meilisearch.com/docs/reference/api/settings#get-settings
    current_settings = ...
    # Then compare the settings to determine the index schema version:
    if (Fields.published + "." + Fields.display_name) in current_settings["searchableAttributes"]:
        # We call this "version 20241021", including this change: 
        # https://github.com/openedx/edx-platform/commit/fb25a5d635c0f2650d514454927da666b057aa39
        return 20241021
    return 0

Reply (Contributor, author):
@bradenmacdonald Done. I did something slightly different for checking that the schema is up to date; let me know what you think.
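For readers following along, a minimal sketch of one shape such an up-to-date check could take (hypothetical, not the code merged in this PR; get_settings() is the meilisearch-python settings read, and EXPECTED_SEARCHABLE_ATTRIBUTES is an assumed constant mirroring _configure_index):

def index_schema_is_current(index_name: str) -> bool:
    # Compare the live index settings against what _configure_index would apply.
    current_settings = _get_meilisearch_client().index(index_name).get_settings()
    return all(
        attr in current_settings["searchableAttributes"]
        for attr in EXPECTED_SEARCHABLE_ATTRIBUTES  # assumed constant
    )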


    reset_index(status_cb)


def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: # lint-amnesty, pylint: disable=too-many-statements
"""
Rebuild the Meilisearch index from scratch
"""
@@ -292,7 +409,14 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:

# Get the lists of libraries
status_cb("Counting libraries...")
lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
keys_indexed = []
if incremental:
keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True))
lib_keys = [
lib.library_key
for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id")
if lib.library_key not in keys_indexed
]
num_libraries = len(lib_keys)

# Get the list of courses
@@ -305,83 +429,19 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
num_blocks_done = 0 # How many individual components/XBlocks we've indexed

status_cb(f"Found {num_courses} courses, {num_libraries} libraries.")
with _using_temp_index(status_cb) as temp_index_name:
with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name:
############## Configure the index ##############

# The following index settings are best changed on an empty index.
# Changing them on a populated index will "re-index all documents in the index, which can take some time"
# The index settings are best changed on an empty index.
# Changing them on a populated index will "re-index all documents in the index", which can take some time
# and use more RAM. Instead, we configure an empty index then populate it one course/library at a time.

# Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
client.index(temp_index_name).update_distinct_attribute(Fields.usage_key)
# Mark which attributes can be used for filtering/faceted search:
client.index(temp_index_name).update_filterable_attributes([
# Get specific block/collection using combination of block_id and context_key
Fields.block_id,
Fields.block_type,
Fields.context_key,
Fields.usage_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.type,
Fields.access_id,
Fields.last_published,
Fields.content + "." + Fields.problem_types,
])
# Mark which attributes are used for keyword search, in order of importance:
client.index(temp_index_name).update_searchable_attributes([
# Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
Fields.display_name,
Fields.block_id,
Fields.content,
Fields.description,
Fields.tags,
Fields.collections,
# If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.published + "." + Fields.display_name,
Fields.published + "." + Fields.published_description,
])
# Mark which attributes can be used for sorting search results:
client.index(temp_index_name).update_sortable_attributes([
Fields.display_name,
Fields.created,
Fields.modified,
Fields.last_published,
])

# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
# cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
client.index(temp_index_name).update_ranking_rules([
"sort",
"words",
"typo",
"proximity",
"attribute",
"exactness",
])
if not incremental:
_configure_index(index_name)

############## Libraries ##############
status_cb("Indexing libraries...")

def index_library(lib_key: str) -> list:
def index_library(lib_key: LibraryLocatorV2) -> list:
docs = []
for component in lib_api.get_library_components(lib_key):
try:
@@ -396,7 +456,7 @@ def index_library(lib_key: str) -> list:
if docs:
try:
# Add all the docs in this library at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
except (TypeError, KeyError, MeilisearchError) as err:
status_cb(f"Error indexing library {lib_key}: {err}")
return docs
@@ -416,7 +476,7 @@ def index_collection_batch(batch, num_done, library_key) -> int:
if docs:
try:
# Add docs in batch of 100 at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
except (TypeError, KeyError, MeilisearchError) as err:
status_cb(f"Error indexing collection batch {p}: {err}")
return num_done
@@ -439,6 +499,8 @@ def index_collection_batch(batch, num_done, library_key) -> int:
num_collections_done,
lib_key,
)
if incremental:
IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key)
status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}")

num_contexts_done += 1
@@ -464,7 +526,7 @@ def add_with_children(block):

if docs:
# Add all the docs in this course at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
return docs

paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
@@ -473,10 +535,16 @@ def add_with_children(block):
status_cb(
f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
)
if course.id in keys_indexed:
num_contexts_done += 1
Review comment (Contributor):
Skipped courses are still included in the total count and in num_contexts_done. I like that. I think we should do the same for skipped (already-indexed) libraries. Currently, they're excluded altogether and won't be included in the num_contexts_done count.
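A sketch of what that suggestion might look like (hypothetical; reuses names from this diff):

# Keep skipped libraries in the totals, mirroring the course loop:
lib_keys = [
    lib.library_key
    for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id")
]
num_libraries = len(lib_keys)  # total now includes already-indexed libraries
for lib_key in lib_keys:
    if lib_key in keys_indexed:
        num_contexts_done += 1  # count it as done, but skip re-indexing
        continue
    index_library(lib_key)
    num_contexts_done += 1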

continue
course_docs = index_course(course)
if incremental:
IncrementalIndexCompleted.objects.get_or_create(context_key=course.id)
num_contexts_done += 1
num_blocks_done += len(course_docs)

IncrementalIndexCompleted.objects.all().delete()
status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")


openedx/core/djangoapps/content/search/management/commands/reindex_studio.py
@@ -18,8 +18,11 @@ class Command(BaseCommand):
"""

def add_arguments(self, parser):
parser.add_argument('--experimental', action='store_true')
parser.set_defaults(experimental=False)
parser.add_argument("--experimental", action="store_true")
parser.add_argument("--reset", action="store_true")
parser.add_argument("--init", action="store_true")
parser.add_argument("--incremental", action="store_true")
parser.set_defaults(experimental=False, reset=False, init=False, incremental=False)

def handle(self, *args, **options):
"""
@@ -34,4 +37,11 @@ def handle(self, *args, **options):
"Use the --experimental argument to acknowledge and run it."
)

api.rebuild_index(self.stdout.write)
if options["reset"]:
api.reset_index(self.stdout.write)
elif options["init"]:
api.init_index(self.stdout.write)
elif options["incremental"]:
api.rebuild_index(self.stdout.write, incremental=True)
else:
api.rebuild_index(self.stdout.write)
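Taken together, the flags give four modes; typical invocations would look like this (a sketch based on the option handling above):

./manage.py cms reindex_studio --experimental                 # full rebuild into a temp index, then swap
./manage.py cms reindex_studio --experimental --reset         # delete the index and recreate it empty and configured
./manage.py cms reindex_studio --experimental --init          # create and configure the index only if it is missing
./manage.py cms reindex_studio --experimental --incremental   # in-place rebuild that records progress and can resume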
New migration in openedx/core/djangoapps/content/search/migrations/ (adds the IncrementalIndexCompleted model)
@@ -0,0 +1,21 @@
# Generated by Django 4.2.16 on 2024-11-15 12:40

from django.db import migrations, models
import opaque_keys.edx.django.models


class Migration(migrations.Migration):

    dependencies = [
        ('search', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='IncrementalIndexCompleted',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('context_key', opaque_keys.edx.django.models.LearningContextKeyField(max_length=255, unique=True)),
            ],
        ),
    ]
12 changes: 12 additions & 0 deletions openedx/core/djangoapps/content/search/models.py
@@ -65,3 +65,15 @@ def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) ->
course_clause | library_clause
).order_by('-id').values_list("id", flat=True)
)


class IncrementalIndexCompleted(models.Model):
    """
    Stores the context keys of already indexed courses and libraries for incremental indexing.
    """

    context_key = LearningContextKeyField(
        max_length=255,
        unique=True,
        null=False,
    )
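Condensed, the lifecycle of these rows during a rebuild (all three statements appear in the api.py diff above; `course.id` here is any course or library context key):

# Record a finished context, skip it on resume, clear on success:
IncrementalIndexCompleted.objects.get_or_create(context_key=course.id)
keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True))
IncrementalIndexCompleted.objects.all().delete()  # rebuild_index() runs this once everything is indexed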