feat: restrict Studio search results based on user permissions (#34471)

* feat: adds SearchAccess model Stores a numeric ID for each course + library, which will generally be shorter than the full context_key, so we can pack more of them into the the Meilisearch search filter. Also: * Adds data migration pre-populates the SearchAccess model from the existing CourseOverview and ContentLibrary records * Adds signal handlers to add/remove SearchAccess entries when content is created or deleted. * Adds get_access_ids_for_request() helper method for use in views. * Adds tests. * test: can't import content.search in lms tests * feat: use SearchAccess in documents and views * Adds an access_id field to the document, which stores the SearchAccess.id for the block's context. * Use the requesting user's allowed access_ids to filter search results to documents with those access_ids. * Since some users have a lot of individual access granted, limit the number of access_ids in the filter to a large number (1_000) * Updates tests to demonstrate. * test: can't import content.search or content_staging in lms tests * fix: make access_id field filterable * fix: use SearchAccess.get_or_create in signal handlers In theory, we shouldn't have to do this, because the CREATE and DELETE events should keep the SearchAccess table up-to-date. But in practice, signals can be missed (or in tests, they may be disabled). So we assume that it's ok to re-use a SearchAccess.id created for a given course or library context_key. * refactor: refactors the view tests to make them clearer Uses helper methods and decorators to wrap the settings and patches used by multiple view tests. * feat: adds org filters to meilisearch filter * Uses content_tagging.rules.get_user_orgs to fetch the user's content-related orgs for use in the meilisearch filter. * Limits the number of orgs used to 1_000 to keep token size down * refactor: removes data migration Users should use the reindex_studio management command to populate SearchAccess. * refactor: adds functions to common.djangoapps.student.role_helpers to allow general access to the user's RoleCache without having to access private attributes of User or RoleCache. Related changes: * Moves some functionality from openedx.core.djangoapps.enrollments.data.get_user_roles to this new helper method. * Use these new helper method in content_tagging.rules * fix: get_access_ids_for_request only returns individual access instead of all course keys that the user can read. Org- and GlobalStaff access checks will handle the rest. * fix: use org-level permissions when generating search filter Also refactors tests to demonstrate this change for OrgStaff and OrgInstructor users. * refactor: remove SearchAccess creation signal handlers Lets SearchAccess entries be created on demand during search indexing. * feat: omit access_ids from the search filter that are covered by the user's org roles --------- Co-authored-by: Rômulo Penido <[email protected]>
openedx · Apr 17, 2024 · d672110 · d672110
1 parent 96f6349
commit d672110
Show file tree

Hide file tree

Showing 14 changed files with 718 additions and 59 deletions.
diff --git a/common/djangoapps/student/role_helpers.py b/common/djangoapps/student/role_helpers.py
@@ -1,7 +1,9 @@
 """
 Helpers for student roles
 """
+from __future__ import annotations
 
+from django.contrib.auth import get_user_model
 
 from openedx.core.djangoapps.django_comment_common.models import (
     FORUM_ROLE_ADMINISTRATOR,
@@ -12,15 +14,20 @@
 )
 from openedx.core.lib.cache_utils import request_cached
 from common.djangoapps.student.roles import (
+    CourseAccessRole,
     CourseBetaTesterRole,
     CourseInstructorRole,
     CourseStaffRole,
     GlobalStaff,
     OrgInstructorRole,
-    OrgStaffRole
+    OrgStaffRole,
+    RoleCache,
 )
 
 
+User = get_user_model()
+
+
 @request_cached()
 def has_staff_roles(user, course_key):
     """
@@ -40,3 +47,32 @@ def has_staff_roles(user, course_key):
             is_org_instructor, is_global_staff, has_forum_role]):
         return True
     return False
+
+
+@request_cached()
+def get_role_cache(user: User) -> RoleCache:
+    """
+    Returns a populated RoleCache for the given user.
+
+    The returned RoleCache is also cached on the provided `user` to improve performance on future roles checks.
+
+    :param user: User
+    :return: All roles for all courses that this user has access to.
+    """
+    # pylint: disable=protected-access
+    if not hasattr(user, '_roles'):
+        user._roles = RoleCache(user)
+    return user._roles
+
+
+@request_cached()
+def get_course_roles(user: User) -> list[CourseAccessRole]:
+    """
+    Returns a list of all course-level roles that this user has.
+
+    :param user: User
+    :return: All roles for all courses that this user has access to.
+    """
+    # pylint: disable=protected-access
+    role_cache = get_role_cache(user)
+    return list(role_cache._roles)
diff --git a/common/djangoapps/student/tests/test_roles.py b/common/djangoapps/student/tests/test_roles.py
@@ -8,6 +8,7 @@
 from opaque_keys.edx.keys import CourseKey
 
 from common.djangoapps.student.roles import (
+    CourseAccessRole,
     CourseBetaTesterRole,
     CourseInstructorRole,
     CourseRole,
@@ -23,6 +24,7 @@
     OrgStaffRole,
     RoleCache
 )
+from common.djangoapps.student.role_helpers import get_course_roles, has_staff_roles
 from common.djangoapps.student.tests.factories import AnonymousUserFactory, InstructorFactory, StaffFactory, UserFactory
 
 
@@ -48,6 +50,32 @@ def test_global_staff(self):
         assert not GlobalStaff().has_user(self.course_instructor)
         assert GlobalStaff().has_user(self.global_staff)
 
+    def test_has_staff_roles(self):
+        assert has_staff_roles(self.global_staff, self.course_key)
+        assert has_staff_roles(self.course_staff, self.course_key)
+        assert has_staff_roles(self.course_instructor, self.course_key)
+        assert not has_staff_roles(self.student, self.course_key)
+
+    def test_get_course_roles(self):
+        assert not list(get_course_roles(self.student))
+        assert not list(get_course_roles(self.global_staff))
+        assert list(get_course_roles(self.course_staff)) == [
+            CourseAccessRole(
+                user=self.course_staff,
+                course_id=self.course_key,
+                org=self.course_key.org,
+                role=CourseStaffRole.ROLE,
+            )
+        ]
+        assert list(get_course_roles(self.course_instructor)) == [
+            CourseAccessRole(
+                user=self.course_instructor,
+                course_id=self.course_key,
+                org=self.course_key.org,
+                role=CourseInstructorRole.ROLE,
+            )
+        ]
+
     def test_group_name_case_sensitive(self):
         uppercase_course_id = "ORG/COURSE/NAME"
         lowercase_course_id = uppercase_course_id.lower()

diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py
@@ -8,6 +8,7 @@
 from django.utils.text import slugify
 from opaque_keys.edx.keys import UsageKey, LearningContextKey
 
+from openedx.core.djangoapps.content.search.models import SearchAccess
 from openedx.core.djangoapps.content_libraries import api as lib_api
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangoapps.xblock import api as xblock_api
@@ -29,6 +30,7 @@ class Fields:
     block_type = "block_type"
     context_key = "context_key"
     org = "org"
+    access_id = "access_id"  # .models.SearchAccess.id
     # breadcrumbs: an array of {"display_name": "..."} entries. First one is the name of the course/library itself.
     # After that is the name of any parent Section/Subsection/Unit/etc.
     # It's a list of dictionaries because for now we just include the name of each but in future we may add their IDs.
@@ -78,6 +80,14 @@ def _meili_id_from_opaque_key(usage_key: UsageKey) -> str:
     return slugify(str(usage_key)) + "-" + suffix
 
 
+def _meili_access_id_from_context_key(context_key: LearningContextKey) -> int:
+    """
+    Retrieve the numeric access id for the given course/library context.
+    """
+    access, _ = SearchAccess.objects.get_or_create(context_key=context_key)
+    return access.id
+
+
 def _fields_from_block(block) -> dict:
     """
     Given an XBlock instance, call its index_dictionary() method to load any
@@ -96,6 +106,7 @@ class implementation returns only:
         # This is called context_key so it's the same for courses and libraries
         Fields.context_key: str(block.usage_key.context_key),  # same as lib_key
         Fields.org: str(block.usage_key.context_key.org),
+        Fields.access_id: _meili_access_id_from_context_key(block.usage_key.context_key),
         Fields.breadcrumbs: []
     }
     # Get the breadcrumbs (course, section, subsection, etc.):

diff --git a/openedx/core/djangoapps/content/search/handlers.py b/openedx/core/djangoapps/content/search/handlers.py
@@ -0,0 +1,23 @@
+"""
+Signal/event handlers for content search
+"""
+from django.db.models.signals import post_delete
+from django.dispatch import receiver
+from openedx_events.content_authoring.data import ContentLibraryData
+from openedx_events.content_authoring.signals import CONTENT_LIBRARY_DELETED
+
+from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
+from openedx.core.djangoapps.content.search.models import SearchAccess
+
+
+# Using post_delete here because there is no COURSE_DELETED event defined.
+@receiver(post_delete, sender=CourseOverview)
+def delete_course_search_access(sender, instance, **kwargs):  # pylint: disable=unused-argument
+    """Deletes the SearchAccess instance for deleted CourseOverview"""
+    SearchAccess.objects.filter(context_key=instance.id).delete()
+
+
+@receiver(CONTENT_LIBRARY_DELETED)
+def delete_library_search_access(content_library: ContentLibraryData, **kwargs):
+    """Deletes the SearchAccess instance for deleted content libraries"""
+    SearchAccess.objects.filter(context_key=content_library.library_key).delete()
diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py
@@ -84,6 +84,7 @@ def handle(self, *args, **options):
                 Fields.org,
                 Fields.tags,
                 Fields.type,
+                Fields.access_id,
             ])
             # Mark which attributes are used for keyword search, in order of importance:
             client.index(temp_index_name).update_searchable_attributes([

diff --git a/openedx/core/djangoapps/content/search/migrations/0001_initial.py b/openedx/core/djangoapps/content/search/migrations/0001_initial.py
@@ -0,0 +1,25 @@
+# Generated by Django 4.2.10 on 2024-04-02 04:55
+
+from django.db import migrations, models
+from opaque_keys.edx.django.models import LearningContextKeyField
+from opaque_keys.edx.locator import LibraryLocatorV2
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('course_overviews', '0001_initial'),
+        ('content_libraries', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='SearchAccess',
+            fields=[
+                ('id', models.BigAutoField(help_text='Numeric ID for each Course / Library context. This ID will generally require fewer bits than the full LearningContextKey, allowing more courses and libraries to be represented in content search filters.', primary_key=True, serialize=False)),
+                ('context_key', LearningContextKeyField(max_length=255, unique=True)),
+            ],
+        ),
+    ]
diff --git a/openedx/core/djangoapps/content/search/migrations/__init__.py b/openedx/core/djangoapps/content/search/migrations/__init__.py
diff --git a/openedx/core/djangoapps/content/search/models.py b/openedx/core/djangoapps/content/search/models.py
@@ -0,0 +1,67 @@
+"""Database models for content search"""
+
+from __future__ import annotations
+
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+from opaque_keys.edx.django.models import LearningContextKeyField
+from rest_framework.request import Request
+
+from common.djangoapps.student.role_helpers import get_course_roles
+from common.djangoapps.student.roles import CourseInstructorRole, CourseStaffRole
+from openedx.core.djangoapps.content_libraries.api import get_libraries_for_user
+
+
+class SearchAccess(models.Model):
+    """
+    Stores a numeric ID for each ContextKey.
+
+    We use this shorter ID instead of the full ContextKey when determining a user's access to search-indexed course and
+    library content because:
+
+    a) in some deployments, users may be granted access to more than 1_000 individual courses, and
+    b) the search filter request is stored in the JWT, which is limited to 8Kib.
+    """
+    id = models.BigAutoField(
+        primary_key=True,
+        help_text=_(
+            "Numeric ID for each Course / Library context. This ID will generally require fewer bits than the full "
+            "LearningContextKey, allowing more courses and libraries to be represented in content search filters."
+        ),
+    )
+    context_key = LearningContextKeyField(
+        max_length=255, unique=True, null=False,
+    )
+
+
+def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) -> list[int]:
+    """
+    Returns a list of SearchAccess.id values for courses and content libraries that the requesting user has been
+    individually grated access to.
+
+    Omits any courses/libraries with orgs in the `omit_orgs` list.
+    """
+    omit_orgs = omit_orgs or []
+
+    course_roles = get_course_roles(request.user)
+    course_clause = models.Q(context_key__in=[
+        role.course_id
+        for role in course_roles
+        if (
+            role.role in [CourseInstructorRole.ROLE, CourseStaffRole.ROLE]
+            and role.org not in omit_orgs
+        )
+    ])
+
+    libraries = get_libraries_for_user(user=request.user)
+    library_clause = models.Q(context_key__in=[
+        lib.library_key for lib in libraries
+        if lib.library_key.org not in omit_orgs
+    ])
+
+    # Sort by descending access ID to simulate prioritizing the "most recently created context keys".
+    return list(
+        SearchAccess.objects.filter(
+            course_clause | library_clause
+        ).order_by('-id').values_list("id", flat=True)
+    )
diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py
@@ -8,7 +8,14 @@
 from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase
 from xmodule.modulestore.tests.factories import BlockFactory, ToyCourseFactory
 
-from ..documents import searchable_doc_for_course_block
+try:
+    # This import errors in the lms because content.search is not an installed app there.
+    from ..documents import searchable_doc_for_course_block
+    from ..models import SearchAccess
+except RuntimeError:
+    searchable_doc_for_course_block = lambda x: x
+    SearchAccess = {}
+
 
 STUDIO_SEARCH_ENDPOINT_URL = "/api/content_search/v2/studio/"
 
@@ -26,6 +33,7 @@ def setUpClass(cls):
         cls.store = modulestore()
         cls.toy_course = ToyCourseFactory.create()  # See xmodule/modulestore/tests/sample_courses.py
         cls.toy_course_key = cls.toy_course.id
+
         # Get references to some blocks in the toy course
         cls.html_block_key = cls.toy_course_key.make_usage_key("html", "toyjumpto")
         # Create a problem in library
@@ -55,6 +63,16 @@ def setUpClass(cls):
         tagging_api.tag_object(str(cls.html_block_key), cls.subject_tags, tags=["Chinese", "Jump Links"])
         tagging_api.tag_object(str(cls.html_block_key), cls.difficulty_tags, tags=["Normal"])
 
+    @property
+    def toy_course_access_id(self):
+        """
+        Returns the SearchAccess.id created for the toy course.
+
+        This SearchAccess object is created when documents are added to the search index, so this method must be called
+        after this step, or risk a DoesNotExist error.
+        """
+        return SearchAccess.objects.get(context_key=self.toy_course_key).id
+
     def test_problem_block(self):
         """
         Test how a problem block gets represented in the search index
@@ -71,6 +89,7 @@ def test_problem_block(self):
             "block_id": "Test_Problem",
             "context_key": "course-v1:edX+toy+2012_Fall",
             "org": "edX",
+            "access_id": self.toy_course_access_id,
             "display_name": "Test Problem",
             "breadcrumbs": [
                 {"display_name": "Toy Course"},
@@ -105,6 +124,7 @@ def test_html_block(self):
             "block_id": "toyjumpto",
             "context_key": "course-v1:edX+toy+2012_Fall",
             "org": "edX",
+            "access_id": self.toy_course_access_id,
             "display_name": "Text",
             "breadcrumbs": [
                 {"display_name": "Toy Course"},
@@ -139,6 +159,7 @@ def test_video_block_untagged(self):
             "block_id": "Welcome",
             "context_key": "course-v1:edX+toy+2012_Fall",
             "org": "edX",
+            "access_id": self.toy_course_access_id,
             "display_name": "Welcome",
             "breadcrumbs": [
                 {"display_name": "Toy Course"},