Fetch all pages of paginated Canvas Studio API responses

Lists of collections and media are paginated, using offset-based pagination. Fetch all of pages, up to a limit, instead of just the first one. Also increase the size for each page from 20 items up to the maximum of 50. Fixes #6354
hypothesis · Jun 17, 2024 · 6856d22 · 6856d22
1 parent 52f8ec5
commit 6856d22
Show file tree

Hide file tree

Showing 2 changed files with 155 additions and 39 deletions.
diff --git a/lms/services/canvas_studio.py b/lms/services/canvas_studio.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Literal, NotRequired, Type, TypedDict
+from typing import Literal, Mapping, NotRequired, Type, TypedDict
 from urllib.parse import urlencode, urljoin, urlparse, urlunparse
 
 import requests
@@ -20,6 +20,27 @@
 from lms.validation._base import RequestsResponseSchema
 
 
+class PaginationSchema(Schema):
+    """
+    Schema for `meta` field in paginated Canvas Studio responses.
+
+    See `PaginationMeta` on https://tw.instructuremedia.com/api/public/docs/.
+    """
+
+    class Meta:
+        unknown = EXCLUDE
+
+    current_page = fields.Integer()
+    """1-based page number."""
+
+    last_page = fields.Integer()
+    """1-based number of last page."""
+
+
+MAX_PAGE_SIZE = 50
+"""Maximum value for `per_page` argument to paginated API calls."""
+
+
 class CanvasStudioCollectionsSchema(RequestsResponseSchema):
     """Schema for Canvas Studio /collections responses."""
 
@@ -33,10 +54,7 @@ class Meta:
         created_at = fields.Str(required=False)
 
     collections = fields.List(fields.Nested(CollectionSchema), required=True)
-
-    @post_load
-    def post_load(self, data, **_kwargs):
-        return data["collections"]
+    meta = fields.Nested(PaginationSchema, required=True)
 
 
 class CanvasStudioCollectionMediaSchema(RequestsResponseSchema):
@@ -53,10 +71,7 @@ class Meta:
         thumbnail_url = fields.Str(required=False)
 
     media = fields.List(fields.Nested(MediaSchema), required=True)
-
-    @post_load
-    def post_load(self, data, **_kwargs):
-        return data["media"]
+    meta = fields.Nested(PaginationSchema, required=True)
 
 
 class CanvasStudioCaptionFilesSchema(RequestsResponseSchema):
@@ -119,7 +134,10 @@ class CanvasStudioService:
         API reference: https://tw.instructuremedia.com/api/public/docs/
     """
 
-    def __init__(self, request, application_instance):
+    page_size: int
+    """Number of items to fetch per page."""
+
+    def __init__(self, request, application_instance, page_size=MAX_PAGE_SIZE):
         self._domain = application_instance.settings.get("canvas_studio", "domain")
         self._client_id = application_instance.settings.get(
             "canvas_studio", "client_id"
@@ -132,6 +150,7 @@ def __init__(self, request, application_instance):
         )
         self._request = request
         self._application_instance = application_instance
+        self.page_size = page_size
 
     def get_access_token(self, code: str) -> None:
         """
@@ -218,7 +237,9 @@ def list_media_library(self) -> list[File]:
         picker when creating a Page.
         """
 
-        collections = self._api_request("v1/collections", CanvasStudioCollectionsSchema)
+        collections = self._paginated_api_request(
+            "v1/collections", CanvasStudioCollectionsSchema, field="collections"
+        )
         user_collection = None
 
         files: list[File] = []
@@ -254,8 +275,10 @@ def list_media_library(self) -> list[File]:
     def list_collection(self, collection_id: str) -> list[File]:
         """List the videos in a collection."""
 
-        media = self._api_request(
-            f"v1/collections/{collection_id}/media", CanvasStudioCollectionMediaSchema
+        media = self._paginated_api_request(
+            f"v1/collections/{collection_id}/media",
+            CanvasStudioCollectionMediaSchema,
+            field="media",
         )
 
         files: list[File] = []
@@ -353,16 +376,51 @@ def get_transcript_url(self, media_id: str) -> str | None:
         return None
 
     def _api_request(
-        self, path: str, schema_cls: Type[RequestsResponseSchema], as_admin=False
+        self,
+        path: str,
+        schema_cls: Type[RequestsResponseSchema],
+        as_admin=False,
     ) -> dict:
         """
         Make a request to the Canvas Studio API and parse the JSON response.
 
+        :param path: Request path, relative to the API root
+        :param schema_cls: Schema to parse the JSON response
         :param as_admin: Make the request using the admin account instead of the current user.
         """
         response = self._bare_api_request(path, as_admin=as_admin)
         return schema_cls(response).parse()
 
+    def _paginated_api_request(
+        self,
+        path: str,
+        schema_cls: Type[RequestsResponseSchema],
+        field: str,
+        as_admin=False,
+    ) -> list:
+        """
+        Fetch a paginated collection via the Canvas Studio API.
+
+        :param path: Request path, relative to the API root
+        :param schema_cls: Schema to parse the JSON response
+        :param field: Response field containing items from each page
+        :param as_admin: Make the request using the admin account instead of the current user.
+        """
+        max_pages = 100
+        next_page = 1
+        last_page = None
+        collated = []
+
+        while last_page is None or next_page <= last_page:
+            params = {"page": next_page, "per_page": self.page_size}
+            response = self._bare_api_request(path, as_admin=as_admin, params=params)
+            parsed = schema_cls(response).parse()
+            collated += parsed[field]
+            last_page = min(parsed["meta"]["last_page"], max_pages)
+            next_page += 1
+
+        return collated
+
     def _admin_api_request(self, path: str, allow_redirects=True) -> requests.Response:
         """
         Make a request to the Canvas Studio API using the admin user identity.
@@ -401,14 +459,18 @@ def _admin_api_request(self, path: str, allow_redirects=True) -> requests.Respon
             ) from err
 
     def _bare_api_request(
-        self, path: str, as_admin=False, allow_redirects=True
+        self,
+        path: str,
+        as_admin=False,
+        allow_redirects=True,
+        params: Mapping[str, str | int] | None = None,
     ) -> requests.Response:
         """
         Make a request to the Canvas Studio API and return the response.
 
         :param as_admin: Make the request using the admin account instead of the current user.
         """
-        url = self._api_url(path)
+        url = self._api_url(path, params)
 
         if as_admin and not self.is_admin():
             return self._admin_api_request(path, allow_redirects=allow_redirects)
@@ -426,18 +488,22 @@ def _bare_api_request(
                 refresh_service=Service.CANVAS_STUDIO,
             ) from err
 
-    def _api_url(self, path: str) -> str:
+    def _api_url(self, path: str, params: Mapping[str, str | int] | None = None) -> str:
         """
         Return the URL of a Canvas Studio API endpoint.
 
         See https://tw.instructuremedia.com/api/public/docs/ for available
         endpoints.
 
         :param path: Path of endpoint relative to the API root
+        :param params: Query parameters
         """
 
         site = self._canvas_studio_site()
-        return f"{site}/api/public/{path}"
+        url = f"{site}/api/public/{path}"
+        if params:
+            url = url + "?" + urlencode(params)
+        return url
 
     def _admin_email(self) -> str:
         """Return the email address of the configured Canvas Studio admin."""

diff --git a/tests/unit/lms/services/canvas_studio_test.py b/tests/unit/lms/services/canvas_studio_test.py
@@ -1,5 +1,6 @@
+import math
 from unittest.mock import create_autospec, patch, sentinel
-from urllib.parse import urlencode
+from urllib.parse import parse_qs, urlencode, urlparse
 
 import pytest
 from h_matchers import Any
@@ -99,8 +100,19 @@ def test_redirect_uri_localhost_workaround(self, svc, pyramid_request):
                 == "https://hypothesis.local:48001/api/canvas_studio/oauth/callback"
             )
 
-    def test_list_media_library(self, svc):
+    @pytest.mark.parametrize(
+        "page_size",
+        (
+            None,  # Default, one page will be enough
+            3,  # Multiple pages will be needed
+        ),
+    )
+    def test_list_media_library(self, svc, page_size):
+        if page_size is not None:
+            svc.page_size = page_size
+
         files = svc.list_media_library()
+
         assert files == [
             {
                 "type": "Folder",
@@ -166,8 +178,19 @@ def test_list_media_library_other_error(self, svc, oauth_http_service):
 
         assert exc_info.value == err
 
-    def test_list_collection(self, svc):
+    @pytest.mark.parametrize(
+        "page_size",
+        (
+            None,  # Default. One page will be enough
+            1,  # Multiple pages will be needed
+        ),
+    )
+    def test_list_collection(self, svc, page_size):
+        if page_size is not None:
+            svc.page_size = page_size
+
         files = svc.list_collection("8")
+
         assert files == [
             {
                 "type": "File",
@@ -318,7 +341,7 @@ def admin_oauth_http_service(self):
         svc.get.side_effect = self.get_request_handler(is_admin=True)
         return svc
 
-    def get_request_handler(self, collections=None, is_admin=False):
+    def get_request_handler(self, collections=None, is_admin=False):  # noqa: PLR0915 pylint:disable=too-complex
         """
         Create a handler for `GET` requests to the Canvas Studio API.
 
@@ -350,36 +373,63 @@ def make_file(id_, title, created_at, with_thumbnail=False):
                 make_collection(10, None, "course_wide", "2024-03-01"),
             ]
 
-        def handler(url, allow_redirects=True):
-            api_prefix = "https://hypothesis.instructuremedia.com/api/public/v1/"
-            assert url.startswith(api_prefix)
+        def handler(url, allow_redirects=True):  # noqa: PLR0912, PLR0915
+            api_prefix = "/api/public/v1/"
+
+            parsed_url = urlparse(url)
+            assert parsed_url.scheme == "https"
+            assert parsed_url.netloc == "hypothesis.instructuremedia.com"
+            assert parsed_url.path.startswith(api_prefix)
 
-            url_suffix = url[len(api_prefix) :]
             json_data = None
             status_code = 200
-
-            match url_suffix:
+            per_page = 20
+            page = 1
+
+            params = parse_qs(parsed_url.query)
+            if "per_page" in params:
+                per_page = int(params["per_page"][0])
+            if "page" in params:
+                page = int(params["page"][0])
+
+            def collection_data(items_field, items):
+                n_pages = max(math.ceil(len(items) / per_page), 1)
+                assert 1 <= page <= n_pages
+
+                start = (page - 1) * per_page
+                end = start + per_page
+
+                return {
+                    items_field: items[start:end],
+                    "meta": {
+                        "current_page": page,
+                        "last_page": n_pages,
+                    },
+                }
+
+            resource_path = parsed_url.path[len(api_prefix) :]
+            match resource_path:
                 case "collections":
-                    json_data = {
-                        "collections": collections.copy(),
-                    }
+                    json_data = collection_data("collections", collections.copy())
                 case "collections/1/media":
-                    json_data = {
-                        "media": [
+                    json_data = collection_data(
+                        "media",
+                        [
                             make_file(5, "Test video", "2024-02-03"),
-                        ]
-                    }
+                        ],
+                    )
                 case "collections/8/media":
-                    json_data = {
-                        "media": [
+                    json_data = collection_data(
+                        "media",
+                        [
                             make_file(
                                 7, "Some video", "2024-02-04", with_thumbnail=True
                             ),
                             make_file(
                                 6, "Another video", "2024-02-04", with_thumbnail=True
                             ),
-                        ]
-                    }
+                        ],
+                    )
                 case "media/42/caption_files":
                     assert is_admin
                     json_data = {