Fix nightly tests: modify kubectl exec syntax for creating new minio bucket (#2097)

Fixes #2096

For an example of a failing test run, see:
https://github.com/webrecorder/browsertrix/actions/runs/11121185534/job/30899729448

---------
Co-authored-by: Ilya Kreymer <[email protected]>
tw4l authored Oct 22, 2024
1 parent 1b1819b commit f7426cc
Showing 4 changed files with 56 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/k3d-nightly-ci.yaml
@@ -79,7 +79,7 @@ jobs:
         run: kubectl wait --for=condition=ready pod --all --timeout=240s

       - name: Create Extra Test Buckets
-        run: kubectl exec -i deployment/local-minio -c minio mkdir /data/replica-0
+        run: kubectl exec -i deployment/local-minio -c minio -- mkdir /data/replica-0

       - name: Run Tests
         run: pytest -vv ./backend/test_nightly/test_*.py
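The fix itself is the `--` separator: kubectl treats everything after `--` as the command to run inside the container rather than as its own arguments, and recent kubectl releases require the explicit separator for `kubectl exec`. A minimal sketch of the same step driven from Python (a hypothetical helper, assuming kubectl is on PATH and the chart's local-minio deployment exists) might look like:

```python
import subprocess


def create_replica_bucket(path: str = "/data/replica-0") -> None:
    """Hypothetical helper mirroring the workflow step above."""
    subprocess.run(
        [
            "kubectl", "exec", "-i", "deployment/local-minio",
            "-c", "minio",
            "--",  # everything after this runs inside the minio container
            "mkdir", path,
        ],
        check=True,  # raise if kubectl or mkdir fails
    )


if __name__ == "__main__":
    create_replica_bucket()
```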
51 changes: 31 additions & 20 deletions backend/btrixcloud/crawlconfigs.py
@@ -592,8 +592,9 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         update_query: dict[str, object] = {}

         running_crawl = await self.get_running_crawl(cid)
-        # only look up last finished crawl if no crawls running, otherwise
-        # lastCrawl* stats are already for running crawl
+
+        # If crawl is running, lastCrawl* stats are already for running crawl,
+        # so there's nothing to update other than size and crawl count
         if not running_crawl:
             match_query = {
                 "cid": cid,
@@ -603,26 +604,36 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             last_crawl = await self.crawls.find_one(
                 match_query, sort=[("finished", pymongo.DESCENDING)]
             )
-        else:
-            last_crawl = None

-        if last_crawl:
-            last_crawl_finished = last_crawl.get("finished")
-
-            update_query["lastCrawlId"] = str(last_crawl.get("_id"))
-            update_query["lastCrawlStartTime"] = last_crawl.get("started")
-            update_query["lastStartedBy"] = last_crawl.get("userid")
-            update_query["lastStartedByName"] = last_crawl.get("userName")
-            update_query["lastCrawlTime"] = last_crawl_finished
-            update_query["lastCrawlState"] = last_crawl.get("state")
-            update_query["lastCrawlSize"] = sum(
-                file_.get("size", 0) for file_ in last_crawl.get("files", [])
-            )
-            update_query["lastCrawlStopping"] = False
-            update_query["isCrawlRunning"] = False
+            # Update to reflect last crawl
+            if last_crawl:
+                last_crawl_finished = last_crawl.get("finished")
+
+                update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+                update_query["lastCrawlStartTime"] = last_crawl.get("started")
+                update_query["lastStartedBy"] = last_crawl.get("userid")
+                update_query["lastStartedByName"] = last_crawl.get("userName")
+                update_query["lastCrawlTime"] = last_crawl_finished
+                update_query["lastCrawlState"] = last_crawl.get("state")
+                update_query["lastCrawlSize"] = sum(
+                    file_.get("size", 0) for file_ in last_crawl.get("files", [])
+                )
+                update_query["lastCrawlStopping"] = False
+                update_query["isCrawlRunning"] = False

-            if last_crawl_finished:
-                update_query["lastRun"] = last_crawl_finished
+                if last_crawl_finished:
+                    update_query["lastRun"] = last_crawl_finished
+            # If no last crawl exists and no running crawl, reset stats
+            else:
+                update_query["lastCrawlId"] = None
+                update_query["lastCrawlStartTime"] = None
+                update_query["lastStartedBy"] = None
+                update_query["lastStartedByName"] = None
+                update_query["lastCrawlTime"] = None
+                update_query["lastCrawlState"] = None
+                update_query["lastCrawlSize"] = 0
+                update_query["lastRun"] = None
+                update_query["isCrawlRunning"] = False

         result = await self.crawl_configs.find_one_and_update(
             {"_id": cid, "inactive": {"$ne": True}},
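The hunk ends just as `update_query` is passed to `find_one_and_update` on the workflow's config document. As a standalone sketch of how such a call is typically made with an async MongoDB driver such as Motor, using a hypothetical connection string and collection name rather than the project's actual wiring:

```python
import asyncio
from typing import Any, Optional
from uuid import UUID

import pymongo
from motor.motor_asyncio import AsyncIOMotorClient


async def apply_stats_update(cid: UUID, update_query: dict[str, Any]) -> Optional[dict]:
    # Hypothetical connection/collection names, for illustration only.
    client = AsyncIOMotorClient("mongodb://localhost:27017", uuidRepresentation="standard")
    crawl_configs = client["example_db"]["crawl_configs"]

    # $set only touches the listed fields; setting a field to None clears it,
    # which is how the "reset stats" branch above takes effect.
    return await crawl_configs.find_one_and_update(
        {"_id": cid, "inactive": {"$ne": True}},
        {"$set": update_query},
        return_document=pymongo.ReturnDocument.AFTER,
    )


if __name__ == "__main__":
    updated = asyncio.run(apply_stats_update(UUID(int=0), {"lastCrawlId": None, "lastRun": None}))
    print(updated)
```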
3 changes: 3 additions & 0 deletions backend/test_nightly/test_crawlconfig_crawl_stats.py
@@ -1,4 +1,5 @@
 import requests
+import time

 from .conftest import API_PREFIX

@@ -70,6 +71,8 @@ def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_config_id):
     data = r.json()
     assert data["deleted"]

+    time.sleep(10)
+
     # Verify crawl stats from /crawlconfigs
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
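The new `time.sleep(10)` gives the backend time to finish recomputing the workflow's lastCrawl* stats (presumably via the `stats_recompute_last` logic above) after the crawl is deleted, before the test re-reads them. An illustrative alternative, not part of this commit, would be to poll until the stats settle instead of sleeping a fixed interval; a sketch, assuming the response exposes fields like `crawlCount` and `lastCrawlId`:

```python
import time

import requests


def wait_for_stats_reset(api_prefix, org_id, config_id, headers, timeout=60):
    """Poll the crawlconfig until its crawl stats reflect the deletion (sketch)."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        r = requests.get(
            f"{api_prefix}/orgs/{org_id}/crawlconfigs/{config_id}",
            headers=headers,
        )
        data = r.json()
        # Field names here are assumptions for illustration.
        if data.get("crawlCount") == 0 and data.get("lastCrawlId") is None:
            return data
        time.sleep(2)
    raise TimeoutError("crawl stats were not recomputed in time")
```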
29 changes: 21 additions & 8 deletions backend/test_nightly/test_storage_quota.py
@@ -8,18 +8,19 @@
 from .utils import get_crawl_status


-STORAGE_QUOTA_KB = 5
-STORAGE_QUOTA_BYTES = STORAGE_QUOTA_KB * 1000
+STORAGE_QUOTA_MB_TO_INCREASE = 5
+STORAGE_QUOTA_BYTES_INC = STORAGE_QUOTA_MB_TO_INCREASE * 1000 * 1000

 config_id = None

+storage_quota = None

 def run_crawl(org_id, headers):
     crawl_data = {
         "runNow": True,
         "name": "Storage Quota",
         "config": {
-            "seeds": [{"url": "https://webrecorder.net/"}],
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
             "extraHops": 1,
         },
     }
@@ -34,10 +35,22 @@ def run_crawl(org_id, headers):


 def test_storage_quota(org_with_quotas, admin_auth_headers):
+    # Get current storage usage
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{org_with_quotas}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    bytes_stored = r.json()["bytesStored"]
+
+    global storage_quota
+    storage_quota = bytes_stored + STORAGE_QUOTA_BYTES_INC
+
+    # Set storage quota higher than bytesStored
     r = requests.post(
         f"{API_PREFIX}/orgs/{org_with_quotas}/quotas",
         headers=admin_auth_headers,
-        json={"storageQuota": STORAGE_QUOTA_BYTES},
+        json={"storageQuota": storage_quota},
     )
     assert r.status_code == 200
     assert r.json()["updated"]
@@ -49,9 +62,12 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_headers):
     crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers)
     time.sleep(1)

+    assert crawl_id
+
     while get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) in (
         "starting",
         "waiting_capacity",
+        "waiting_org_limit",
     ):
         time.sleep(2)

@@ -63,22 +79,18 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_headers):
     ):
         time.sleep(2)

     # Ensure that crawl was stopped by quota
     assert (
         get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers)
         == "stopped_storage_quota_reached"
     )

     time.sleep(10)

     # Ensure crawl storage went over quota
     r = requests.get(
         f"{API_PREFIX}/orgs/{org_with_quotas}",
         headers=admin_auth_headers,
     )
     data = r.json()
     bytes_stored = data["bytesStored"]
-    assert bytes_stored >= STORAGE_QUOTA_BYTES
+    assert bytes_stored >= storage_quota

     time.sleep(5)
