diff --git a/.github/workflows/k3d-nightly-ci.yaml b/.github/workflows/k3d-nightly-ci.yaml
index a4db9cfb2..c350f41e1 100644
--- a/.github/workflows/k3d-nightly-ci.yaml
+++ b/.github/workflows/k3d-nightly-ci.yaml
@@ -79,7 +79,7 @@ jobs:
         run: kubectl wait --for=condition=ready pod --all --timeout=240s

       - name: Create Extra Test Buckets
-        run: kubectl exec -i deployment/local-minio -c minio mkdir /data/replica-0
+        run: kubectl exec -i deployment/local-minio -c minio -- mkdir /data/replica-0

       - name: Run Tests
         run: pytest -vv ./backend/test_nightly/test_*.py
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 5a0551f14..8d231733c 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -592,8 +592,9 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         update_query: dict[str, object] = {}

         running_crawl = await self.get_running_crawl(cid)
-        # only look up last finished crawl if no crawls running, otherwise
-        # lastCrawl* stats are already for running crawl
+
+        # If crawl is running, lastCrawl* stats are already for running crawl,
+        # so there's nothing to update other than size and crawl count
         if not running_crawl:
             match_query = {
                 "cid": cid,
@@ -603,26 +604,36 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             last_crawl = await self.crawls.find_one(
                 match_query, sort=[("finished", pymongo.DESCENDING)]
             )
-        else:
-            last_crawl = None
-
-        if last_crawl:
-            last_crawl_finished = last_crawl.get("finished")
-            update_query["lastCrawlId"] = str(last_crawl.get("_id"))
-            update_query["lastCrawlStartTime"] = last_crawl.get("started")
-            update_query["lastStartedBy"] = last_crawl.get("userid")
-            update_query["lastStartedByName"] = last_crawl.get("userName")
-            update_query["lastCrawlTime"] = last_crawl_finished
-            update_query["lastCrawlState"] = last_crawl.get("state")
-            update_query["lastCrawlSize"] = sum(
-                file_.get("size", 0) for file_ in last_crawl.get("files", [])
-            )
-            update_query["lastCrawlStopping"] = False
-            update_query["isCrawlRunning"] = False
+            # Update to reflect last crawl
+            if last_crawl:
+                last_crawl_finished = last_crawl.get("finished")
+
+                update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+                update_query["lastCrawlStartTime"] = last_crawl.get("started")
+                update_query["lastStartedBy"] = last_crawl.get("userid")
+                update_query["lastStartedByName"] = last_crawl.get("userName")
+                update_query["lastCrawlTime"] = last_crawl_finished
+                update_query["lastCrawlState"] = last_crawl.get("state")
+                update_query["lastCrawlSize"] = sum(
+                    file_.get("size", 0) for file_ in last_crawl.get("files", [])
+                )
+                update_query["lastCrawlStopping"] = False
+                update_query["isCrawlRunning"] = False

-            if last_crawl_finished:
-                update_query["lastRun"] = last_crawl_finished
+                if last_crawl_finished:
+                    update_query["lastRun"] = last_crawl_finished
+            # If no last crawl exists and no running crawl, reset stats
+            else:
+                update_query["lastCrawlId"] = None
+                update_query["lastCrawlStartTime"] = None
+                update_query["lastStartedBy"] = None
+                update_query["lastStartedByName"] = None
+                update_query["lastCrawlTime"] = None
+                update_query["lastCrawlState"] = None
+                update_query["lastCrawlSize"] = 0
+                update_query["lastRun"] = None
+                update_query["isCrawlRunning"] = False

         result = await self.crawl_configs.find_one_and_update(
             {"_id": cid, "inactive": {"$ne": True}},
diff --git a/backend/test_nightly/test_crawlconfig_crawl_stats.py b/backend/test_nightly/test_crawlconfig_crawl_stats.py
index 722e50cdb..690ea3bdd 100644
--- a/backend/test_nightly/test_crawlconfig_crawl_stats.py
+++ b/backend/test_nightly/test_crawlconfig_crawl_stats.py
@@ -1,4 +1,5 @@
 import requests
+import time

 from .conftest import API_PREFIX

@@ -70,6 +71,8 @@ def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_confi
     data = r.json()
     assert data["deleted"]

+    time.sleep(10)
+
     # Verify crawl stats from /crawlconfigs
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
diff --git a/backend/test_nightly/test_storage_quota.py b/backend/test_nightly/test_storage_quota.py
index fee742ede..212d96e0a 100644
--- a/backend/test_nightly/test_storage_quota.py
+++ b/backend/test_nightly/test_storage_quota.py
@@ -8,18 +8,19 @@
 from .utils import get_crawl_status

-STORAGE_QUOTA_KB = 5
-STORAGE_QUOTA_BYTES = STORAGE_QUOTA_KB * 1000
+STORAGE_QUOTA_MB_TO_INCREASE = 5
+STORAGE_QUOTA_BYTES_INC = STORAGE_QUOTA_MB_TO_INCREASE * 1000 * 1000

 config_id = None
+storage_quota = None


 def run_crawl(org_id, headers):
     crawl_data = {
         "runNow": True,
         "name": "Storage Quota",
         "config": {
-            "seeds": [{"url": "https://webrecorder.net/"}],
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
             "extraHops": 1,
         },
     }
@@ -34,10 +35,22 @@ def run_crawl(org_id, headers):
     return crawl_id, data["id"]


 def test_storage_quota(org_with_quotas, admin_auth_headers):
+    # Get current storage usage
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{org_with_quotas}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    bytes_stored = r.json()["bytesStored"]
+
+    global storage_quota
+    storage_quota = bytes_stored + STORAGE_QUOTA_BYTES_INC
+
+    # Set storage quota higher than bytesStored
     r = requests.post(
         f"{API_PREFIX}/orgs/{org_with_quotas}/quotas",
         headers=admin_auth_headers,
-        json={"storageQuota": STORAGE_QUOTA_BYTES},
+        json={"storageQuota": storage_quota},
     )
     assert r.status_code == 200
     assert r.json()["updated"]
@@ -49,9 +62,12 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers)
     time.sleep(1)

+    assert crawl_id
+
     while get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) in (
         "starting",
         "waiting_capacity",
+        "waiting_org_limit",
     ):
         time.sleep(2)

@@ -63,14 +79,11 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     ):
         time.sleep(2)

-    # Ensure that crawl was stopped by quota
     assert (
         get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers)
         == "stopped_storage_quota_reached"
     )

-    time.sleep(10)
-
     # Ensure crawl storage went over quota
     r = requests.get(
         f"{API_PREFIX}/orgs/{org_with_quotas}",
@@ -78,7 +91,7 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     )
     data = r.json()
     bytes_stored = data["bytesStored"]
-    assert bytes_stored >= STORAGE_QUOTA_BYTES
+    assert bytes_stored >= storage_quota

     time.sleep(5)