Implement downloading archived item + QA runs as multi-WACZ #1933

Merged Jul 25, 2024 · 18 commits
25 changes: 25 additions & 0 deletions backend/btrixcloud/basecrawls.py
@@ -8,6 +8,7 @@

import asyncio
from fastapi import HTTPException, Depends
from fastapi.responses import StreamingResponse

from .models import (
    CrawlFile,
@@ -797,6 +798,20 @@ async def get_all_crawl_search_values(
            "firstSeeds": list(first_seeds),
        }

    async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
        """Download all WACZs in archived item as streaming nested WACZ"""
        crawl = await self.get_crawl_out(crawl_id, org)

        if not crawl.resources:
            raise HTTPException(status_code=400, detail="no_crawl_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)

        headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def calculate_org_crawl_file_storage(
        self, oid: UUID, type_: Optional[str] = None
    ) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org)

    @app.get(
        "/orgs/{oid}/all-crawls/{crawl_id}/download",
        tags=["all-crawls"],
        response_model=bytes,
    )
    async def download_base_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/all-crawls/{crawl_id}",
        tags=["all-crawls"],
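The new `download` endpoints stream the combined WACZ rather than buffering it, so a client can write it straight to disk. A minimal client sketch (the URL, IDs, and token are placeholders; this mirrors the tests added later in this PR):

```python
import requests

# Placeholder values -- substitute a real deployment URL, org ID, item ID, and token
API_PREFIX = "https://app.example.com/api"
org_id = "..."
crawl_id = "..."
token = "..."

url = f"{API_PREFIX}/orgs/{org_id}/all-crawls/{crawl_id}/download"
with requests.get(url, headers={"Authorization": f"Bearer {token}"}, stream=True) as r:
    r.raise_for_status()
    # Write the streamed multi-WACZ to disk chunk by chunk
    with open(f"{crawl_id}.wacz", "wb") as fh:
        for chunk in r.iter_content(chunk_size=1024 * 256):
            fh.write(chunk)
```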
40 changes: 40 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -1008,6 +1008,28 @@ async def get_qa_run_for_replay(

        return QARunWithResources(**qa_run_dict)

    async def download_qa_run_as_single_wacz(
        self, crawl_id: str, qa_run_id: str, org: Organization
    ):
        """Download all WACZs in a QA run as streaming nested WACZ"""
        qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
        if not qa_run.finished:
            raise HTTPException(status_code=400, detail="qa_run_not_finished")

        if not qa_run.resources:
            raise HTTPException(status_code=400, detail="qa_run_no_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)

        finished = qa_run.finished.isoformat()

        headers = {
            "Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
        }
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def get_qa_run_aggregate_stats(
        self,
        crawl_id: str,
@@ -1226,6 +1248,14 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "crawl")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
    )
    async def download_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    # QA APIs
    # ---------------------
    @app.get(
@@ -1249,6 +1279,16 @@ async def get_qa_run(
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
        tags=["qa"],
        response_model=bytes,
    )
    async def download_qa_run_as_single_wacz(
        crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
        tags=["qa"],
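The QA variant works the same way, but the served filename embeds the run's completion timestamp (`qa-{finished}-crawl-{crawl_id}.wacz`). Continuing the client sketch above, with `qa_run_id` likewise a placeholder:

```python
# Same placeholder API_PREFIX, org_id, crawl_id, and token as the earlier sketch
qa_run_id = "..."
url = f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/qa/{qa_run_id}/download"
with requests.get(url, headers={"Authorization": f"Bearer {token}"}, stream=True) as r:
    r.raise_for_status()
    # The suggested filename arrives via Content-Disposition, e.g.
    # attachment; filename="qa-2024-07-25T12:00:00+00:00-crawl-<crawl_id>.wacz"
    print(r.headers.get("Content-Disposition"))
```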
12 changes: 8 additions & 4 deletions backend/btrixcloud/storages.py
@@ -11,6 +11,7 @@
    AsyncIterator,
    TYPE_CHECKING,
    Any,
    cast,
)
from urllib.parse import urlsplit
from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@
from zipfile import ZipInfo

from fastapi import Depends, HTTPException
-from stream_zip import stream_zip, NO_COMPRESSION_64
+from stream_zip import stream_zip, NO_COMPRESSION_64, Method
from remotezip import RemoteZip

import aiobotocore.session
@@ -698,15 +699,17 @@ def get_file(name) -> Iterator[bytes]:
            response = client.get_object(Bucket=bucket, Key=key + name)
            return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)

-        def member_files():
+        def member_files() -> (
+            Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
+        ):
            modified_at = datetime(year=1980, month=1, day=1)
            perms = 0o664
            for file_ in all_files:
                yield (
                    file_.name,
                    modified_at,
                    perms,
-                    NO_COMPRESSION_64(file_.size, file_.crc32),
+                    NO_COMPRESSION_64(file_.size, 0),
                    get_file(file_.name),
                )

@@ -720,7 +723,8 @@ def member_files():
                (datapackage_bytes,),
            )

-        return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
+        # stream_zip() returns an Iterator but is typed as an Iterable, so cast
+        return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))

    async def download_streaming_wacz(
        self, org: Organization, files: List[CrawlFileOut]
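The nested WACZ is assembled on the fly with `stream_zip`: every member is stored uncompressed (`NO_COMPRESSION_64`), so the WACZ payloads pass through without recompression and the ZIP can be emitted as a pure byte stream. A minimal sketch of the member-tuple contract, assuming the pinned `ikreymer/stream-zip` fork from requirements.txt below, in which `NO_COMPRESSION_64` takes the size plus an optional CRC-32 (0 meaning "not precomputed"):

```python
from datetime import datetime
from stream_zip import stream_zip, NO_COMPRESSION_64

def members():
    # Each member is (name, modified_at, mode, method, iterable of byte chunks)
    payload = b'{"profile": "data-package"}'
    yield (
        "datapackage.json",
        datetime(1980, 1, 1),                # fixed timestamp, as in the diff above
        0o664,
        NO_COMPRESSION_64(len(payload), 0),  # stored as-is; CRC-32 not precomputed
        (payload,),
    )

with open("nested.wacz", "wb") as fh:
    for chunk in stream_zip(members(), chunk_size=1024 * 256):
        fh.write(chunk)
```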
10 changes: 10 additions & 0 deletions backend/btrixcloud/uploads.py
@@ -423,6 +423,16 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "upload")

    @app.get(
        "/orgs/{oid}/uploads/{crawl_id}/download",
        tags=["uploads"],
        response_model=bytes,
    )
    async def download_upload_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/uploads/{crawl_id}",
        tags=["uploads"],
3 changes: 1 addition & 2 deletions backend/requirements.txt
@@ -17,8 +17,7 @@ jinja2
humanize
python-multipart
pathvalidate
-#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
-https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
+https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
boto3
backoff>=2.2.1
python-slugify>=8.0.1
29 changes: 29 additions & 0 deletions backend/test/test_qa.py
@@ -2,6 +2,8 @@
import requests
import time
from datetime import datetime
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
        last_count = crawl_qa_count


def test_download_wacz_crawls(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED

def test_delete_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
36 changes: 36 additions & 0 deletions backend/test/test_run_crawl.py
@@ -6,6 +6,10 @@
import re
import csv
import codecs
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
assert len(pages.strip().split("\n")) == 4


@pytest.mark.parametrize(
    "type_path",
    [
        # crawls endpoint
        ("crawls"),
        # all-crawls endpoint
        ("all-crawls"),
    ],
)
def test_download_wacz_crawls(
    admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_update_crawl(
    admin_auth_headers,
    default_org_id,
23 changes: 23 additions & 0 deletions backend/test/test_uploads.py
@@ -1,7 +1,9 @@
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
assert data["collectionIds"] == UPDATED_COLLECTION_IDS


def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_stream_upload(
    admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):
@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
        ]);
        break;
      case "files":
-        sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
+        sectionContent = this.renderPanel(
+          html` ${this.renderTitle(msg("Files"))}
+            <sl-tooltip content=${msg("Download all files as a single WACZ")}>
+              <sl-button
+                href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
+                download
+                size="small"
+                variant="primary"
+              >
+                <sl-icon slot="prefix" name="cloud-download"></sl-icon>
+                ${msg("Download Item")}
+              </sl-button>
+            </sl-tooltip>`,
+          this.renderFiles(),
+        );
        break;
      case "logs":
        sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
  private renderMenu() {
    if (!this.crawl) return;

    const authToken = this.authState!.headers.Authorization.split(" ")[1];

    return html`
      <sl-dropdown placement="bottom-end" distance="4" hoist>
        <sl-button slot="trigger" size="small" caret
@@ -609,6 +625,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(this.crawl.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(this.crawl.state),
() => html`
Expand All @@ -618,7 +647,7 @@ export class ArchivedItemDetail extends TailwindElement {
@click=${() => void this.deleteCrawl()}
>
<sl-icon name="trash3" slot="prefix"></sl-icon>
${msg("Delete Crawl")}
${msg("Delete Item")}
Shrinks99 marked this conversation as resolved.
Show resolved Hide resolved
</sl-menu-item>
`,
)}
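Because a plain `<a download>` link cannot set an `Authorization` header, the frontend appends the bearer token as an `auth_bearer` query parameter instead. A rough Python equivalent of the request those links produce (same placeholders as the earlier client sketch; assumes the backend accepts the token via query string, as the link construction implies):

```python
# Same placeholder API_PREFIX, org_id, crawl_id, and token as the first sketch
url = f"{API_PREFIX}/orgs/{org_id}/all-crawls/{crawl_id}/download"
with requests.get(url, params={"auth_bearer": token}, stream=True) as r:
    r.raise_for_status()
```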