Implement downloading archived item + QA runs as multi-WACZ #1933

Merged Jul 25, 2024 · 18 commits
25 changes: 25 additions & 0 deletions backend/btrixcloud/basecrawls.py
@@ -8,6 +8,7 @@

import asyncio
from fastapi import HTTPException, Depends
from fastapi.responses import StreamingResponse

from .models import (
    CrawlFile,
@@ -797,6 +798,20 @@ async def get_all_crawl_search_values(
            "firstSeeds": list(first_seeds),
        }

    async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
        """Download all WACZs in archived item as streaming nested WACZ"""
        crawl = await self.get_crawl_out(crawl_id, org)

        if not crawl.resources:
            raise HTTPException(status_code=400, detail="no_crawl_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)

        headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def calculate_org_crawl_file_storage(
        self, oid: UUID, type_: Optional[str] = None
    ) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org)

    @app.get(
        "/orgs/{oid}/all-crawls/{crawl_id}/download",
        tags=["all-crawls"],
        response_model=bytes,
    )
    async def download_base_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/all-crawls/{crawl_id}",
        tags=["all-crawls"],
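The new `download` endpoints stream the combined WACZ rather than buffering it, so a client can write it straight to disk. A minimal client sketch (the URL, IDs, and token are placeholders; this mirrors the tests added later in this PR):

```python
import requests

# Placeholder values -- substitute a real deployment URL, org ID, item ID, and token
API_PREFIX = "https://app.example.com/api"
org_id = "..."
crawl_id = "..."
token = "..."

url = f"{API_PREFIX}/orgs/{org_id}/all-crawls/{crawl_id}/download"
with requests.get(url, headers={"Authorization": f"Bearer {token}"}, stream=True) as r:
    r.raise_for_status()
    # Write the streamed multi-WACZ to disk chunk by chunk
    with open(f"{crawl_id}.wacz", "wb") as fh:
        for chunk in r.iter_content(chunk_size=1024 * 256):
            fh.write(chunk)
```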
40 changes: 40 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -1008,6 +1008,28 @@ async def get_qa_run_for_replay(

        return QARunWithResources(**qa_run_dict)

    async def download_qa_run_as_single_wacz(
        self, crawl_id: str, qa_run_id: str, org: Organization
    ):
        """Download all WACZs in a QA run as streaming nested WACZ"""
        qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
        if not qa_run.finished:
            raise HTTPException(status_code=400, detail="qa_run_not_finished")

        if not qa_run.resources:
            raise HTTPException(status_code=400, detail="qa_run_no_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)

        finished = qa_run.finished.isoformat()

        headers = {
            "Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
        }
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def get_qa_run_aggregate_stats(
        self,
        crawl_id: str,
@@ -1226,6 +1248,14 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "crawl")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
    )
    async def download_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    # QA APIs
    # ---------------------
    @app.get(
@@ -1249,6 +1279,16 @@ async def get_qa_run(
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
        tags=["qa"],
        response_model=bytes,
    )
    async def download_qa_run_as_single_wacz(
        crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
        tags=["qa"],
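The QA variant works the same way, but the served filename embeds the run's completion timestamp (`qa-{finished}-crawl-{crawl_id}.wacz`). Continuing the client sketch above, with `qa_run_id` likewise a placeholder:

```python
# Same placeholder API_PREFIX, org_id, crawl_id, and token as the earlier sketch
qa_run_id = "..."
url = f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/qa/{qa_run_id}/download"
with requests.get(url, headers={"Authorization": f"Bearer {token}"}, stream=True) as r:
    r.raise_for_status()
    # The suggested filename arrives via Content-Disposition, e.g.
    # attachment; filename="qa-2024-07-25T12:00:00+00:00-crawl-<crawl_id>.wacz"
    print(r.headers.get("Content-Disposition"))
```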
12 changes: 8 additions & 4 deletions backend/btrixcloud/storages.py
@@ -11,6 +11,7 @@
    AsyncIterator,
    TYPE_CHECKING,
    Any,
    cast,
)
from urllib.parse import urlsplit
from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@
from zipfile import ZipInfo

from fastapi import Depends, HTTPException
-from stream_zip import stream_zip, NO_COMPRESSION_64
+from stream_zip import stream_zip, NO_COMPRESSION_64, Method
from remotezip import RemoteZip

import aiobotocore.session
@@ -698,15 +699,17 @@ def get_file(name) -> Iterator[bytes]:
            response = client.get_object(Bucket=bucket, Key=key + name)
            return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)

-        def member_files():
+        def member_files() -> (
+            Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
+        ):
            modified_at = datetime(year=1980, month=1, day=1)
            perms = 0o664
            for file_ in all_files:
                yield (
                    file_.name,
                    modified_at,
                    perms,
-                    NO_COMPRESSION_64(file_.size, file_.crc32),
+                    NO_COMPRESSION_64(file_.size, 0),
                    get_file(file_.name),
                )

@@ -720,7 +723,8 @@ def member_files():
                (datapackage_bytes,),
            )

-        return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
+        # stream_zip() returns an Iterator but is typed as an Iterable, so cast
+        return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))

    async def download_streaming_wacz(
        self, org: Organization, files: List[CrawlFileOut]
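The nested WACZ is assembled on the fly with `stream_zip`: every member is stored uncompressed (`NO_COMPRESSION_64`), so the WACZ payloads pass through without recompression and the ZIP can be emitted as a pure byte stream. A minimal sketch of the member-tuple contract, assuming the pinned `ikreymer/stream-zip` fork from requirements.txt below, in which `NO_COMPRESSION_64` takes the size plus an optional CRC-32 (0 meaning "not precomputed"):

```python
from datetime import datetime
from stream_zip import stream_zip, NO_COMPRESSION_64

def members():
    # Each member is (name, modified_at, mode, method, iterable of byte chunks)
    payload = b'{"profile": "data-package"}'
    yield (
        "datapackage.json",
        datetime(1980, 1, 1),                # fixed timestamp, as in the diff above
        0o664,
        NO_COMPRESSION_64(len(payload), 0),  # stored as-is; CRC-32 not precomputed
        (payload,),
    )

with open("nested.wacz", "wb") as fh:
    for chunk in stream_zip(members(), chunk_size=1024 * 256):
        fh.write(chunk)
```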
10 changes: 10 additions & 0 deletions backend/btrixcloud/uploads.py
@@ -423,6 +423,16 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
    async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "upload")

    @app.get(
        "/orgs/{oid}/uploads/{crawl_id}/download",
        tags=["uploads"],
        response_model=bytes,
    )
    async def download_upload_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/uploads/{crawl_id}",
        tags=["uploads"],
3 changes: 1 addition & 2 deletions backend/requirements.txt
@@ -17,8 +17,7 @@ jinja2
humanize
python-multipart
pathvalidate
-#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
-https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
+https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
boto3
backoff>=2.2.1
python-slugify>=8.0.1
29 changes: 29 additions & 0 deletions backend/test/test_qa.py
@@ -2,6 +2,8 @@
import requests
import time
from datetime import datetime
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
        last_count = crawl_qa_count


def test_download_wacz_crawls(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED

def test_delete_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
36 changes: 36 additions & 0 deletions backend/test/test_run_crawl.py
@@ -6,6 +6,10 @@
import re
import csv
import codecs
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
assert len(pages.strip().split("\n")) == 4


@pytest.mark.parametrize(
    "type_path",
    [
        # crawls endpoint
        ("crawls"),
        # all-crawls endpoint
        ("all-crawls"),
    ],
)
def test_download_wacz_crawls(
    admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_update_crawl(
    admin_auth_headers,
    default_org_id,
23 changes: 23 additions & 0 deletions backend/test/test_uploads.py
@@ -1,7 +1,9 @@
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
assert data["collectionIds"] == UPDATED_COLLECTION_IDS


def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_stream_upload(
    admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):
@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
        ]);
        break;
      case "files":
-        sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
+        sectionContent = this.renderPanel(
+          html` ${this.renderTitle(msg("Files"))}
+            <sl-tooltip content=${msg("Download all files as a single WACZ")}>
+              <sl-button
+                href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
+                download
+                size="small"
+                variant="primary"
+              >
+                <sl-icon slot="prefix" name="cloud-download"></sl-icon>
+                ${msg("Download Item")}
+              </sl-button>
+            </sl-tooltip>`,
+          this.renderFiles(),
+        );
        break;
      case "logs":
        sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
  private renderMenu() {
    if (!this.crawl) return;

    const authToken = this.authState!.headers.Authorization.split(" ")[1];

    return html`
      <sl-dropdown placement="bottom-end" distance="4" hoist>
        <sl-button slot="trigger" size="small" caret
@@ -609,6 +625,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(this.crawl.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(this.crawl.state),
() => html`
Expand All @@ -618,7 +647,7 @@ export class ArchivedItemDetail extends TailwindElement {
@click=${() => void this.deleteCrawl()}
>
<sl-icon name="trash3" slot="prefix"></sl-icon>
${msg("Delete Crawl")}
${msg("Delete Item")}
Shrinks99 marked this conversation as resolved.
Show resolved Hide resolved
</sl-menu-item>
`,
)}
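Because a plain `<a download>` link cannot set an `Authorization` header, the frontend appends the bearer token as an `auth_bearer` query parameter instead. A rough Python equivalent of the request those links produce (same placeholders as the earlier client sketch; assumes the backend accepts the token via query string, as the link construction implies):

```python
# Same placeholder API_PREFIX, org_id, crawl_id, and token as the first sketch
url = f"{API_PREFIX}/orgs/{org_id}/all-crawls/{crawl_id}/download"
with requests.get(url, params={"auth_bearer": token}, stream=True) as r:
    r.raise_for_status()
```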