-
-
Notifications
You must be signed in to change notification settings - Fork 219
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[#799] wb-manager: Add wacz archives to collection with --uncompress-…
…wacz (#800) Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz. A future commit will add pywb support for WACZ files without requiring them to be unpacked.
- Loading branch information
1 parent
b869330
commit 454486b
Showing
6 changed files
with
225 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import os | ||
|
||
import pytest | ||
|
||
from pywb.manager.manager import CollectionsManager | ||
|
||
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz' | ||
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz' | ||
|
||
TEST_COLLECTION_NAME = 'test-col' | ||
|
||
|
||
class TestManager: | ||
def test_add_valid_wacz_uncompressed(self, tmp_path): | ||
"""Test if adding a valid wacz file to a collection succeeds""" | ||
manager = self.get_test_collections_manager(tmp_path) | ||
manager._add_wacz_uncompressed(VALID_WACZ_PATH) | ||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) | ||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) | ||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: | ||
assert '"filename": "valid_example_1-0.warc"' in f.read() | ||
|
||
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog): | ||
"""Test if adding an invalid wacz file to a collection fails""" | ||
manager = self.get_test_collections_manager(tmp_path) | ||
manager._add_wacz_uncompressed(INVALID_WACZ_PATH) | ||
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir) | ||
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text | ||
|
||
index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE) | ||
if os.path.exists(index_path): | ||
with open(index_path, 'r') as f: | ||
assert '"filename": "invalid_example_1-0.warc"' not in f.read() | ||
|
||
def test_add_valid_archives_uncompressed_wacz(self, tmp_path): | ||
manager = self.get_test_collections_manager(tmp_path) | ||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', | ||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', | ||
'sample_archive/waczs/valid_example_1.wacz'] | ||
manager.add_archives(archives, uncompress_wacz=True) | ||
|
||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: | ||
index_text = f.read() | ||
|
||
for archive in archives: | ||
archive = os.path.basename(archive) | ||
|
||
if archive.endswith('wacz'): | ||
archive = 'valid_example_1-0.warc' | ||
|
||
assert archive in os.listdir(manager.archive_dir) | ||
assert archive in index_text | ||
|
||
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path): | ||
manager = self.get_test_collections_manager(tmp_path) | ||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', | ||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', | ||
'sample_archive/waczs/valid_example_1.wacz'] | ||
|
||
with pytest.raises(NotImplementedError): | ||
manager.add_archives(archives, uncompress_wacz=False) | ||
|
||
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog): | ||
manager = self.get_test_collections_manager(tmp_path) | ||
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'], | ||
uncompress_wacz=True) | ||
assert 'sample.html' not in os.listdir(manager.archive_dir) | ||
assert 'example.warc' in os.listdir(manager.archive_dir) | ||
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages | ||
|
||
def test_merge_wacz_index(self, tmp_path): | ||
manager = self.get_test_collections_manager(tmp_path) | ||
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), | ||
'sample_archive/cdxj/example.cdxj', | ||
{'example.warc.gz': 'rewritten.warc.gz'}) | ||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: | ||
index_content = f.read() | ||
|
||
assert 'example.warc.gz' not in index_content | ||
assert 'rewritten.warc.gz' in index_content | ||
|
||
def test_merge_wacz_index_gzip(self, tmp_path): | ||
manager = self.get_test_collections_manager(tmp_path) | ||
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), | ||
'sample_archive/cdxj/example.cdx.gz', | ||
{'example-collection.warc': 'rewritten.warc'}) | ||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: | ||
index_content = f.read() | ||
|
||
assert 'example-collection.warc' not in index_content | ||
assert 'rewritten.warc' in index_content | ||
|
||
@staticmethod | ||
def get_test_collections_manager(collections_path): | ||
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False) | ||
manager.add_collection() | ||
return manager |