Skip to content

Commit

Permalink
Rename --uncompress-wacz to --unpack-wacz and add docs (#901)
Browse files Browse the repository at this point in the history
Also adds help text for wb-manager add --unpack-wacz option in CLI
  • Loading branch information
tw4l authored Apr 24, 2024
1 parent b4c91c6 commit e89924b
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 19 deletions.
3 changes: 2 additions & 1 deletion docs/manual/apps.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
It can be used to:

* Create a new collection -- ``wb-manager init <coll>``
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
* Add override templates
* Add and remove metadata in a collection's ``metadata.yaml``
* List all collections
Expand Down
2 changes: 2 additions & 0 deletions docs/manual/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ Using Existing Web Archive Collections
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
WARC/ARC files will automatically be placed in the collection archive directory and indexed.

pywb 2.8.0 and later also include preliminary support for WACZ files via ``wb-manager add --unpack-wacz``. This unpacks the provided WACZ file and adds its WARCs and indices to the collection.

By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.

If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
Expand Down
22 changes: 14 additions & 8 deletions pywb/manager/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _assert_coll_exists(self):
'To create a new collection, run\n\n{1} init {0}')
raise IOError(msg.format(self.coll_name, sys.argv[0]))

def add_archives(self, archives, uncompress_wacz=False):
def add_archives(self, archives, unpack_wacz=False):
if not os.path.isdir(self.archive_dir):
raise IOError('Directory {0} does not exist'.
format(self.archive_dir))
Expand All @@ -134,11 +134,11 @@ def add_archives(self, archives, uncompress_wacz=False):
if full_path:
warc_paths.append(full_path)
elif self.WACZ_RX.match(archive):
if uncompress_wacz:
self._add_wacz_uncompressed(archive)
if unpack_wacz:
self._add_wacz_unpacked(archive)
else:
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
'\'--unpack-wacz\' flag to add the wacz\'s content.')
else:
invalid_archives.append(archive)

Expand All @@ -160,7 +160,7 @@ def _add_warc(self, warc):
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
return full_path

def _add_wacz_uncompressed(self, wacz):
def _add_wacz_unpacked(self, wacz):
wacz = os.path.abspath(wacz)
temp_dir = mkdtemp()
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
Expand Down Expand Up @@ -494,11 +494,17 @@ def do_list(r):
# Add Warcs or Waczs
def do_add(r):
m = CollectionsManager(r.coll_name)
m.add_archives(r.files, r.uncompress_wacz)
m.add_archives(r.files, r.unpack_wacz)

add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
add_archives = subparsers.add_parser('add', help=add_archives_help)
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
add_archives.add_argument(
'--unpack-wacz',
dest='unpack_wacz',
action='store_true',
help=add_unpack_wacz_help
)
add_archives.add_argument('coll_name')
add_archives.add_argument('files', nargs='+')
add_archives.set_defaults(func=do_add)
Expand Down
25 changes: 15 additions & 10 deletions tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@


class TestManager:
def test_add_valid_wacz_uncompressed(self, tmp_path):
def test_add_valid_wacz_unpacked(self, tmp_path):
"""Test if adding a valid wacz file to a collection succeeds"""
manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
manager._add_wacz_unpacked(VALID_WACZ_PATH)
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
assert '"filename": "valid_example_1-0.warc"' in f.read()

def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
"""Test if adding an invalid wacz file to a collection fails"""
manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text

Expand All @@ -32,12 +32,12 @@ def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
with open(index_path, 'r') as f:
assert '"filename": "invalid_example_1-0.warc"' not in f.read()

def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
def test_add_valid_archives_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz']
manager.add_archives(archives, uncompress_wacz=True)
manager.add_archives(archives, unpack_wacz=True)

with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_text = f.read()
Expand All @@ -51,19 +51,19 @@ def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
assert archive in os.listdir(manager.archive_dir)
assert archive in index_text

def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz']

with pytest.raises(NotImplementedError):
manager.add_archives(archives, uncompress_wacz=False)
manager.add_archives(archives, unpack_wacz=False)

def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
manager = self.get_test_collections_manager(tmp_path)
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
uncompress_wacz=True)
unpack_wacz=True)
assert 'sample.html' not in os.listdir(manager.archive_dir)
assert 'example.warc' in os.listdir(manager.archive_dir)
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
Expand Down Expand Up @@ -91,10 +91,15 @@ def test_merge_wacz_index_gzip(self, tmp_path):
{'example-collection.warc': 'rewritten.warc'})
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_content = f.read()
index_content = index_content.strip()

assert 'example-collection.warc' not in index_content
assert 'rewritten.warc' in index_content

# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines

@staticmethod
def get_test_collections_manager(collections_path):
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
Expand Down

0 comments on commit e89924b

Please sign in to comment.