diff --git a/docs/manual/apps.rst b/docs/manual/apps.rst index 4c7f1b99..680e7034 100644 --- a/docs/manual/apps.rst +++ b/docs/manual/apps.rst @@ -45,7 +45,8 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha It can be used to: * Create a new collection -- ``wb-manager init `` -* Add WARCs or WACZs to collection -- ``wb-manager add `` +* Add WARCs to collection -- ``wb-manager add `` +* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz `` * Add override templates * Add and remove metadata to a collections ``metadata.yaml`` * List all collections diff --git a/docs/manual/usage.rst b/docs/manual/usage.rst index 01aa8ed0..ca1119d0 100644 --- a/docs/manual/usage.rst +++ b/docs/manual/usage.rst @@ -114,6 +114,8 @@ Using Existing Web Archive Collections Existing archives of WARCs/ARCs files can be used with pywb with minimal amount of setup. By using ``wb-manager add``, WARC/ARC files will automatically be placed in the collection archive directory and indexed. +pywb 2.8.0 and later also include preliminary support for WACZ files via ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection. + By default ``wb-manager``, places new collections in ``collections/`` subdirectory in the current working directory. To specify a different root directory, the ``wb-manager -d ``. Other options can be set in the config file. If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process. 
diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index d983c726..1b833256 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -121,7 +121,7 @@ def _assert_coll_exists(self): 'To create a new collection, run\n\n{1} init {0}') raise IOError(msg.format(self.coll_name, sys.argv[0])) - def add_archives(self, archives, uncompress_wacz=False): + def add_archives(self, archives, unpack_wacz=False): if not os.path.isdir(self.archive_dir): raise IOError('Directory {0} does not exist'. format(self.archive_dir)) @@ -134,11 +134,11 @@ def add_archives(self, archives, uncompress_wacz=False): if full_path: warc_paths.append(full_path) elif self.WACZ_RX.match(archive): - if uncompress_wacz: - self._add_wacz_uncompressed(archive) + if unpack_wacz: + self._add_wacz_unpacked(archive) else: raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use ' - '\'--uncompress-wacz\' flag to add the wacz\'s content.') + '\'--unpack-wacz\' flag to add the wacz\'s content.') else: invalid_archives.append(archive) @@ -160,7 +160,7 @@ def _add_warc(self, warc): logging.info('Copied ' + filename + ' to ' + self.archive_dir) return full_path - def _add_wacz_uncompressed(self, wacz): + def _add_wacz_unpacked(self, wacz): wacz = os.path.abspath(wacz) temp_dir = mkdtemp() warc_regex = re.compile(r'.+\.warc(\.gz)?$') @@ -494,11 +494,17 @@ def do_list(r): # Add Warcs or Waczs def do_add(r): m = CollectionsManager(r.coll_name) - m.add_archives(r.files, r.uncompress_wacz) + m.add_archives(r.files, r.unpack_wacz) - add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex' + add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex' + add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex' add_archives = subparsers.add_parser('add', help=add_archives_help) - add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true') + add_archives.add_argument( + 
'--unpack-wacz', + dest='unpack_wacz', + action='store_true', + help=add_unpack_wacz_help + ) add_archives.add_argument('coll_name') add_archives.add_argument('files', nargs='+') add_archives.set_defaults(func=do_add) diff --git a/tests/test_manager.py b/tests/test_manager.py index c960674b..f8245ab0 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -11,19 +11,19 @@ class TestManager: - def test_add_valid_wacz_uncompressed(self, tmp_path): + def test_add_valid_wacz_unpacked(self, tmp_path): """Test if adding a valid wacz file to a collection succeeds""" manager = self.get_test_collections_manager(tmp_path) - manager._add_wacz_uncompressed(VALID_WACZ_PATH) + manager._add_wacz_unpacked(VALID_WACZ_PATH) assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: assert '"filename": "valid_example_1-0.warc"' in f.read() - def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog): + def test_add_invalid_wacz_unpacked(self, tmp_path, caplog): """Test if adding an invalid wacz file to a collection fails""" manager = self.get_test_collections_manager(tmp_path) - manager._add_wacz_uncompressed(INVALID_WACZ_PATH) + manager._add_wacz_unpacked(INVALID_WACZ_PATH) assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir) assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' 
in caplog.text @@ -32,12 +32,12 @@ def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog): with open(index_path, 'r') as f: assert '"filename": "invalid_example_1-0.warc"' not in f.read() - def test_add_valid_archives_uncompressed_wacz(self, tmp_path): + def test_add_valid_archives_unpack_wacz(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', 'sample_archive/waczs/valid_example_1.wacz'] - manager.add_archives(archives, uncompress_wacz=True) + manager.add_archives(archives, unpack_wacz=True) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: index_text = f.read() @@ -51,19 +51,19 @@ def test_add_valid_archives_uncompressed_wacz(self, tmp_path): assert archive in os.listdir(manager.archive_dir) assert archive in index_text - def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path): + def test_add_valid_archives_dont_unpack_wacz(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', 'sample_archive/waczs/valid_example_1.wacz'] with pytest.raises(NotImplementedError): - manager.add_archives(archives, uncompress_wacz=False) + manager.add_archives(archives, unpack_wacz=False) - def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog): + def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog): manager = self.get_test_collections_manager(tmp_path) manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'], - uncompress_wacz=True) + unpack_wacz=True) assert 'sample.html' not in os.listdir(manager.archive_dir) assert 'example.warc' in os.listdir(manager.archive_dir) assert "Invalid archives weren't added: 
sample_archive/text_content/sample.html" in caplog.messages @@ -91,10 +91,15 @@ def test_merge_wacz_index_gzip(self, tmp_path): {'example-collection.warc': 'rewritten.warc'}) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: index_content = f.read() + index_content = index_content.strip() assert 'example-collection.warc' not in index_content assert 'rewritten.warc' in index_content + # check that collection index is sorted + index_lines = index_content.split('\n') + assert sorted(index_lines) == index_lines + @staticmethod def get_test_collections_manager(collections_path): manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)