From 8d5b2be4c440510ef3972f08c1b2e4f7617f3aeb Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 24 Apr 2024 12:07:34 +0200
Subject: [PATCH] Avoid name conflicts when adding WARCs to collection

Insert -index before the file extension until there is no conflict
---
 pywb/manager/manager.py | 39 ++++++++++++++++++++++++++++++---------
 tests/test_manager.py   | 14 ++++++++++++++
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py
index 1b833256..05f6fd1f 100644
--- a/pywb/manager/manager.py
+++ b/pywb/manager/manager.py
@@ -147,18 +147,39 @@ def add_archives(self, archives, unpack_wacz=False):
         if invalid_archives:
             logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
 
+    def _rename_warc(self, source_dir, warc_basename):
+        """Return a variant of warc_basename that is unused in archive_dir.
+
+        A counter is inserted before the extension (a trailing .gz is kept
+        together with the extension before it), e.g. a.warc.gz -> a-1.warc.gz.
+        """
+        stem, ext = os.path.splitext(warc_basename)
+        if ext.lower() == '.gz':
+            stem, inner_ext = os.path.splitext(stem)
+            ext = inner_ext + ext
+
+        dupe_idx = 1
+        while True:
+            new_basename = f'{stem}-{dupe_idx}{ext}'
+            if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
+                break
+            dupe_idx += 1
+
+        return new_basename
+
     def _add_warc(self, warc):
-        filename = os.path.abspath(warc)
+        warc_source = os.path.abspath(warc)
+        source_dir, warc_basename = os.path.split(warc_source)
 
         # don't overwrite existing warcs with duplicate names
-        if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
-            logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
-            return None
-
-        shutil.copy2(filename, self.archive_dir)
-        full_path = os.path.join(self.archive_dir, filename)
-        logging.info('Copied ' + filename + ' to ' + self.archive_dir)
-        return full_path
+        if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
+            warc_basename = self._rename_warc(source_dir, warc_basename)
+            logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
+
+        warc_dest = os.path.join(self.archive_dir, warc_basename)
+        shutil.copy2(warc_source, warc_dest)
+        logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
+        return warc_dest
 
     def _add_wacz_unpacked(self, wacz):
         wacz = os.path.abspath(wacz)
diff --git a/tests/test_manager.py b/tests/test_manager.py
index f8245ab0..cc136a8c 100644
--- a/tests/test_manager.py
+++ b/tests/test_manager.py
@@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path):
         with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
             assert '"filename": "valid_example_1-0.warc"' in f.read()
 
+    def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
+        """Test if warc that already exists is renamed with -index suffix"""
+        manager = self.get_test_collections_manager(tmp_path)
+        manager._add_wacz_unpacked(VALID_WACZ_PATH)
+        # Add it again to see if there are name conflicts
+        manager._add_wacz_unpacked(VALID_WACZ_PATH)
+        assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
+        assert 'valid_example_1-0-1.warc' in os.listdir(manager.archive_dir)
+        assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
+        with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
+            data = f.read()
+            assert '"filename": "valid_example_1-0.warc"' in data
+            assert '"filename": "valid_example_1-0-1.warc"' in data
+
     def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
         """Test if adding an invalid wacz file to a collection fails"""
         manager = self.get_test_collections_manager(tmp_path)