From 8d5b2be4c440510ef3972f08c1b2e4f7617f3aeb Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 24 Apr 2024 12:07:34 +0200 Subject: [PATCH 1/4] Avoid name conflicts when adding WARCs to collection Append -index to end of files until there is no conflict --- pywb/manager/manager.py | 29 ++++++++++++++++++++--------- tests/test_manager.py | 14 ++++++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 1b833256..05f6fd1f 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -147,18 +147,29 @@ def add_archives(self, archives, unpack_wacz=False): if invalid_archives: logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') + def _rename_warc(self, source_dir, warc_basename): + dupe_idx = 1 + while True: + new_basename = f'{warc_basename}-{dupe_idx}' + if not os.path.exists(os.path.join(self.archive_dir, new_basename)): + break + dupe_idx += 1 + + return new_basename + def _add_warc(self, warc): - filename = os.path.abspath(warc) + warc_source = os.path.abspath(warc) + source_dir, warc_basename = os.path.split(warc_source) # don't overwrite existing warcs with duplicate names - if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): - logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.') - return None - - shutil.copy2(filename, self.archive_dir) - full_path = os.path.join(self.archive_dir, filename) - logging.info('Copied ' + filename + ' to ' + self.archive_dir) - return full_path + if os.path.exists(os.path.join(self.archive_dir, warc_basename)): + warc_basename = self._rename_warc(source_dir, warc_basename) + logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.') + + warc_dest = os.path.join(self.archive_dir, warc_basename) + shutil.copy2(warc_source, warc_dest) + logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}') + return warc_dest def _add_wacz_unpacked(self, wacz): wacz = os.path.abspath(wacz) diff --git a/tests/test_manager.py b/tests/test_manager.py index f8245ab0..cc136a8c 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path): with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: assert '"filename": "valid_example_1-0.warc"' in f.read() + def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path): + """Test if warc that already exists is renamed with -index suffix""" + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_unpacked(VALID_WACZ_PATH) + # Add it again to see if there are name conflicts + manager._add_wacz_unpacked(VALID_WACZ_PATH) + assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) + assert 'valid_example_1-0-1.warc' in os.listdir(manager.archive_dir) + assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + data = f.read() + assert '"filename": "valid_example_1-0.warc"' in data + assert '"filename": "valid_example_1-0-1.warc"' in data + def test_add_invalid_wacz_unpacked(self, tmp_path, caplog): """Test if adding an invalid wacz file to a collection fails""" manager = self.get_test_collections_manager(tmp_path) From ee15a3e06f3c2fe172d3aa960c6cd34adaa010c2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 24 Apr 2024 12:15:25 +0200 Subject: [PATCH 2/4] Fix for WACZ as well --- pywb/manager/manager.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 05f6fd1f..78d8e3af 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -147,7 +147,7 @@ def add_archives(self, archives, unpack_wacz=False): if invalid_archives: logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') - def _rename_warc(self, source_dir, warc_basename): + def _rename_warc(self, warc_basename): dupe_idx = 1 while True: new_basename = f'{warc_basename}-{dupe_idx}' @@ -163,7 +163,7 @@ def _add_warc(self, warc): # don't overwrite existing warcs with duplicate names if os.path.exists(os.path.join(self.archive_dir, warc_basename)): - warc_basename = self._rename_warc(source_dir, warc_basename) + warc_basename = self._rename_warc(warc_basename) logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.') warc_dest = os.path.join(self.archive_dir, warc_basename) @@ -209,8 +209,9 @@ def _add_wacz_unpacked(self, wacz): warc_destination_path = os.path.join(self.archive_dir, warc_filename) if os.path.exists(warc_destination_path): - logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') - continue + warc_filename = self._rename_warc(warc_filename) + logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.') + warc_destination_path = os.path.join(self.archive_dir, warc_filename) warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path) From 52c5b84b1fd5c47bcc1aa13601a56c54c3b3d1c1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 24 Apr 2024 12:23:35 +0200 Subject: [PATCH 3/4] Fix dupe renaming and add additional test for warc.gz --- pywb/manager/manager.py | 6 +++++- tests/test_manager.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 78d8e3af..4a167cd6 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -7,6 +7,7 @@ import re import gzip import six +import pathlib from distutils.util import strtobool from pkg_resources import resource_string, get_distribution @@ -149,8 +150,11 @@ def add_archives(self, archives, unpack_wacz=False): def _rename_warc(self, warc_basename): dupe_idx = 1 + ext = ''.join(pathlib.Path(warc_basename).suffixes) + pre_ext_name = warc_basename.split(ext)[0] + while True: - new_basename = f'{warc_basename}-{dupe_idx}' + new_basename = f'{pre_ext_name}-{dupe_idx}{ext}' if not os.path.exists(os.path.join(self.archive_dir, new_basename)): break dupe_idx += 1 diff --git a/tests/test_manager.py b/tests/test_manager.py index cc136a8c..6c0abe85 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -65,6 +65,21 @@ def test_add_valid_archives_unpack_wacz(self, tmp_path): assert archive in os.listdir(manager.archive_dir) assert archive in index_text + def test_add_valid_archives_dupe_name(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + warc_filename = 'sample_archive/warcs/example.warc.gz' + manager.add_archives(warc_filename) + manager.add_archives(warc_filename) + + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + index_text = f.read() + + expected_archives = ('example.warc.gz', 'example-1.warc.gz') + + for archive in expected_archives: + assert archive in os.listdir(manager.archive_dir) + assert archive in index_text + def test_add_valid_archives_dont_unpack_wacz(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', From d74c6c2a27d12f998993ff64549d6e90a4e5b055 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 24 Apr 2024 12:29:18 +0200 Subject: [PATCH 4/4] Fix argument to manager.add_archives --- tests/test_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_manager.py b/tests/test_manager.py index 6c0abe85..cd34c997 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -68,8 +68,7 @@ def test_add_valid_archives_unpack_wacz(self, tmp_path): def test_add_valid_archives_dupe_name(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) warc_filename = 'sample_archive/warcs/example.warc.gz' - manager.add_archives(warc_filename) - manager.add_archives(warc_filename) + manager.add_archives([warc_filename, warc_filename]) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: index_text = f.read()