From b6538a3911b106028bd9624780bbabdfab6957c3 Mon Sep 17 00:00:00 2001 From: Josephine Funken <138496024+josephine-funken@users.noreply.github.com> Date: Fri, 13 Oct 2023 10:56:34 +0200 Subject: [PATCH] feat: Optionally remove single-child top-level directory from extracted archives (#579) --- src/pymovements/dataset/dataset.py | 10 ++- src/pymovements/dataset/dataset_download.py | 4 + src/pymovements/utils/archives.py | 21 ++++- src/pymovements/utils/downloads.py | 4 + tests/unit/dataset/dataset_download_test.py | 4 +- tests/unit/utils/archives_test.py | 90 ++++++++++++++++----- tests/unit/utils/downloads_test.py | 9 ++- 7 files changed, 118 insertions(+), 24 deletions(-) diff --git a/src/pymovements/dataset/dataset.py b/src/pymovements/dataset/dataset.py index 3fa0415b2..0c7f39b9f 100644 --- a/src/pymovements/dataset/dataset.py +++ b/src/pymovements/dataset/dataset.py @@ -767,13 +767,20 @@ def download( ) return self - def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset: + def extract( + self, + remove_finished: bool = False, + remove_top_level: bool = True, + verbose: int = 1, + ) -> Dataset: """Extract downloaded dataset archive files. Parameters ---------- remove_finished : bool Remove archive files after extraction. + remove_top_level: bool + If ``True``, remove the top-level directory if it has only one child. verbose : int Verbosity levels: (1) Print messages for extracting each dataset resource without printing messages for recursive archives. (2) Print additional messages for each @@ -788,6 +795,7 @@ def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset: definition=self.definition, paths=self.paths, remove_finished=remove_finished, + remove_top_level=remove_top_level, verbose=verbose, ) return self diff --git a/src/pymovements/dataset/dataset_download.py b/src/pymovements/dataset/dataset_download.py index be0d8512f..427548446 100644 --- a/src/pymovements/dataset/dataset_download.py +++ b/src/pymovements/dataset/dataset_download.py @@ -122,6 +122,7 @@ def extract_dataset( definition: DatasetDefinition, paths: DatasetPaths, remove_finished: bool = False, + remove_top_level: bool = True, verbose: int = 1, ) -> None: """Extract downloaded dataset archive files. @@ -134,6 +135,8 @@ def extract_dataset( The dataset paths. remove_finished : bool Remove archive files after extraction. + remove_top_level: bool + If ``True``, remove the top-level directory if it has only one child. verbose: Verbosity levels: (1) Print messages for extracting each dataset resource without printing messages for recursive archives. (2) Print messages for extracting each dataset resource and @@ -150,5 +153,6 @@ def extract_dataset( destination_path=destination_path, recursive=True, remove_finished=remove_finished, + remove_top_level=remove_top_level, verbose=verbose, ) diff --git a/src/pymovements/utils/archives.py b/src/pymovements/utils/archives.py index 53f53c0c7..96a269095 100644 --- a/src/pymovements/utils/archives.py +++ b/src/pymovements/utils/archives.py @@ -23,6 +23,8 @@ import bz2 import gzip import lzma +import os +import shutil import tarfile import zipfile from collections.abc import Callable @@ -37,6 +39,7 @@ def extract_archive( destination_path: Path | None = None, recursive: bool = True, remove_finished: bool = False, + remove_top_level: bool = True, verbose: int = 1, ) -> Path: """Extract an archive. @@ -55,11 +58,12 @@ def extract_archive( Recursively extract archives which are included in extracted archive. remove_finished : bool If ``True``, remove the file after the extraction. + remove_top_level: bool + If ``True``, remove the top-level directory if it has only one child. verbose: Verbosity levels: (1) Print messages for extracting each dataset resource without printing messages for recursive archives. (2) Print additional messages for each recursive archive extract. - Returns ------- Path : @@ -88,6 +92,20 @@ def extract_archive( if remove_finished: source_path.unlink() + if remove_top_level: + # path of extracted archive + extract_destination = destination_path / source_path.stem + if destination_path.stem == source_path.stem: + extract_destination = destination_path + + # check if extracted archive has single child that is a directory + children = [str(file) for file in extract_destination.glob('*')] + if len(children) == 1 and os.path.isdir(children[0]): + # move contents of single child and remove it + for f in [str(file) for file in Path(children[0]).glob('*')]: + shutil.move(f, extract_destination) + Path(children[0]).rmdir() + if recursive: # Get filepaths of all archives in extracted directory. archive_extensions = [ @@ -107,6 +125,7 @@ def extract_archive( destination_path=extract_destination, recursive=recursive, remove_finished=remove_finished, + remove_top_level=remove_top_level, verbose=0 if verbose < 2 else 2, ) diff --git a/src/pymovements/utils/downloads.py b/src/pymovements/utils/downloads.py index fc053fd0c..3a245496d 100644 --- a/src/pymovements/utils/downloads.py +++ b/src/pymovements/utils/downloads.py @@ -40,6 +40,7 @@ def download_and_extract_archive( md5: str | None = None, recursive: bool = True, remove_finished: bool = False, + remove_top_level: bool = True, verbose: int = 1, ) -> None: """Download and extract archive file. @@ -60,6 +61,8 @@ def download_and_extract_archive( Recursively extract archives which are included in extracted archive. remove_finished : bool Remove downloaded file after successful extraction or decompression, default: False. + remove_top_level: bool + If ``True``, remove the top-level directory if it has only one child, default:True. verbose : int Verbosity levels: (1) Show download progress bar and print info messages on downloading and extracting archive files without printing messages for recursive archive extraction. @@ -87,6 +90,7 @@ def download_and_extract_archive( destination_path=extract_dirpath, recursive=recursive, remove_finished=remove_finished, + remove_top_level=remove_top_level, verbose=verbose, ) diff --git a/tests/unit/dataset/dataset_download_test.py b/tests/unit/dataset/dataset_download_test.py index ca26acbe9..478be9788 100644 --- a/tests/unit/dataset/dataset_download_test.py +++ b/tests/unit/dataset/dataset_download_test.py @@ -214,7 +214,7 @@ def test_dataset_extract_remove_finished_true( paths = pm.DatasetPaths(root=tmp_path, dataset='.') dataset = pm.Dataset(dataset_definition, path=paths) - dataset.extract(remove_finished=True, verbose=1) + dataset.extract(remove_finished=True, remove_top_level=False, verbose=1) mock_extract_archive.assert_has_calls([ mock.call( @@ -222,6 +222,7 @@ def test_dataset_extract_remove_finished_true( destination_path=tmp_path / 'raw', recursive=True, remove_finished=True, + remove_top_level=False, verbose=1, ), ]) @@ -245,6 +246,7 @@ def test_dataset_extract_remove_finished_false( destination_path=tmp_path / 'raw', recursive=True, remove_finished=False, + remove_top_level=True, verbose=1, ), ]) diff --git a/tests/unit/utils/archives_test.py b/tests/unit/utils/archives_test.py index 69192f07b..b3fe41fb1 100644 --- a/tests/unit/utils/archives_test.py +++ b/tests/unit/utils/archives_test.py @@ -86,12 +86,16 @@ def fixture_archive(request, tmp_path): test_filepath = rootpath / 'test.file' test_filepath.write_text('test') + single_child_directory = 'singlechild' top_level_directory = 'toplevel' # add additional archive filepath = rootpath / 'recursive.zip' with zipfile.ZipFile(filepath, 'w') as zip_open: - zip_open.write(test_filepath, arcname=test_filepath.name) + zip_open.write( + test_filepath, + arcname=os.path.join(single_child_directory, test_filepath.name), + ) # declare archive path if compression is None: @@ -204,10 +208,10 @@ def fixture_unsupported_archive(request, tmp_path): @pytest.mark.parametrize( - ('recursive', 'remove_finished', 'expected_files'), + ('recursive', 'remove_finished', 'remove_top_level', 'expected_files'), [ pytest.param( - False, False, + False, False, False, ( 'toplevel', os.path.join('toplevel', 'recursive.zip'), @@ -215,7 +219,7 @@ def fixture_unsupported_archive(request, tmp_path): id='recursive_false_remove_finished_false', ), pytest.param( - False, True, + False, True, False, ( 'toplevel', os.path.join('toplevel', 'recursive.zip'), @@ -223,32 +227,55 @@ def fixture_unsupported_archive(request, tmp_path): id='recursive_false_remove_finished_true', ), pytest.param( - True, False, + True, False, False, ( 'toplevel', os.path.join('toplevel', 'recursive.zip'), os.path.join('toplevel', 'recursive'), - os.path.join('toplevel', 'recursive', 'test.file'), + os.path.join('toplevel', 'recursive', 'singlechild'), + os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'), ), id='recursive_true_remove_finished_false', ), pytest.param( - True, True, + True, True, False, ( 'toplevel', os.path.join('toplevel', 'recursive'), - os.path.join('toplevel', 'recursive', 'test.file'), + os.path.join('toplevel', 'recursive', 'singlechild'), + os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'), ), id='recursive_true_remove_finished_true', ), + pytest.param( + False, False, True, + ( + 'toplevel', + os.path.join('toplevel', 'recursive.zip'), + ), + id='recursive_false_remove_top_level_true', + ), + pytest.param( + True, False, True, + ( + 'toplevel', + os.path.join('toplevel', 'recursive.zip'), + os.path.join('toplevel', 'recursive'), + os.path.join('toplevel', 'recursive', 'test.file'), + ), + id='recursive_true_remove_top_level_true', + ), ], ) -def test_extract_archive_destination_path_None(recursive, remove_finished, expected_files, archive): +def test_extract_archive_destination_path_None( + recursive, remove_finished, remove_top_level, expected_files, archive, +): extract_archive( source_path=archive, destination_path=None, recursive=recursive, remove_finished=remove_finished, + remove_top_level=remove_top_level, ) result_files = { str(file.relative_to(archive.parent)) for file in archive.parent.rglob('*') @@ -269,7 +296,9 @@ def test_extract_archive_destination_path_None(recursive, remove_finished, expec pytest.param(True, True, id='recursive_true_remove_finished_true'), ], ) -def test_extract_compressed_file_destination_path_None(recursive, remove_finished, compressed_file): +def test_extract_compressed_file_destination_path_None( + recursive, remove_finished, compressed_file, +): extract_archive( source_path=compressed_file, destination_path=None, @@ -318,10 +347,10 @@ def test_extract_unsupported_archive_destination_path_None( @pytest.mark.parametrize( - ('recursive', 'remove_finished', 'expected_files'), + ('recursive', 'remove_finished', 'remove_top_level', 'expected_files'), [ pytest.param( - False, False, + False, False, False, ( 'toplevel', os.path.join('toplevel', 'recursive.zip'), @@ -329,7 +358,7 @@ def test_extract_unsupported_archive_destination_path_None( id='recursive_false_remove_finished_false', ), pytest.param( - False, True, + False, True, False, ( 'toplevel', os.path.join('toplevel', 'recursive.zip'), @@ -337,28 +366,48 @@ def test_extract_unsupported_archive_destination_path_None( id='recursive_false_remove_finished_true', ), pytest.param( - True, False, + True, False, False, ( 'toplevel', - os.path.join('toplevel', 'recursive.zip'), os.path.join('toplevel', 'recursive'), - os.path.join('toplevel', 'recursive', 'test.file'), + os.path.join('toplevel', 'recursive.zip'), + os.path.join('toplevel', 'recursive', 'singlechild'), + os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'), ), id='recursive_true_remove_finished_false', ), pytest.param( - True, True, + True, True, False, ( 'toplevel', os.path.join('toplevel', 'recursive'), - os.path.join('toplevel', 'recursive', 'test.file'), + os.path.join('toplevel', 'recursive', 'singlechild'), + os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'), ), id='recursive_true_remove_finished_true', ), + pytest.param( + False, False, True, + ( + 'toplevel', + os.path.join('toplevel', 'recursive.zip'), + ), + id='recursive_false_remove_top_level_true', + ), + pytest.param( + True, False, True, + ( + 'toplevel', + os.path.join('toplevel', 'recursive'), + os.path.join('toplevel', 'recursive.zip'), + os.path.join('toplevel', 'recursive', 'test.file'), + ), + id='recursive_true_remove_top_level_true', + ), ], ) def test_extract_archive_destination_path_not_None( - recursive, remove_finished, archive, tmp_path, expected_files, + recursive, remove_finished, remove_top_level, archive, tmp_path, expected_files, ): destination_path = tmp_path / pathlib.Path('tmpfoo') extract_archive( @@ -366,6 +415,7 @@ def test_extract_archive_destination_path_not_None( destination_path=destination_path, recursive=recursive, remove_finished=remove_finished, + remove_top_level=remove_top_level, ) if destination_path.is_file(): @@ -421,7 +471,7 @@ def test_extract_compressed_file_destination_path_not_None( pytest.param(True, id='remove_finished_true'), ], ) -def test_extract_unnsupported_archive_destination_path_not_None( +def test_extract_unsupported_archive_destination_path_not_None( recursive, remove_finished, unsupported_archive, diff --git a/tests/unit/utils/downloads_test.py b/tests/unit/utils/downloads_test.py index ae6ff3bc1..6d2b3a82c 100644 --- a/tests/unit/utils/downloads_test.py +++ b/tests/unit/utils/downloads_test.py @@ -156,7 +156,14 @@ def test_download_and_extract_archive(tmp_path): md5 = '52bbf03a7c50ee7152ccb9d357c2bb30' extract_dirpath = tmp_path / 'extracted' - download_and_extract_archive(url, tmp_path, download_filename, extract_dirpath, md5) + download_and_extract_archive( + url, + tmp_path, + download_filename, + extract_dirpath, + md5, + remove_top_level=False, + ) assert extract_dirpath.exists() assert (extract_dirpath / 'pymovements-0.4.0').exists()