Skip to content

Commit

Permalink
feat: Optionally remove single-child top-level directory from extract…
Browse files Browse the repository at this point in the history
…ed archives (#579)
  • Loading branch information
josephine-funken authored Oct 13, 2023
1 parent 417a305 commit b6538a3
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 24 deletions.
10 changes: 9 additions & 1 deletion src/pymovements/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,13 +767,20 @@ def download(
)
return self

def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset:
def extract(
self,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> Dataset:
"""Extract downloaded dataset archive files.
Parameters
----------
remove_finished : bool
Remove archive files after extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose : int
Verbosity levels: (1) Print messages for extracting each dataset resource without
printing messages for recursive archives. (2) Print additional messages for each
Expand All @@ -788,6 +795,7 @@ def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset:
definition=self.definition,
paths=self.paths,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)
return self
Expand Down
4 changes: 4 additions & 0 deletions src/pymovements/dataset/dataset_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def extract_dataset(
definition: DatasetDefinition,
paths: DatasetPaths,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> None:
"""Extract downloaded dataset archive files.
Expand All @@ -134,6 +135,8 @@ def extract_dataset(
The dataset paths.
remove_finished : bool
Remove archive files after extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose:
Verbosity levels: (1) Print messages for extracting each dataset resource without printing
messages for recursive archives. (2) Print messages for extracting each dataset resource and
Expand All @@ -150,5 +153,6 @@ def extract_dataset(
destination_path=destination_path,
recursive=True,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)
21 changes: 20 additions & 1 deletion src/pymovements/utils/archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import bz2
import gzip
import lzma
import os
import shutil
import tarfile
import zipfile
from collections.abc import Callable
Expand All @@ -37,6 +39,7 @@ def extract_archive(
destination_path: Path | None = None,
recursive: bool = True,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> Path:
"""Extract an archive.
Expand All @@ -55,11 +58,12 @@ def extract_archive(
Recursively extract archives which are included in extracted archive.
remove_finished : bool
If ``True``, remove the file after the extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose:
Verbosity levels: (1) Print messages for extracting each dataset resource without printing
messages for recursive archives. (2) Print additional messages for each recursive archive
extract.
Returns
-------
Path :
Expand Down Expand Up @@ -88,6 +92,20 @@ def extract_archive(
if remove_finished:
source_path.unlink()

if remove_top_level:
# path of extracted archive
extract_destination = destination_path / source_path.stem
if destination_path.stem == source_path.stem:
extract_destination = destination_path

# check if extracted archive has single child that is a directory
children = [str(file) for file in extract_destination.glob('*')]
if len(children) == 1 and os.path.isdir(children[0]):
# move contents of single child and remove it
for f in [str(file) for file in Path(children[0]).glob('*')]:
shutil.move(f, extract_destination)
Path(children[0]).rmdir()

if recursive:
# Get filepaths of all archives in extracted directory.
archive_extensions = [
Expand All @@ -107,6 +125,7 @@ def extract_archive(
destination_path=extract_destination,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=0 if verbose < 2 else 2,
)

Expand Down
4 changes: 4 additions & 0 deletions src/pymovements/utils/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def download_and_extract_archive(
md5: str | None = None,
recursive: bool = True,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> None:
"""Download and extract archive file.
Expand All @@ -60,6 +61,8 @@ def download_and_extract_archive(
Recursively extract archives which are included in extracted archive.
remove_finished : bool
Remove downloaded file after successful extraction or decompression, default: False.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child, default:True.
verbose : int
Verbosity levels: (1) Show download progress bar and print info messages on downloading
and extracting archive files without printing messages for recursive archive extraction.
Expand Down Expand Up @@ -87,6 +90,7 @@ def download_and_extract_archive(
destination_path=extract_dirpath,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)

Expand Down
4 changes: 3 additions & 1 deletion tests/unit/dataset/dataset_download_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,15 @@ def test_dataset_extract_remove_finished_true(

paths = pm.DatasetPaths(root=tmp_path, dataset='.')
dataset = pm.Dataset(dataset_definition, path=paths)
dataset.extract(remove_finished=True, verbose=1)
dataset.extract(remove_finished=True, remove_top_level=False, verbose=1)

mock_extract_archive.assert_has_calls([
mock.call(
source_path=tmp_path / 'downloads' / 'test.gz.tar',
destination_path=tmp_path / 'raw',
recursive=True,
remove_finished=True,
remove_top_level=False,
verbose=1,
),
])
Expand All @@ -245,6 +246,7 @@ def test_dataset_extract_remove_finished_false(
destination_path=tmp_path / 'raw',
recursive=True,
remove_finished=False,
remove_top_level=True,
verbose=1,
),
])
Expand Down
90 changes: 70 additions & 20 deletions tests/unit/utils/archives_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,16 @@ def fixture_archive(request, tmp_path):
test_filepath = rootpath / 'test.file'
test_filepath.write_text('test')

single_child_directory = 'singlechild'
top_level_directory = 'toplevel'

# add additional archive
filepath = rootpath / 'recursive.zip'
with zipfile.ZipFile(filepath, 'w') as zip_open:
zip_open.write(test_filepath, arcname=test_filepath.name)
zip_open.write(
test_filepath,
arcname=os.path.join(single_child_directory, test_filepath.name),
)

# declare archive path
if compression is None:
Expand Down Expand Up @@ -204,51 +208,74 @@ def fixture_unsupported_archive(request, tmp_path):


@pytest.mark.parametrize(
('recursive', 'remove_finished', 'expected_files'),
('recursive', 'remove_finished', 'remove_top_level', 'expected_files'),
[
pytest.param(
False, False,
False, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_false',
),
pytest.param(
False, True,
False, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_true',
),
pytest.param(
True, False,
True, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_false',
),
pytest.param(
True, True,
True, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_true',
),
pytest.param(
False, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_top_level_true',
),
pytest.param(
True, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
),
id='recursive_true_remove_top_level_true',
),
],
)
def test_extract_archive_destination_path_None(recursive, remove_finished, expected_files, archive):
def test_extract_archive_destination_path_None(
recursive, remove_finished, remove_top_level, expected_files, archive,
):
extract_archive(
source_path=archive,
destination_path=None,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
)
result_files = {
str(file.relative_to(archive.parent)) for file in archive.parent.rglob('*')
Expand All @@ -269,7 +296,9 @@ def test_extract_archive_destination_path_None(recursive, remove_finished, expec
pytest.param(True, True, id='recursive_true_remove_finished_true'),
],
)
def test_extract_compressed_file_destination_path_None(recursive, remove_finished, compressed_file):
def test_extract_compressed_file_destination_path_None(
recursive, remove_finished, compressed_file,
):
extract_archive(
source_path=compressed_file,
destination_path=None,
Expand Down Expand Up @@ -318,54 +347,75 @@ def test_extract_unsupported_archive_destination_path_None(


@pytest.mark.parametrize(
('recursive', 'remove_finished', 'expected_files'),
('recursive', 'remove_finished', 'remove_top_level', 'expected_files'),
[
pytest.param(
False, False,
False, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_false',
),
pytest.param(
False, True,
False, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_true',
),
pytest.param(
True, False,
True, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_false',
),
pytest.param(
True, True,
True, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_true',
),
pytest.param(
False, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_top_level_true',
),
pytest.param(
True, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive', 'test.file'),
),
id='recursive_true_remove_top_level_true',
),
],
)
def test_extract_archive_destination_path_not_None(
recursive, remove_finished, archive, tmp_path, expected_files,
recursive, remove_finished, remove_top_level, archive, tmp_path, expected_files,
):
destination_path = tmp_path / pathlib.Path('tmpfoo')
extract_archive(
source_path=archive,
destination_path=destination_path,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
)

if destination_path.is_file():
Expand Down Expand Up @@ -421,7 +471,7 @@ def test_extract_compressed_file_destination_path_not_None(
pytest.param(True, id='remove_finished_true'),
],
)
def test_extract_unnsupported_archive_destination_path_not_None(
def test_extract_unsupported_archive_destination_path_not_None(
recursive,
remove_finished,
unsupported_archive,
Expand Down
9 changes: 8 additions & 1 deletion tests/unit/utils/downloads_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,14 @@ def test_download_and_extract_archive(tmp_path):
md5 = '52bbf03a7c50ee7152ccb9d357c2bb30'
extract_dirpath = tmp_path / 'extracted'

download_and_extract_archive(url, tmp_path, download_filename, extract_dirpath, md5)
download_and_extract_archive(
url,
tmp_path,
download_filename,
extract_dirpath,
md5,
remove_top_level=False,
)

assert extract_dirpath.exists()
assert (extract_dirpath / 'pymovements-0.4.0').exists()
Expand Down

0 comments on commit b6538a3

Please sign in to comment.