Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Optionally remove single-child top-level directory from extracted archives #579

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
5a790a0
Added functionality to remove single node top-level directories in da…
josephine-funken Sep 21, 2023
6fe1973
fix: restrict to matplotlib<3.8 until we fixed typing (#561)
SiQube Sep 22, 2023
0af0bf5
feat: Add gaze.from_csv() (#557)
prassepaul Sep 22, 2023
fbd6d37
feat: Add GazeDataFrame.apply() (#558)
dkrako Sep 22, 2023
9abc0c3
docs: add pip installation of pylint to CONTRIBUTING.md (#556)
SiQube Sep 22, 2023
7730fe4
ci: remove unused pre-commit env from tox.ini (#564)
SiQube Sep 22, 2023
ea00c4c
ci: pre-commit autoupdate (#566)
pre-commit-ci[bot] Sep 22, 2023
2f4c3cc
feat: Add public dataset GazeOnFaces (#567)
prassepaul Sep 22, 2023
dea5439
chore: Migrate typing to matplotlib v3.8.0 (#563)
SiQube Sep 22, 2023
23d8f62
feat: Add gaze.from_ipc() (#568)
prassepaul Sep 22, 2023
8fcb7d3
tests: Test resulting filestructure for utils.archives.extract() (#570)
dkrako Sep 22, 2023
339cd15
Added functionality and tests to remove top-level directories
josephine-funken Sep 25, 2023
b95ed7f
Added functionality for recursive function application
josephine-funken Sep 26, 2023
4f4ab5d
Merge branch 'main' into 401-remove-top-level-directory-from-extracte…
josephine-funken Sep 27, 2023
22bf1f8
Added funtionality to remove top-level directory to downloads
josephine-funken Sep 27, 2023
2c643dd
Changed test_dataset_extract_remove_finished_false to include new param
josephine-funken Sep 27, 2023
8d28f93
Added new param to existing tests
josephine-funken Sep 27, 2023
6452948
Changed line to make work on all os
josephine-funken Sep 27, 2023
9645f50
Changes to how the single child is detected when there is no destinat…
josephine-funken Sep 27, 2023
1503ab1
Changed check if single child is a directory
josephine-funken Sep 27, 2023
3829464
Changed functionality to remove top level and corresponding tests
josephine-funken Oct 12, 2023
8d04a41
Merge branch 'main' into 401-remove-top-level-directory-from-extracte…
josephine-funken Oct 12, 2023
3118012
Merge branch 'main' into 401-remove-top-level-directory-from-extracte…
dkrako Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/pymovements/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,13 +767,20 @@ def download(
)
return self

def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset:
def extract(
self,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> Dataset:
"""Extract downloaded dataset archive files.

Parameters
----------
remove_finished : bool
Remove archive files after extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose : int
Verbosity levels: (1) Print messages for extracting each dataset resource without
printing messages for recursive archives. (2) Print additional messages for each
Expand All @@ -788,6 +795,7 @@ def extract(self, remove_finished: bool = False, verbose: int = 1) -> Dataset:
definition=self.definition,
paths=self.paths,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)
return self
Expand Down
4 changes: 4 additions & 0 deletions src/pymovements/dataset/dataset_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def extract_dataset(
definition: DatasetDefinition,
paths: DatasetPaths,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> None:
"""Extract downloaded dataset archive files.
Expand All @@ -134,6 +135,8 @@ def extract_dataset(
The dataset paths.
remove_finished : bool
Remove archive files after extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose:
Verbosity levels: (1) Print messages for extracting each dataset resource without printing
messages for recursive archives. (2) Print messages for extracting each dataset resource and
Expand All @@ -150,5 +153,6 @@ def extract_dataset(
destination_path=destination_path,
recursive=True,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)
21 changes: 20 additions & 1 deletion src/pymovements/utils/archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import bz2
import gzip
import lzma
import os
import shutil
import tarfile
import zipfile
from collections.abc import Callable
Expand All @@ -37,6 +39,7 @@ def extract_archive(
destination_path: Path | None = None,
recursive: bool = True,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> Path:
"""Extract an archive.
Expand All @@ -55,11 +58,12 @@ def extract_archive(
Recursively extract archives which are included in extracted archive.
remove_finished : bool
If ``True``, remove the file after the extraction.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child.
verbose:
Verbosity levels: (1) Print messages for extracting each dataset resource without printing
messages for recursive archives. (2) Print additional messages for each recursive archive
extract.

Returns
-------
Path :
Expand Down Expand Up @@ -88,6 +92,20 @@ def extract_archive(
if remove_finished:
source_path.unlink()

if remove_top_level:
# path of extracted archive
extract_destination = destination_path / source_path.stem
if destination_path.stem == source_path.stem:
extract_destination = destination_path

# check if extracted archive has single child that is a directory
children = [str(file) for file in extract_destination.glob('*')]
if len(children) == 1 and os.path.isdir(children[0]):
# move contents of single child and remove it
for f in [str(file) for file in Path(children[0]).glob('*')]:
shutil.move(f, extract_destination)
Path(children[0]).rmdir()

if recursive:
# Get filepaths of all archives in extracted directory.
archive_extensions = [
Expand All @@ -107,6 +125,7 @@ def extract_archive(
destination_path=extract_destination,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=0 if verbose < 2 else 2,
)

Expand Down
4 changes: 4 additions & 0 deletions src/pymovements/utils/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def download_and_extract_archive(
md5: str | None = None,
recursive: bool = True,
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
) -> None:
"""Download and extract archive file.
Expand All @@ -60,6 +61,8 @@ def download_and_extract_archive(
Recursively extract archives which are included in extracted archive.
remove_finished : bool
Remove downloaded file after successful extraction or decompression, default: False.
remove_top_level: bool
If ``True``, remove the top-level directory if it has only one child, default:True.
verbose : int
Verbosity levels: (1) Show download progress bar and print info messages on downloading
and extracting archive files without printing messages for recursive archive extraction.
Expand Down Expand Up @@ -87,6 +90,7 @@ def download_and_extract_archive(
destination_path=extract_dirpath,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=verbose,
)

Expand Down
4 changes: 3 additions & 1 deletion tests/unit/dataset/dataset_download_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,15 @@ def test_dataset_extract_remove_finished_true(

paths = pm.DatasetPaths(root=tmp_path, dataset='.')
dataset = pm.Dataset(dataset_definition, path=paths)
dataset.extract(remove_finished=True, verbose=1)
dataset.extract(remove_finished=True, remove_top_level=False, verbose=1)

mock_extract_archive.assert_has_calls([
mock.call(
source_path=tmp_path / 'downloads' / 'test.gz.tar',
destination_path=tmp_path / 'raw',
recursive=True,
remove_finished=True,
remove_top_level=False,
verbose=1,
),
])
Expand All @@ -245,6 +246,7 @@ def test_dataset_extract_remove_finished_false(
destination_path=tmp_path / 'raw',
recursive=True,
remove_finished=False,
remove_top_level=True,
verbose=1,
),
])
Expand Down
90 changes: 70 additions & 20 deletions tests/unit/utils/archives_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,16 @@ def fixture_archive(request, tmp_path):
test_filepath = rootpath / 'test.file'
test_filepath.write_text('test')

single_child_directory = 'singlechild'
top_level_directory = 'toplevel'

# add additional archive
filepath = rootpath / 'recursive.zip'
with zipfile.ZipFile(filepath, 'w') as zip_open:
zip_open.write(test_filepath, arcname=test_filepath.name)
zip_open.write(
test_filepath,
arcname=os.path.join(single_child_directory, test_filepath.name),
)

# declare archive path
if compression is None:
Expand Down Expand Up @@ -204,51 +208,74 @@ def fixture_unsupported_archive(request, tmp_path):


@pytest.mark.parametrize(
('recursive', 'remove_finished', 'expected_files'),
('recursive', 'remove_finished', 'remove_top_level', 'expected_files'),
[
pytest.param(
False, False,
False, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_false',
),
pytest.param(
False, True,
False, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_true',
),
pytest.param(
True, False,
True, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_false',
),
pytest.param(
True, True,
True, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_true',
),
pytest.param(
False, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_top_level_true',
),
pytest.param(
True, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
),
id='recursive_true_remove_top_level_true',
),
],
)
def test_extract_archive_destination_path_None(recursive, remove_finished, expected_files, archive):
def test_extract_archive_destination_path_None(
recursive, remove_finished, remove_top_level, expected_files, archive,
):
extract_archive(
source_path=archive,
destination_path=None,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
)
result_files = {
str(file.relative_to(archive.parent)) for file in archive.parent.rglob('*')
Expand All @@ -269,7 +296,9 @@ def test_extract_archive_destination_path_None(recursive, remove_finished, expec
pytest.param(True, True, id='recursive_true_remove_finished_true'),
],
)
def test_extract_compressed_file_destination_path_None(recursive, remove_finished, compressed_file):
def test_extract_compressed_file_destination_path_None(
recursive, remove_finished, compressed_file,
):
extract_archive(
source_path=compressed_file,
destination_path=None,
Expand Down Expand Up @@ -318,54 +347,75 @@ def test_extract_unsupported_archive_destination_path_None(


@pytest.mark.parametrize(
('recursive', 'remove_finished', 'expected_files'),
('recursive', 'remove_finished', 'remove_top_level', 'expected_files'),
[
pytest.param(
False, False,
False, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_false',
),
pytest.param(
False, True,
False, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_finished_true',
),
pytest.param(
True, False,
True, False, False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_false',
),
pytest.param(
True, True,
True, True, False,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'test.file'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
id='recursive_true_remove_finished_true',
),
pytest.param(
False, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
id='recursive_false_remove_top_level_true',
),
pytest.param(
True, False, True,
(
'toplevel',
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive', 'test.file'),
),
id='recursive_true_remove_top_level_true',
),
],
)
def test_extract_archive_destination_path_not_None(
recursive, remove_finished, archive, tmp_path, expected_files,
recursive, remove_finished, remove_top_level, archive, tmp_path, expected_files,
):
destination_path = tmp_path / pathlib.Path('tmpfoo')
extract_archive(
source_path=archive,
destination_path=destination_path,
recursive=recursive,
remove_finished=remove_finished,
remove_top_level=remove_top_level,
)

if destination_path.is_file():
Expand Down Expand Up @@ -421,7 +471,7 @@ def test_extract_compressed_file_destination_path_not_None(
pytest.param(True, id='remove_finished_true'),
],
)
def test_extract_unnsupported_archive_destination_path_not_None(
def test_extract_unsupported_archive_destination_path_not_None(
recursive,
remove_finished,
unsupported_archive,
Expand Down
9 changes: 8 additions & 1 deletion tests/unit/utils/downloads_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,14 @@ def test_download_and_extract_archive(tmp_path):
md5 = '52bbf03a7c50ee7152ccb9d357c2bb30'
extract_dirpath = tmp_path / 'extracted'

download_and_extract_archive(url, tmp_path, download_filename, extract_dirpath, md5)
download_and_extract_archive(
url,
tmp_path,
download_filename,
extract_dirpath,
md5,
remove_top_level=False,
)

assert extract_dirpath.exists()
assert (extract_dirpath / 'pymovements-0.4.0').exists()
Expand Down
Loading