From ce354b354f0986ea83d016010d2cd73a17dcbd51 Mon Sep 17 00:00:00 2001 From: Lene Preuss Date: Wed, 17 Jan 2024 12:01:22 +0100 Subject: [PATCH] add --parallel-actions option to execute actions in parallel --- CHANGELOG.md | 2 +- README.md | 3 ++ duplicate_images/duplicate.py | 21 +++++++---- duplicate_images/parse_commandline.py | 6 +++- tests/unit/test_actions.py | 51 +++++++++++++++++++++++++-- tests/unit/test_parse_commandline.py | 23 ++++++++++-- 6 files changed, 93 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77e017b..bb1bc5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [0.10.8] - 2024-01-16 +## [0.10.8] - 2024-01-17 ### Added - optional argument to specify the number of threads with `--parallel` diff --git a/README.md b/README.md index b912a44..4319556 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,9 @@ with the `--group` argument may be more than two images considered equal. Use the `--parallel` option to utilize all free cores on your system for calculating image hashes. Optionally, you can specify the number of processes to use with `--parallel $N`. +To execute the `--on-equal` actions in parallel, use the `--parallel-actions` option, which also can +take an optional number of processes to use as argument. + ### Excluding subfolders Use the `--exclude-dir` option to exclude subfolders of `$IMAGE_ROOT` from the search. The argument diff --git a/duplicate_images/duplicate.py b/duplicate_images/duplicate.py index 759632e..3afae2e 100755 --- a/duplicate_images/duplicate.py +++ b/duplicate_images/duplicate.py @@ -3,6 +3,7 @@ import logging import re from argparse import Namespace +from multiprocessing.pool import ThreadPool from os import walk, access, R_OK from pathlib import Path from typing import Callable, List, Optional @@ -12,7 +13,7 @@ from pillow_heif import register_heif_opener from duplicate_images.common import path_with_parent, log_execution_time -from duplicate_images.function_types import Results +from duplicate_images.function_types import Results, ImageGroup, ActionFunction from duplicate_images.hash_store import FileHashStore from duplicate_images.image_pair_finder import ImagePairFinder, PairFinderOptions from duplicate_images.log import setup_logging @@ -81,11 +82,19 @@ def get_matches( def execute_actions(matches: Results, args: Namespace) -> None: action_equal = ACTIONS_ON_EQUALITY[args.on_equal] - for group in sorted(matches): - try: - action_equal(args, group) - except FileNotFoundError: - continue + if args.parallel_actions: + with ThreadPool(args.parallel_actions) as pool: + pool.map(lambda group: execute_action(action_equal, group, args), matches) + else: + for group in sorted(matches): + execute_action(action_equal, group, args) + + +def execute_action(action: ActionFunction, group: ImageGroup, args: Namespace) -> None: + try: + action(args, group) + except FileNotFoundError: + pass def set_max_image_pixels(args: Namespace) -> None: diff --git a/duplicate_images/parse_commandline.py b/duplicate_images/parse_commandline.py index 48a2746..98c69d2 100644 --- a/duplicate_images/parse_commandline.py +++ b/duplicate_images/parse_commandline.py @@ -48,7 +48,11 @@ def parse_command_line(args: Optional[List[str]] = None) -> Namespace: ) parser.add_argument( '--parallel', nargs='?', type=int, default=None, const=cpu_count(), - help='Calculate hashes using PARALLEL threads (default: number of cores in the system)' + help=f'Calculate hashes using PARALLEL threads (default: {cpu_count()})' + ) + parser.add_argument( + '--parallel-actions', nargs='?', type=int, default=None, const=cpu_count(), + help=f'Execute actions on equal images using PARALLEL threads (default: {cpu_count()})' ) group = parser.add_mutually_exclusive_group() group.add_argument( diff --git a/tests/unit/test_actions.py b/tests/unit/test_actions.py index 953b42a..24631e4 100644 --- a/tests/unit/test_actions.py +++ b/tests/unit/test_actions.py @@ -2,8 +2,10 @@ import shlex from argparse import Namespace +from datetime import datetime, timedelta +from math import factorial from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, NamedTemporaryFile from typing import List, Generator, Tuple from unittest.mock import Mock, patch @@ -15,7 +17,7 @@ from duplicate_images.methods import IMAGE_HASH_ALGORITHM, quote from duplicate_images.pair_finder_options import PairFinderOptions from duplicate_images.parse_commandline import parse_command_line -from .conftest import create_jpg_and_png, create_half_jpg +from .conftest import create_jpg_and_png, create_half_jpg, create_image, IMAGE_WIDTH HASH_ALGORITHM = IMAGE_HASH_ALGORITHM['phash'] @@ -33,6 +35,22 @@ def fixture_equal_images( file.unlink(missing_ok=True) +@pytest.fixture(name='many_equal_images') +def fixture_many_equal_images( + top_directory: TemporaryDirectory, num_images: int +) -> Generator[List[Path], None, None]: + images = [] + for _ in range(num_images): + file_name = Path( + NamedTemporaryFile(dir=top_directory.name, prefix='jpeg_', suffix='.jpg').name + ) + create_image(file_name, IMAGE_WIDTH) + images.append(file_name) + yield images + for file in images: + file.unlink(missing_ok=True) + + def get_equals(equal_images: List[Path], group: bool) -> List[Tuple[Path, ...]]: equals = ImagePairFinder.create( equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=group) @@ -193,6 +211,35 @@ def test_symlink(equal_images: List[Path], option: str, group: bool): assert path.resolve() == relevant +@pytest.mark.parametrize('num_images', [7]) +@pytest.mark.parametrize('parallel', [4, 10, 20]) +@pytest.mark.parametrize('sleep_time', [0.005]) +def test_parallel_actions( + many_equal_images: List[Path], num_images: int, parallel: int, sleep_time: float +) -> None: + equals = ImagePairFinder.create( + many_equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=False) + ).get_equal_groups() + assert len(equals) == factorial(num_images) / (factorial(2) * factorial(num_images - 2)) + + execution_time_single = actions_execution_time( + equals, sleep_time, [] + ) + execution_time_parallel = actions_execution_time( + equals, sleep_time, ['--parallel-actions', str(parallel)] + ) + assert execution_time_parallel < execution_time_single + + +def actions_execution_time(equals: Results, sleep_time: float, extra_args: List[str]) -> timedelta: + args = parse_command_line( + ['.', '--on-equal', 'exec', '--exec', f'sleep {sleep_time}'] + extra_args + ) + start_time = datetime.now() + duplicate.execute_actions(equals, args) + return datetime.now() - start_time + + @pytest.mark.parametrize('option', ['unknown-option']) def test_unknown_option(option: str) -> None: with pytest.raises(SystemExit): diff --git a/tests/unit/test_parse_commandline.py b/tests/unit/test_parse_commandline.py index 24bd9b1..0628176 100644 --- a/tests/unit/test_parse_commandline.py +++ b/tests/unit/test_parse_commandline.py @@ -47,9 +47,26 @@ def test_parallel_default_arg() -> None: assert args.parallel == cpu_count() -def test_parallel_explicit_arg() -> None: - args = parse_command_line(['.', '--parallel', '2']) - assert args.parallel == 2 +@pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) +def test_parallel_explicit_arg(parallel) -> None: + args = parse_command_line(['.', '--parallel', parallel]) + assert args.parallel == int(parallel) + + +def test_parallel_actions_unspecified() -> None: + args = parse_command_line(['.']) + assert args.parallel_actions is None + + +def test_parallel_actions_default_arg() -> None: + args = parse_command_line(['.', '--parallel-actions']) + assert args.parallel_actions == cpu_count() + + +@pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) +def test_parallel_actions_explicit_arg(parallel) -> None: + args = parse_command_line(['.', '--parallel-actions', parallel]) + assert args.parallel_actions == int(parallel) def test_exclude_dir_unspecified() -> None: