Skip to content

Commit

Permalink
mark hash store as dirty when changed and only write back to disk if …
Browse files Browse the repository at this point in the history
…dirty
  • Loading branch information
lene committed Jan 25, 2024
1 parent 5975eb8 commit da37eb2
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 28 deletions.
1 change: 0 additions & 1 deletion duplicate_images/duplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ def get_matches(
with FileHashStore.create(hash_store_path, algorithm, hash_size_kwargs) as hash_store:
return ImagePairFinder.create(
image_files, hash_algorithm, options=options, hash_store=hash_store,

).get_equal_groups()


Expand Down
13 changes: 7 additions & 6 deletions duplicate_images/hash_scanner/image_hash_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from PIL.Image import DecompressionBombError

from duplicate_images.common import path_with_parent
from duplicate_images.function_types import Cache, CacheEntry, HashFunction
from duplicate_images.function_types import CacheEntry, HashFunction
from duplicate_images.hash_store import HashStore, NullHashStore
from duplicate_images.methods import get_hash_size_kwargs
from duplicate_images.pair_finder_options import PairFinderOptions
from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager
Expand All @@ -22,7 +23,7 @@ class ImageHashScanner:
def create(
files: List[Path], hash_algorithm: HashFunction,
options: PairFinderOptions,
hash_store: Optional[Cache] = None,
hash_store: HashStore = NullHashStore(),
progress_bars: ProgressBarManager = NullProgressBarManager()
) -> 'ImageHashScanner':
hash_size_kwargs = get_hash_size_kwargs(hash_algorithm, options.hash_size)
Expand All @@ -38,13 +39,13 @@ def create(
def __init__( # pylint: disable = too-many-arguments
self, files: List[Path], hash_algorithm: HashFunction,
hash_size_kwargs: Optional[Dict] = None,
hash_store: Optional[Cache] = None,
hash_store: HashStore = NullHashStore(),
progress_bars: ProgressBarManager = NullProgressBarManager()
) -> None:
self.files = files
self.algorithm = hash_algorithm
self.hash_size_kwargs = hash_size_kwargs if hash_size_kwargs is not None else {}
self.hash_store = hash_store if hash_store is not None else {}
self.hash_store = hash_store
self.progress_bars = progress_bars
logging.info('Using %s', self.class_string())

Expand All @@ -62,7 +63,7 @@ def get_hash(self, file: Path) -> CacheEntry:
return file, cached

image_hash = self.algorithm(Image.open(file), **self.hash_size_kwargs)
self.hash_store[file] = image_hash
self.hash_store.add(file, image_hash)
return file, image_hash
except OSError as err:
logging.warning('%s: %s', path_with_parent(file), err)
Expand All @@ -79,7 +80,7 @@ def __init__( # pylint: disable = too-many-arguments
self,
files: List[Path], hash_algorithm: HashFunction,
hash_size_kwargs: Optional[Dict] = None,
hash_store: Optional[Cache] = None,
hash_store: HashStore = NullHashStore(),
progress_bars: ProgressBarManager = NullProgressBarManager(),
parallel: int = os.cpu_count() or 1
) -> None:
Expand Down
36 changes: 26 additions & 10 deletions duplicate_images/hash_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,27 @@ class NullHashStore:
def __init__(self) -> None:
    """Create a no-op hash store; hashes are recomputed on every run."""
    logging.info('No persistent storage for calculated image hashes set up')

def __enter__(self) -> Cache:
return {}
def __enter__(self) -> 'NullHashStore':
    # Context-manager entry: nothing to acquire, hand back the store itself.
    return self

def __exit__(self, _: Any, __: Any, ___: Any) -> None:
    # Nothing to flush or close for the null store.
    pass

def get(self, _: Path) -> Optional[ImageHash]:
    # The null store never has a cached hash, so callers always recalculate.
    return None

def add(self, _: Path, __: ImageHash) -> None:
    # Intentionally discard the hash: nothing is persisted.
    pass


HashStore = Union[NullHashStore, 'FileHashStore', 'PickleHashStore', 'JSONHashStore']


class FileHashStore:

@classmethod
@staticmethod
def create(
cls, store_path: Optional[Path], algorithm: str, hash_size_kwargs: Dict
store_path: Optional[Path], algorithm: str, hash_size_kwargs: Dict
) -> Union['FileHashStore', NullHashStore]:
if store_path is None:
return NullHashStore()
Expand All @@ -41,24 +50,34 @@ def __init__(self, store_path: Path, algorithm: str, hash_size_kwargs: Dict) ->
self.algorithm = algorithm
self.hash_size_kwargs = hash_size_kwargs
self.values: Cache = {}
self.dirty: bool = False
try:
self.load()
logging.info(
'Opened persistent storage %s with %d entries', store_path, len(self.values)
)
except (FileNotFoundError, EOFError, pickle.PickleError):
logging.info('Creating new persistent storage at %s', store_path)
logging.info('Creating new %s at %s', self.__class__.__name__, store_path)

def __enter__(self) -> Cache:
return self.values
def __enter__(self) -> 'FileHashStore':
return self

def __exit__(self, _: Any, __: Any, ___: Any) -> None:
    """Write the store back to disk on exit, but only if it was changed.

    If a file already exists at the store path, it is preserved as a
    ``.bak`` backup (replacing any previous backup) before the new
    contents are dumped.
    """
    if not self.dirty:
        # Nothing was added since loading - skip the write entirely.
        return
    if self.store_path.is_file():
        # NOTE(review): with_suffix() replaces the existing suffix, so
        # '.pickle' and '.json' stores at the same stem would share one
        # '.bak' path - confirm that is acceptable.
        if self.store_path.with_suffix('.bak').is_file():
            self.store_path.with_suffix('.bak').unlink()
        self.store_path.rename(self.store_path.with_suffix('.bak'))
    self.dump()

def add(self, file: Path, image_hash: ImageHash) -> None:
    """Cache the hash for a file and mark the store as needing a write-back."""
    self.values[file] = image_hash
    self.dirty = True

def get(self, file: Path) -> Optional[ImageHash]:
    """Return the cached hash for a file, or None if none has been stored."""
    return self.values.get(file)

def metadata(self) -> Dict:
    """Parameters identifying how the stored hashes were computed."""
    info: Dict = {'algorithm': self.algorithm}
    # Hash-size kwargs override 'algorithm' on a key clash, matching the
    # original `{'algorithm': ..., **kwargs}` merge semantics.
    info.update(self.hash_size_kwargs)
    return info

Expand Down Expand Up @@ -140,6 +159,3 @@ def converted_values(self):
def dump(self) -> None:
with self.store_path.open('w') as file:
json.dump((self.converted_values(), self.metadata()), file)


HashStore = Union[NullHashStore, PickleHashStore, JSONHashStore]
7 changes: 4 additions & 3 deletions duplicate_images/image_pair_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
from itertools import combinations
from pathlib import Path
from time import time
from typing import Dict, List, Iterator, Optional
from typing import Dict, List, Iterator

from imagehash import ImageHash

from duplicate_images.common import log_execution_time
from duplicate_images.function_types import (
Cache, HashFunction, ImageGroup, Results, ResultsGenerator, ResultsGrouper
HashFunction, ImageGroup, Results, ResultsGenerator, ResultsGrouper
)
from duplicate_images.hash_scanner import ImageHashScanner
from duplicate_images.hash_store import HashStore, NullHashStore
from duplicate_images.pair_finder_options import PairFinderOptions
from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager

Expand All @@ -35,7 +36,7 @@ class ImagePairFinder:
def create(
cls, files: List[Path], hash_algorithm: HashFunction,
options: PairFinderOptions = PairFinderOptions(),
hash_store: Optional[Cache] = None
hash_store: HashStore = NullHashStore()
) -> 'ImagePairFinder':
group_results = group_results_as_tuples if options.group else group_results_as_pairs
progress_bars = ProgressBarManager.create(len(files), options.show_progress_bars)
Expand Down
4 changes: 3 additions & 1 deletion tests/integration/test_persistent_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_open_hash_store_with_filename(
creation_time = cache_file.stat().st_ctime
get_matches([folder], 'phash', hash_store_path=cache_file)
assert average_hash.call_count == 0
assert cache_file.stat().st_ctime > creation_time
assert cache_file.stat().st_atime > creation_time


@pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different'])
Expand Down Expand Up @@ -138,10 +138,12 @@ def test_opening_with_different_algorithm_parameters_leads_to_error(
tmp_dir: Path, data_dir: Path, test_set: str, file_type: str, hash_size: Tuple[int, int]
) -> None:
cache_file = tmp_dir / f'hash_store.{file_type}'
assert not cache_file.is_file()
get_matches(
[data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[0]),
hash_store_path=cache_file
)
assert cache_file.is_file()
with pytest.raises(ValueError, match='Metadata mismatch'):
get_matches(
[data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[1]),
Expand Down
51 changes: 44 additions & 7 deletions tests/unit/test_persistent_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,35 @@
from itertools import combinations
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, List
from time import sleep
from typing import List

import pytest

from duplicate_images import duplicate
from duplicate_images.function_types import Cache
from duplicate_images.image_pair_finder import ImagePairFinder
from duplicate_images.pair_finder_options import PairFinderOptions
from duplicate_images.hash_store import PickleHashStore, JSONHashStore, FileHashStore
from .conftest import MOCK_IMAGE_HASH_VALUE, mock_algorithm
from duplicate_images.hash_store import (
PickleHashStore, JSONHashStore, FileHashStore, HashStore, NullHashStore
)
from .conftest import MOCK_IMAGE_HASH_VALUE, mock_algorithm, create_jpg_and_png

DEFAULT_ALGORITHM = 'phash'
DEFAULT_HASH_SIZE = {'hash_size': 8}
DEFAULT_METADATA = {'algorithm': DEFAULT_ALGORITHM, **DEFAULT_HASH_SIZE}


class MockHashStore(FileHashStore):  # pylint: disable=abstract-method
    """Test double: a FileHashStore pre-filled with given values, never touching disk."""
    def __init__(self, values: Cache) -> None:  # pylint: disable=super-init-not-called
        # Skip FileHashStore.__init__ on purpose so no file is opened or loaded.
        # NOTE(review): self.dirty is never set, so using this mock as a
        # context manager would raise AttributeError in __exit__ - fine for
        # the direct (non-with) use in these tests, but confirm.
        self.values = values


def test_empty_hash_store_calculates_hash_values(
    top_directory: TemporaryDirectory, image_files: List[Path],
    reset_call_count  # pylint: disable=unused-argument
) -> None:
    """With a NullHashStore there is no cache, so the hash algorithm must run."""
    finder = generate_pair_finder(top_directory, NullHashStore())
    # creating the finder already scans the images, invoking the algorithm
    assert mock_algorithm.call_count > 0
    check_correct_results(finder, image_files)

Expand All @@ -34,15 +42,15 @@ def test_filled_hash_store_does_not_calculate_hash_values(
top_directory: TemporaryDirectory, image_files: List[Path],
reset_call_count # pylint: disable=unused-argument
) -> None:
hash_store = {path: MOCK_IMAGE_HASH_VALUE for path in image_files}
hash_store = MockHashStore({path: MOCK_IMAGE_HASH_VALUE for path in image_files})
generate_pair_finder(top_directory, hash_store)
assert mock_algorithm.call_count == 0


def test_empty_hash_store_is_filled(
top_directory: TemporaryDirectory, reset_call_count # pylint: disable=unused-argument
) -> None:
finder = generate_pair_finder(top_directory, None)
finder = generate_pair_finder(top_directory, NullHashStore())
original_call_number = mock_algorithm.call_count
finder.get_equal_groups()
assert mock_algorithm.call_count == original_call_number
Expand Down Expand Up @@ -133,12 +141,36 @@ def test_checked_load_sets_metadata(
assert hash_store.metadata() == DEFAULT_METADATA


@pytest.mark.parametrize('file_type', ['pickle', 'json'])
def test_hash_store_not_written_if_not_changed(
    top_directory: TemporaryDirectory, hash_store_path: Path
) -> None:
    """A repeat scan with all hashes already cached must not rewrite the store file."""
    create_verified_hash_store(top_directory, hash_store_path)
    assert hash_store_path.is_file()
    creation_time = hash_store_path.stat().st_ctime
    scan_images_with_hash_store(top_directory, hash_store_path)
    # unchanged ctime and mtime => the file was neither replaced nor rewritten
    assert hash_store_path.stat().st_ctime == creation_time
    assert hash_store_path.stat().st_mtime == creation_time

@pytest.mark.parametrize('file_type', ['pickle', 'json'])
def test_hash_store_is_accessed_even_if_not_changed(
    top_directory: TemporaryDirectory, hash_store_path: Path
) -> None:
    """An unchanged store is still read from disk (access time advances) on a repeat scan."""
    create_verified_hash_store(top_directory, hash_store_path)
    assert hash_store_path.is_file()
    sleep(0.01)  # ensure the access time is different
    creation_time = hash_store_path.stat().st_ctime
    scan_images_with_hash_store(top_directory, hash_store_path)
    # NOTE(review): st_atime comparisons are flaky on filesystems mounted
    # with noatime/relatime, where reads may not update atime - confirm the
    # CI environment before relying on this assertion.
    assert hash_store_path.stat().st_atime > creation_time


def image_list(top_directory: TemporaryDirectory) -> List[Path]:
    """All image files found under the temporary directory, in sorted order."""
    found = duplicate.files_in_dirs([top_directory.name])
    return sorted(found)


def generate_pair_finder(
top_directory: TemporaryDirectory, hash_store: Optional[Cache]
top_directory: TemporaryDirectory, hash_store: HashStore
) -> ImagePairFinder:
return ImagePairFinder.create(
image_list(top_directory), mock_algorithm, options=PairFinderOptions(slow=True),
Expand All @@ -147,6 +179,11 @@ def generate_pair_finder(


def create_verified_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None:
    """Populate the directory with test images and scan them so the store file is written."""
    create_jpg_and_png(top_directory)
    scan_images_with_hash_store(top_directory, store_path)


def scan_images_with_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None:
    """Run a duplicate-image scan backed by a FileHashStore at the given path."""
    # The with-block triggers FileHashStore.__exit__, i.e. the (conditional)
    # write-back to disk that these tests observe.
    with FileHashStore.create(store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store:
        finder = generate_pair_finder(top_directory, hash_store)
        finder.get_equal_groups()
Expand Down

0 comments on commit da37eb2

Please sign in to comment.