diff --git a/.flake8 b/.flake8 index 7df7ef1..18e8b07 100644 --- a/.flake8 +++ b/.flake8 @@ -11,6 +11,6 @@ exclude = docs/source/conf.py max-line-length = 88 select = C,E,F,W,B,B950 -extend-ignore = E203,E501,E129,W503 +extend-ignore = E203,E501,E129,W503,E704 per-file-ignores = setup.py:F401 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 202b022..f715670 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,39 +1,40 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black - exclude: ^(fileformats/core/_version\.py)$ - args: - - -l 88 - - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 - hooks: - - id: codespell - exclude: ^(fileformats/core/_version\.py)$ - args: - - --ignore-words=.codespell-ignorewords - - repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 - hooks: - - id: flake8 - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.2 - hooks: - - id: mypy - args: - [ - --strict, - --install-types, - --non-interactive, - --no-warn-unused-ignores, - ] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + exclude: ^(fileformats/core/_version\.py)$ + args: + - -l 88 + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + exclude: ^(fileformats/core/_version\.py)$ + args: + - --ignore-words=.codespell-ignorewords + - repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.11.2 + hooks: + - id: mypy + args: + [ + --strict, + --install-types, + --non-interactive, + --no-warn-unused-ignores, + ] + exclude: tests diff --git a/fileformats/core/__init__.py b/fileformats/core/__init__.py index 7d09206..6fb5713 100644 --- a/fileformats/core/__init__.py +++ b/fileformats/core/__init__.py @@ -11,6 +11,7 @@ ) from .sampling import SampleFileGenerator from .extras import extra, extra_implementation, converter +from .decorators import mtime_cached_property __all__ = [ "__version__", @@ -27,4 +28,5 @@ "extra", "extra_implementation", "converter", + "mtime_cached_property", ] diff --git a/fileformats/core/classifier.py b/fileformats/core/classifier.py index 1f7fc3d..c91021d 100644 --- a/fileformats/core/classifier.py +++ b/fileformats/core/classifier.py @@ -1,5 +1,5 @@ import typing as ty -from .utils import classproperty +from .decorators import classproperty from .exceptions import FormatDefinitionError diff --git a/fileformats/core/datatype.py b/fileformats/core/datatype.py index 8e7d14f..bebfcd4 100644 --- a/fileformats/core/datatype.py +++ b/fileformats/core/datatype.py @@ -11,8 +11,8 @@ FormatConversionError, FormatRecognitionError, ) +from .decorators import classproperty from .utils import ( - classproperty, subpackages, add_exc_note, ) diff --git a/fileformats/core/decorators.py b/fileformats/core/decorators.py new file mode 100644 index 0000000..ef3f056 --- /dev/null +++ b/fileformats/core/decorators.py @@ -0,0 +1,114 @@ +import sys +import typing as ty +from pathlib import Path +import time +from threading import RLock +import fileformats.core +from .fs_mount_identifier import FsMountIdentifier + + +PropReturn = ty.TypeVar("PropReturn") + + +__all__ = ["mtime_cached_property", "classproperty"] + + +class mtime_cached_property: + """A property that is cached until the mtimes of the files in the fileset are changed""" + + def __init__(self, func: ty.Callable[..., ty.Any]): + self.func = func + self.__doc__ = func.__doc__ + self.lock = RLock() + self._cache_name = f"_{func.__name__}_mtime_cache" + + def __get__( + self, + instance: ty.Optional["fileformats.core.FileSet"], + owner: ty.Optional[ty.Type["fileformats.core.FileSet"]] = None, + ) -> ty.Any: + if instance is None: # if accessing property from class not instance + return self + assert isinstance(instance, fileformats.core.FileSet), ( + "Cannot use mtime_cached_property instance with " + f"{type(instance).__name__!r} object, only FileSet objects." + ) + try: + mtimes, value = instance.__dict__[self._cache_name] + except KeyError: + pass + else: + if ( + instance.mtimes == mtimes + and enough_time_has_elapsed_given_mtime_resolution(mtimes) + ): + return value + else: + del instance.__dict__[self._cache_name] + with self.lock: + # check if another thread filled cache while we awaited lock + try: + mtimes, value = instance.__dict__[self._cache_name] + except KeyError: + pass + else: + if ( + instance.mtimes == mtimes + and enough_time_has_elapsed_given_mtime_resolution(mtimes) + ): + return value + value = self.func(instance) + instance.__dict__[self._cache_name] = (instance.mtimes, value) + return value + + +def classproperty(meth: ty.Callable[..., PropReturn]) -> PropReturn: + """Access a @classmethod like a @property.""" + # mypy doesn't understand class properties yet: https://github.com/python/mypy/issues/2563 + return classmethod(property(meth)) # type: ignore + + +if sys.version_info[:2] < (3, 9): + + class classproperty(object): # type: ignore[no-redef] # noqa + def __init__(self, f: ty.Callable[[ty.Type[ty.Any]], ty.Any]): + self.f = f + + def __get__(self, obj: ty.Any, owner: ty.Any) -> ty.Any: + return self.f(owner) + + +def enough_time_has_elapsed_given_mtime_resolution( + mtimes: ty.Iterable[ty.Tuple[Path, ty.Union[int, float]]], + current_time: ty.Optional[int] = None, +) -> bool: + """Determines whether enough time has elapsed since the the last of the cached mtimes + to be sure that changes in mtimes will be detected given the resolution of the mtimes + on the file system. For example, on systems with a mtime resolution of a second, + a change in mtime may not be detected if the cache is re-read within a second and + the file is modified in the intervening period (probably only likely during tests). + So this function guesses the resolution of the mtimes by the + minimum number of trailing zeros in the mtimes and then checks if enough time has + passed to be sure that any changes in mtimes will be detected. + + This may result in false negatives for systems with low mtime resolutions, but + this will just result in (presumably minor) performance hit via unnecessary cache + invalidations. + + Parameters + ---------- + mtimes : Iterable[tuple[Path, int]] + the path-mtime pairs in nanoseconds to guess the resolution of + + Returns + ------- + bool + whether enough time has elapsed since the lagiven the guessed resolution of the mtimes + """ + if current_time is None: + current_time = time.time_ns() + for fspath, mtime in mtimes: + mtime_resolution = FsMountIdentifier.get_mtime_resolution(fspath) + if current_time - mtime <= mtime_resolution: + return False + return True diff --git a/fileformats/core/field.py b/fileformats/core/field.py index b79eb54..847b39c 100644 --- a/fileformats/core/field.py +++ b/fileformats/core/field.py @@ -1,8 +1,6 @@ from __future__ import annotations import typing as ty -from .utils import ( - classproperty, -) +from .decorators import classproperty from .datatype import DataType from .exceptions import FormatMismatchError diff --git a/fileformats/core/fileset.py b/fileformats/core/fileset.py index da3a07d..9b4e02f 100644 --- a/fileformats/core/fileset.py +++ b/fileformats/core/fileset.py @@ -16,13 +16,12 @@ import logging from fileformats.core.typing import Self from .utils import ( - classproperty, - mtime_cached_property, fspaths_converter, describe_task, matching_source, import_extras_module, ) +from .decorators import mtime_cached_property, classproperty from .typing import FspathsInputType, CryptoMethod, PathType from .sampling import SampleFileGenerator from .identification import ( @@ -187,16 +186,16 @@ def relative_fspaths(self) -> ty.Iterator[Path]: return (p.relative_to(self.parent) for p in self.fspaths) @property - def mtimes(self) -> ty.Tuple[ty.Tuple[str, float], ...]: + def mtimes(self) -> ty.Tuple[ty.Tuple[str, int], ...]: """Modification times of all fspaths in the file-set Returns ------- - tuple[tuple[str, float], ...] - a tuple of tuples containing the file paths and the modification time sorted - by the file path + tuple[tuple[str, int], ...] + a tuple of tuples containing the file paths and the modification time (ns) + sorted by the file path """ - return tuple((str(p), p.stat().st_mtime) for p in sorted(self.fspaths)) + return tuple((str(p), p.stat().st_mtime_ns) for p in sorted(self.fspaths)) @classproperty def mime_type(cls) -> str: diff --git a/fileformats/core/fs_mount_identifier.py b/fileformats/core/fs_mount_identifier.py index 6502a9c..14ef5ae 100644 --- a/fileformats/core/fs_mount_identifier.py +++ b/fileformats/core/fs_mount_identifier.py @@ -155,4 +155,39 @@ def patch_table(cls, mount_table: ty.List[ty.Tuple[str, str]]) -> ty.Iterator[No finally: cls._mount_table = orig_table + @classmethod + def get_mtime_resolution(cls, path: PathLike) -> int: + """Get the mount point for a given file-system path + + Parameters + ---------- + path: os.PathLike + the file-system path to identify the mount of + + Returns + ------- + mount_point: os.PathLike + the root of the mount the path sits on + fstype : str + the type of the file-system (e.g. ext4 or cifs)""" + mount_point, fstype = cls.get_mount(path) + try: + resolution = cls.FS_MAX_MTIME_NS_RESOLUTION[fstype] + except KeyError: + resolution = max(cls.FS_MAX_MTIME_NS_RESOLUTION.values()) + return resolution + _mount_table: ty.Optional[ty.List[ty.Tuple[str, str]]] = None + + # Define a table of file system types and their mtime resolutions (in seconds) + FS_MAX_MTIME_NS_RESOLUTION: ty.Dict[str, int] = { + "ext4": int(1e9), # docs say 1 nanosecond but in found 1 sec often in practice + "xfs": 1, + "btrfs": 1, + "ntfs": 100, + "hfs": 1, + "apfs": 1, + "fat32": int(2e9), # 2 seconds + "exfat": int(1e9), # 1 second + # Add more file systems and their resolutions as needed + } diff --git a/fileformats/core/mixin.py b/fileformats/core/mixin.py index d7d2540..89b8a03 100644 --- a/fileformats/core/mixin.py +++ b/fileformats/core/mixin.py @@ -4,7 +4,8 @@ import logging from .datatype import DataType import fileformats.core -from .utils import classproperty, describe_task, matching_source +from .utils import describe_task, matching_source +from .decorators import classproperty from .identification import to_mime_format_name from .converter_helpers import SubtypeVar, ConverterSpec from .classifier import Classifier diff --git a/fileformats/core/tests/test_decorators.py b/fileformats/core/tests/test_decorators.py new file mode 100644 index 0000000..42a4c2e --- /dev/null +++ b/fileformats/core/tests/test_decorators.py @@ -0,0 +1,45 @@ +from pathlib import Path +import time +from fileformats.core.decorators import ( + mtime_cached_property, + enough_time_has_elapsed_given_mtime_resolution, +) +from fileformats.generic import File + + +class MtimeTestFile(File): + + flag: int + + @mtime_cached_property + def cached_prop(self): + return self.flag + + +def test_mtime_cached_property(tmp_path: Path): + fspath = tmp_path / "file_1.txt" + fspath.write_text("hello") + + file = MtimeTestFile(fspath) + time.sleep( + 2 + ) # ensure enough time has elapsed since file creation/modification for mtime to increment + + file.flag = 0 + assert file.cached_prop == 0 + file.flag = 1 + assert file.cached_prop == 0 + fspath.write_text("world") + assert file.cached_prop == 1 + + +def test_enough_time_has_elapsed_given_mtime_resolution(): + assert enough_time_has_elapsed_given_mtime_resolution( + [("", 110), ("", 220), ("", 300)], int(3e9) # need to make it high for windows + ) + + +def test_not_enough_time_has_elapsed_given_mtime_resolution(): + assert not enough_time_has_elapsed_given_mtime_resolution( + [("", 110), ("", 220), ("", 300)], 301 + ) diff --git a/fileformats/core/tests/test_utils.py b/fileformats/core/tests/test_utils.py index 21ce5fc..071fdf7 100644 --- a/fileformats/core/tests/test_utils.py +++ b/fileformats/core/tests/test_utils.py @@ -8,7 +8,6 @@ from fileformats.generic import File, Directory, FsObject from fileformats.core.mixin import WithSeparateHeader from fileformats.core.exceptions import UnsatisfiableCopyModeError -from fileformats.core.utils import mtime_cached_property from conftest import write_test_file @@ -366,44 +365,3 @@ def test_hash_files(fsobject: FsObject, work_dir: Path, dest_dir: Path): ) cpy = fsobject.copy(dest_dir) assert cpy.hash_files() == fsobject.hash_files() - - -class MtimeTestFile(File): - - flag: int - - @mtime_cached_property - def cached_prop(self): - return self.flag - - -def test_mtime_cached_property(tmp_path: Path): - fspath = tmp_path / "file_1.txt" - fspath.write_text("hello") - - file = MtimeTestFile(fspath) - - file.flag = 0 - assert file.cached_prop == 0 - # Need a long delay to ensure the mtime changes on Ubuntu and particularly on Windows - # On MacOS, the mtime resolution is much higher so not usually an issue. Use - # explicitly cache clearing if needed - time.sleep(2) - file.flag = 1 - assert file.cached_prop == 0 - time.sleep(2) - fspath.write_text("world") - assert file.cached_prop == 1 - - -def test_mtime_cached_property_force_clear(tmp_path: Path): - fspath = tmp_path / "file_1.txt" - fspath.write_text("hello") - - file = MtimeTestFile(fspath) - - file.flag = 0 - assert file.cached_prop == 0 - file.flag = 1 - MtimeTestFile.cached_prop.clear(file) - assert file.cached_prop == 1 diff --git a/fileformats/core/utils.py b/fileformats/core/utils.py index e7a5845..24ec888 100644 --- a/fileformats/core/utils.py +++ b/fileformats/core/utils.py @@ -1,12 +1,10 @@ import importlib from pathlib import Path import inspect -import sys import typing as ty from types import ModuleType import urllib.request import urllib.error -from threading import RLock import os import logging import pkgutil @@ -96,25 +94,6 @@ def fspaths_converter(fspaths: FspathsInputType) -> ty.FrozenSet[Path]: return frozenset(Path(p).absolute() for p in fspaths) -PropReturn = ty.TypeVar("PropReturn") - - -def classproperty(meth: ty.Callable[..., PropReturn]) -> PropReturn: - """Access a @classmethod like a @property.""" - # mypy doesn't understand class properties yet: https://github.com/python/mypy/issues/2563 - return classmethod(property(meth)) # type: ignore - - -if sys.version_info[:2] < (3, 9): - - class classproperty(object): # type: ignore[no-redef] # noqa - def __init__(self, f: ty.Callable[[ty.Type[ty.Any]], ty.Any]): - self.f = f - - def __get__(self, obj: ty.Any, owner: ty.Any) -> ty.Any: - return self.f(owner) - - def add_exc_note(e: Exception, note: str) -> Exception: """Adds a note to an exception in a Python <3.11 compatible way @@ -249,54 +228,3 @@ def import_extras_module(klass: ty.Type["fileformats.core.DataType"]) -> ExtrasM else: extras_imported = True return ExtrasModule(extras_imported, extras_pkg, extras_pypi) - - -ReturnType = ty.TypeVar("ReturnType") - - -class mtime_cached_property: - """A property that is cached until the mtimes of the files in the fileset are changed""" - - def __init__(self, func: ty.Callable[..., ty.Any]): - self.func = func - self.__doc__ = func.__doc__ - self.lock = RLock() - - @property - def _cache_name(self) -> str: - return f"_{self.func.__name__}_mtime_cache" - - def clear(self, instance: "fileformats.core.FileSet") -> None: - """Forcibly clear the cache""" - del instance.__dict__[self._cache_name] - - def __get__( - self, - instance: ty.Optional["fileformats.core.FileSet"], - owner: ty.Optional[ty.Type["fileformats.core.FileSet"]] = None, - ) -> ty.Any: - if instance is None: # if accessing property from class not instance - return self - assert isinstance(instance, fileformats.core.FileSet), ( - "Cannot use mtime_cached_property instance with " - f"{type(instance).__name__!r} object, only FileSet objects." - ) - try: - mtimes, value = instance.__dict__[self._cache_name] - except KeyError: - pass - else: - if instance.mtimes == mtimes: - return value - with self.lock: - # check if another thread filled cache while we awaited lock - try: - mtimes, value = instance.__dict__[self._cache_name] - except KeyError: - pass - else: - if instance.mtimes == mtimes: - return value - value = self.func(instance) - instance.__dict__[self._cache_name] = (instance.mtimes, value) - return value diff --git a/fileformats/generic/directory.py b/fileformats/generic/directory.py index b1a29fb..75784cd 100644 --- a/fileformats/generic/directory.py +++ b/fileformats/generic/directory.py @@ -1,7 +1,7 @@ import typing as ty from pathlib import Path from fileformats.core.exceptions import FormatMismatchError -from fileformats.core.utils import classproperty +from fileformats.core.decorators import classproperty from .fsobject import FsObject from fileformats.core.fileset import FileSet, FILE_CHUNK_LEN_DEFAULT from fileformats.core.mixin import WithClassifiers diff --git a/fileformats/generic/file.py b/fileformats/generic/file.py index ba096eb..6098051 100644 --- a/fileformats/generic/file.py +++ b/fileformats/generic/file.py @@ -1,3 +1,4 @@ +import io from pathlib import Path import typing as ty from fileformats.core.fileset import FileSet @@ -5,7 +6,7 @@ FormatMismatchError, UnconstrainedExtensionException, ) -from fileformats.core.utils import classproperty, mtime_cached_property +from fileformats.core.decorators import classproperty, mtime_cached_property from .fsobject import FsObject @@ -107,7 +108,7 @@ def read_contents( ) -> ty.Union[str, bytes]: with self.open() as f: if offset: - f.read(offset) + f.seek(offset, (io.SEEK_SET if offset >= 0 else io.SEEK_END)) contents = f.read(size) if size else f.read() return contents diff --git a/fileformats/generic/fsobject.py b/fileformats/generic/fsobject.py index 6deb42f..a468638 100644 --- a/fileformats/generic/fsobject.py +++ b/fileformats/generic/fsobject.py @@ -6,7 +6,7 @@ from fileformats.core.exceptions import ( FormatMismatchError, ) -from fileformats.core.utils import classproperty +from fileformats.core.decorators import classproperty class FsObject(FileSet, os.PathLike): # type: ignore diff --git a/fileformats/generic/set.py b/fileformats/generic/set.py index 2da783b..abb55b6 100644 --- a/fileformats/generic/set.py +++ b/fileformats/generic/set.py @@ -3,7 +3,7 @@ from fileformats.core.exceptions import ( FormatMismatchError, ) -from fileformats.core.utils import mtime_cached_property +from fileformats.core.decorators import mtime_cached_property from fileformats.core.mixin import WithClassifiers diff --git a/fileformats/image/raster.py b/fileformats/image/raster.py index 12ef1a5..0e391ed 100644 --- a/fileformats/image/raster.py +++ b/fileformats/image/raster.py @@ -23,11 +23,11 @@ class RasterImage(Image): @extra def read_data(self) -> DataArrayType: - raise NotImplementedError + ... @extra def write_data(self, data_array: DataArrayType) -> None: - raise NotImplementedError + ... @classmethod def save_new(cls, fspath: Path, data_array: DataArrayType) -> Self: