From f57d6e916e17ee2ff574ba6096ccc21911d27533 Mon Sep 17 00:00:00 2001 From: Till Ehrengruber Date: Mon, 2 Dec 2024 20:02:44 +0100 Subject: [PATCH] fix[next]: Guard diskcache creation by file lock (#1745) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The disk cache used to cache compilation in the gtfn backend has a race condition manifesting itself in `sqlite3.OperationalError: database is locked` errors if multiple python processes try to initialize the `diskcache.Cache` object concurrently. This PR fixes this by guarding the object creation by a file-based lock in the same directory as the database. While this issue occurred frequently and was observed to be fixed on distributed file systems, the lock does not guarantee correct behavior in particular for accesses to the cache (beyond opening) since the underlying SQLite database is unreliable when stored on an NFS based file system. It does however ensure correctness of concurrent cache accesses on a local file system. See more information here: https://grantjenks.com/docs/diskcache/tutorial.html#settings https://www.sqlite.org/faq.html#q5 https://github.com/tox-dev/filelock/issues/73 NFS safe locking: https://gitlab.com/warsaw/flufl.lock [Barry Warsaw / FLUFL Lock ยท GitLab](https://gitlab.com/warsaw/flufl.lock) --- .pre-commit-config.yaml | 1 + constraints.txt | 8 ++--- min-extra-requirements-test.txt | 1 + min-requirements-test.txt | 1 + pyproject.toml | 1 + requirements-dev.txt | 8 ++--- .../next/program_processors/runners/gtfn.py | 32 ++++++++++++++++--- 7 files changed, 40 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c3b6e693f..7e1870c67f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -102,6 +102,7 @@ repos: - devtools==0.12.2 - diskcache==5.6.3 - factory-boy==3.3.1 + - filelock==3.16.1 - frozendict==2.4.6 - gridtools-cpp==2.3.8 - importlib-resources==6.4.5 diff --git a/constraints.txt b/constraints.txt index b4b8bc00d4..f039fa2125 100644 --- a/constraints.txt +++ b/constraints.txt @@ -49,7 +49,7 @@ executing==2.1.0 # via devtools, stack-data factory-boy==3.3.1 # via gt4py (pyproject.toml), pytest-factoryboy faker==33.0.0 # via factory-boy fastjsonschema==2.20.0 # via nbformat -filelock==3.16.1 # via tox, virtualenv +filelock==3.16.1 # via gt4py (pyproject.toml), tox, virtualenv fonttools==4.55.0 # via matplotlib fparser==0.1.4 # via dace frozendict==2.4.6 # via gt4py (pyproject.toml) @@ -113,8 +113,8 @@ psutil==6.1.0 # via -r requirements-dev.in, ipykernel, pytest-xdist ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data pybind11==2.13.6 # via gt4py (pyproject.toml) -pydantic==2.9.2 # via bump-my-version, pydantic-settings -pydantic-core==2.23.4 # via pydantic +pydantic==2.10.0 # via bump-my-version, pydantic-settings +pydantic-core==2.27.0 # via pydantic pydantic-settings==2.6.1 # via bump-my-version pydot==3.0.2 # via tach pygments==2.18.0 # via -r requirements-dev.in, devtools, ipython, nbmake, rich, sphinx @@ -159,7 +159,7 @@ stack-data==0.6.3 # via ipython stdlib-list==0.10.0 # via tach sympy==1.13.3 # via dace tabulate==0.9.0 # via gt4py (pyproject.toml) -tach==0.14.3 # via -r requirements-dev.in +tach==0.14.4 # via -r requirements-dev.in tomli==2.1.0 ; python_version < "3.11" # via -r requirements-dev.in, black, build, coverage, jupytext, mypy, pip-tools, pyproject-api, pytest, setuptools-scm, tach, tox tomli-w==1.0.0 # via tach tomlkit==0.13.2 # via bump-my-version diff --git a/min-extra-requirements-test.txt b/min-extra-requirements-test.txt index 57c0d3969d..d7679a1f0f 100644 --- a/min-extra-requirements-test.txt +++ b/min-extra-requirements-test.txt @@ -67,6 +67,7 @@ deepdiff==5.6.0 devtools==0.6 diskcache==5.6.3 factory-boy==3.3.0 +filelock==3.0.0 frozendict==2.3 gridtools-cpp==2.3.8 hypothesis==6.0.0 diff --git a/min-requirements-test.txt b/min-requirements-test.txt index 81a1c2dea3..cf505e88d6 100644 --- a/min-requirements-test.txt +++ b/min-requirements-test.txt @@ -63,6 +63,7 @@ deepdiff==5.6.0 devtools==0.6 diskcache==5.6.3 factory-boy==3.3.0 +filelock==3.0.0 frozendict==2.3 gridtools-cpp==2.3.8 hypothesis==6.0.0 diff --git a/pyproject.toml b/pyproject.toml index 02d301957c..1e24094fa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ 'devtools>=0.6', 'diskcache>=5.6.3', 'factory-boy>=3.3.0', + 'filelock>=3.0.0', 'frozendict>=2.3', 'gridtools-cpp>=2.3.8,==2.*', "importlib-resources>=5.0;python_version<'3.9'", diff --git a/requirements-dev.txt b/requirements-dev.txt index 9f95779fd5..6542be36f1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -49,7 +49,7 @@ executing==2.1.0 # via -c constraints.txt, devtools, stack-data factory-boy==3.3.1 # via -c constraints.txt, gt4py (pyproject.toml), pytest-factoryboy faker==33.0.0 # via -c constraints.txt, factory-boy fastjsonschema==2.20.0 # via -c constraints.txt, nbformat -filelock==3.16.1 # via -c constraints.txt, tox, virtualenv +filelock==3.16.1 # via -c constraints.txt, gt4py (pyproject.toml), tox, virtualenv fonttools==4.55.0 # via -c constraints.txt, matplotlib fparser==0.1.4 # via -c constraints.txt, dace frozendict==2.4.6 # via -c constraints.txt, gt4py (pyproject.toml) @@ -113,8 +113,8 @@ psutil==6.1.0 # via -c constraints.txt, -r requirements-dev.in, ipyk ptyprocess==0.7.0 # via -c constraints.txt, pexpect pure-eval==0.2.3 # via -c constraints.txt, stack-data pybind11==2.13.6 # via -c constraints.txt, gt4py (pyproject.toml) -pydantic==2.9.2 # via -c constraints.txt, bump-my-version, pydantic-settings -pydantic-core==2.23.4 # via -c constraints.txt, pydantic +pydantic==2.10.0 # via -c constraints.txt, bump-my-version, pydantic-settings +pydantic-core==2.27.0 # via -c constraints.txt, pydantic pydantic-settings==2.6.1 # via -c constraints.txt, bump-my-version pydot==3.0.2 # via -c constraints.txt, tach pygments==2.18.0 # via -c constraints.txt, -r requirements-dev.in, devtools, ipython, nbmake, rich, sphinx @@ -158,7 +158,7 @@ stack-data==0.6.3 # via -c constraints.txt, ipython stdlib-list==0.10.0 # via -c constraints.txt, tach sympy==1.13.3 # via -c constraints.txt, dace tabulate==0.9.0 # via -c constraints.txt, gt4py (pyproject.toml) -tach==0.14.3 # via -c constraints.txt, -r requirements-dev.in +tach==0.14.4 # via -c constraints.txt, -r requirements-dev.in tomli==2.1.0 ; python_version < "3.11" # via -c constraints.txt, -r requirements-dev.in, black, build, coverage, jupytext, mypy, pip-tools, pyproject-api, pytest, setuptools-scm, tach, tox tomli-w==1.0.0 # via -c constraints.txt, tach tomlkit==0.13.2 # via -c constraints.txt, bump-my-version diff --git a/src/gt4py/next/program_processors/runners/gtfn.py b/src/gt4py/next/program_processors/runners/gtfn.py index 1f3778f227..55f479c665 100644 --- a/src/gt4py/next/program_processors/runners/gtfn.py +++ b/src/gt4py/next/program_processors/runners/gtfn.py @@ -7,11 +7,14 @@ # SPDX-License-Identifier: BSD-3-Clause import functools +import pathlib +import tempfile import warnings from typing import Any, Optional import diskcache import factory +import filelock import gt4py._core.definitions as core_defs import gt4py.next.allocators as next_allocators @@ -139,13 +142,34 @@ def fingerprint_compilable_program(inp: stages.CompilableProgram) -> str: class FileCache(diskcache.Cache): """ - This class extends `diskcache.Cache` to ensure the cache is closed upon deletion, - i.e. it ensures that any resources associated with the cache are properly - released when the instance is garbage collected. + This class extends `diskcache.Cache` to ensure the cache is properly + - opened when accessed by multiple processes using a file lock. This guards the creating of the + cache object, which has been reported to cause `sqlite3.OperationalError: database is locked` + errors and slow startup times when multiple processes access the cache concurrently. While this + issue occurred frequently and was observed to be fixed on distributed file systems, the lock + does not guarantee correct behavior in particular for accesses to the cache (beyond opening) + since the underlying SQLite database is unreliable when stored on an NFS based file system. + It does however ensure correctness of concurrent cache accesses on a local file system. See + #1745 for more details. + - closed upon deletion, i.e. it ensures that any resources associated with the cache are + properly released when the instance is garbage collected. """ + def __init__(self, directory: Optional[str | pathlib.Path] = None, **settings: Any) -> None: + if directory: + lock_dir = pathlib.Path(directory).parent + else: + lock_dir = pathlib.Path(tempfile.gettempdir()) + + lock = filelock.FileLock(lock_dir / "file_cache.lock") + with lock: + super().__init__(directory=directory, **settings) + + self._init_complete = True + def __del__(self) -> None: - self.close() + if getattr(self, "_init_complete", False): # skip if `__init__` didn't finished + self.close() class GTFNCompileWorkflowFactory(factory.Factory):