From f9a9e8ac47e0d9112b9b408d5e1cac45b6119a70 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Fri, 17 May 2024 09:00:45 -0400 Subject: [PATCH 1/2] LocalCache: limit allowed blob size --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 15 +++++---- .../LindiReferenceFileSystemStore.py | 5 ++- lindi/LindiRemfile/LindiRemfile.py | 31 ++++++++++++------- lindi/LocalCache/LocalCache.py | 5 +++ tests/test_local_cache.py | 28 ++++++++++++++++- 5 files changed, 64 insertions(+), 20 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 86cc173..a76e203 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -345,12 +345,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str): buf = _read_bytes(self._file, byte_offset, byte_count) if self._local_cache is not None: assert self._url is not None, "Unexpected: url is None but local_cache is not None" - self._local_cache.put_remote_chunk( - url=self._url, - offset=byte_offset, - size=byte_count, - data=buf - ) + if byte_count < 1000 * 1000 * 900: + self._local_cache.put_remote_chunk( + url=self._url, + offset=byte_offset, + size=byte_count, + data=buf + ) + else: + print(f"Warning: Not storing chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") return buf def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index 8fea9f8..19b2e80 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -141,7 +141,10 @@ def __getitem__(self, key: str): return x val = _read_bytes_from_url_or_path(url, offset, length) if self.local_cache is not None: - self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) + if length < 1000 * 1000 * 900: + self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) + else: + print(f'Warning: not caching chunk of size {length} om LindiReferenceFileSystemStore (key: {key})') return val else: # should not happen given checks in __init__, but self.rfs is mutable diff --git a/lindi/LindiRemfile/LindiRemfile.py b/lindi/LindiRemfile/LindiRemfile.py index db3e7ec..af11f60 100644 --- a/lindi/LindiRemfile/LindiRemfile.py +++ b/lindi/LindiRemfile/LindiRemfile.py @@ -217,12 +217,16 @@ def _load_chunk(self, chunk_index: int) -> bytes: if self._local_cache is None: self._memory_chunks[chunk_index] = x if self._local_cache is not None: - self._local_cache.put_remote_chunk( - url=self._url, - offset=chunk_index * self._min_chunk_size, - size=min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size), - data=x - ) + size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size) + if size < 1000 * 1000 * 900: + self._local_cache.put_remote_chunk( + url=self._url, + offset=chunk_index * self._min_chunk_size, + size=size, + data=x + ) + else: + raise Exception(f'Unexpected large size for chunk when caching: {size}') self._memory_chunk_indices.append(chunk_index) else: for i in range(self._smart_loader_chunk_sequence_length): @@ -238,12 +242,15 @@ def _load_chunk(self, chunk_index: int) -> bytes: data = x[i * self._min_chunk_size: (i + 1) * self._min_chunk_size] if len(data) != size: raise ValueError(f'Unexpected: len(data) != size: {len(data)} != {size}') - self._local_cache.put_remote_chunk( - url=self._url, - offset=(chunk_index + i) * self._min_chunk_size, - size=size, - data=data - ) + if size < 1000 * 1000 * 900: + self._local_cache.put_remote_chunk( + url=self._url, + offset=(chunk_index + i) * self._min_chunk_size, + size=size, + data=data + ) + else: + raise Exception(f'Unexpected large size for chunk when caching: {size}') self._smart_loader_last_chunk_index_accessed = ( chunk_index + self._smart_loader_chunk_sequence_length - 1 ) diff --git a/lindi/LocalCache/LocalCache.py b/lindi/LocalCache/LocalCache.py index a244eb7..9b65586 100644 --- a/lindi/LocalCache/LocalCache.py +++ b/lindi/LocalCache/LocalCache.py @@ -18,6 +18,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): if len(data) != size: raise ValueError("data size does not match size") + if size >= 1000 * 1000 * 900: + # This is a sqlite limitation/configuration + # https://www.sqlite.org/limits.html + # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine + raise ValueError("Cannot store blobs larger than 900 MB in LocalCache") self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data) diff --git a/tests/test_local_cache.py b/tests/test_local_cache.py index 4f7644e..eda23ec 100644 --- a/tests/test_local_cache.py +++ b/tests/test_local_cache.py @@ -45,5 +45,31 @@ def test_remote_data_1(): assert elapsed_1 < elapsed_0 * 0.3 # type: ignore +def test_put_local_cache(): + with tempfile.TemporaryDirectory() as tmpdir: + local_cache = lindi.LocalCache(cache_dir=tmpdir + '/local_cache') + data = b'x' * (1000 * 1000 * 900 - 1) + local_cache.put_remote_chunk( + url='dummy', + offset=0, + size=len(data), + data=data + ) + data2 = local_cache.get_remote_chunk( + url='dummy', + offset=0, + size=len(data) + ) + assert data == data2 + data_too_large = b'x' * (1000 * 1000 * 900) + with pytest.raises(ValueError): + local_cache.put_remote_chunk( + url='dummy', + offset=0, + size=len(data_too_large), + data=data_too_large + ) + + if __name__ == "__main__": - test_remote_data_1() + test_put_local_cache() From 1535202ed6ee0c82f9e30ca253aa086e8e8afcc0 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Mon, 20 May 2024 23:32:10 -0400 Subject: [PATCH 2/2] Raise exception for trying to set too large chunk in local cache --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 8 ++--- .../LindiReferenceFileSystemStore.py | 8 ++--- lindi/LindiRemfile/LindiRemfile.py | 30 ++++++++----------- lindi/LocalCache/LocalCache.py | 14 +++++---- lindi/__init__.py | 2 +- tests/test_local_cache.py | 2 +- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index afec442..e40fcef 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -20,7 +20,7 @@ from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache from ..LindiRemfile.LindiRemfile import LindiRemfile from .LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts @@ -346,15 +346,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str): buf = _read_bytes(self._file, byte_offset, byte_count) if self._local_cache is not None: assert self._url is not None, "Unexpected: url is None but local_cache is not None" - if byte_count < 1000 * 1000 * 900: + try: self._local_cache.put_remote_chunk( url=self._url, offset=byte_offset, size=byte_count, data=buf ) - else: - print(f"Warning: Not storing chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") + except ChunkTooLargeError: + print(f"Warning: Unable to store chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") return buf def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index 19b2e80..48b2801 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -4,7 +4,7 @@ import requests from zarr.storage import Store as ZarrStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache class LindiReferenceFileSystemStore(ZarrStore): @@ -141,10 +141,10 @@ def __getitem__(self, key: str): return x val = _read_bytes_from_url_or_path(url, offset, length) if self.local_cache is not None: - if length < 1000 * 1000 * 900: + try: self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) - else: - print(f'Warning: not caching chunk of size {length} om LindiReferenceFileSystemStore (key: {key})') + except ChunkTooLargeError: + print(f'Warning: unable to cache chunk of size {length} on LocalCache (key: {key})') return val else: # should not happen given checks in __init__, but self.rfs is mutable diff --git a/lindi/LindiRemfile/LindiRemfile.py b/lindi/LindiRemfile/LindiRemfile.py index af11f60..f19eb95 100644 --- a/lindi/LindiRemfile/LindiRemfile.py +++ b/lindi/LindiRemfile/LindiRemfile.py @@ -218,15 +218,12 @@ def _load_chunk(self, chunk_index: int) -> bytes: self._memory_chunks[chunk_index] = x if self._local_cache is not None: size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size) - if size < 1000 * 1000 * 900: - self._local_cache.put_remote_chunk( - url=self._url, - offset=chunk_index * self._min_chunk_size, - size=size, - data=x - ) - else: - raise Exception(f'Unexpected large size for chunk when caching: {size}') + self._local_cache.put_remote_chunk( + url=self._url, + offset=chunk_index * self._min_chunk_size, + size=size, + data=x + ) self._memory_chunk_indices.append(chunk_index) else: for i in range(self._smart_loader_chunk_sequence_length): @@ -242,15 +239,12 @@ def _load_chunk(self, chunk_index: int) -> bytes: data = x[i * self._min_chunk_size: (i + 1) * self._min_chunk_size] if len(data) != size: raise ValueError(f'Unexpected: len(data) != size: {len(data)} != {size}') - if size < 1000 * 1000 * 900: - self._local_cache.put_remote_chunk( - url=self._url, - offset=(chunk_index + i) * self._min_chunk_size, - size=size, - data=data - ) - else: - raise Exception(f'Unexpected large size for chunk when caching: {size}') + self._local_cache.put_remote_chunk( + url=self._url, + offset=(chunk_index + i) * self._min_chunk_size, + size=size, + data=data + ) self._smart_loader_last_chunk_index_accessed = ( chunk_index + self._smart_loader_chunk_sequence_length - 1 ) diff --git a/lindi/LocalCache/LocalCache.py b/lindi/LocalCache/LocalCache.py index 9b65586..f675f30 100644 --- a/lindi/LocalCache/LocalCache.py +++ b/lindi/LocalCache/LocalCache.py @@ -18,14 +18,13 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): if len(data) != size: raise ValueError("data size does not match size") - if size >= 1000 * 1000 * 900: - # This is a sqlite limitation/configuration - # https://www.sqlite.org/limits.html - # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine - raise ValueError("Cannot store blobs larger than 900 MB in LocalCache") self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data) +class ChunkTooLargeError(Exception): + pass + + class LocalCacheSQLiteClient: def __init__(self, *, db_fname: str): import sqlite3 @@ -63,6 +62,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): return row[0] def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): + if size >= 1000 * 1000 * 900: + # This is a sqlite limitation/configuration + # https://www.sqlite.org/limits.html + # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine + raise ChunkTooLargeError("Cannot store blobs larger than 900 MB in LocalCache") self._cursor.execute( """ INSERT OR REPLACE INTO remote_chunks (url, offset, size, data) VALUES (?, ?, ?, ?) diff --git a/lindi/__init__.py b/lindi/__init__.py index 628432e..f300066 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -1,5 +1,5 @@ from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401 from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401 -from .LocalCache.LocalCache import LocalCache # noqa: F401 +from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401 from .File.File import File # noqa: F401 diff --git a/tests/test_local_cache.py b/tests/test_local_cache.py index eda23ec..7fdb857 100644 --- a/tests/test_local_cache.py +++ b/tests/test_local_cache.py @@ -62,7 +62,7 @@ def test_put_local_cache(): ) assert data == data2 data_too_large = b'x' * (1000 * 1000 * 900) - with pytest.raises(ValueError): + with pytest.raises(lindi.ChunkTooLargeError): local_cache.put_remote_chunk( url='dummy', offset=0,