Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

LocalCache: limit allowed blob size #69

Merged
merged 3 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore
from ..LocalCache.LocalCache import LocalCache
from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache
from ..LindiRemfile.LindiRemfile import LindiRemfile
from .LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts

Expand Down Expand Up @@ -346,12 +346,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str):
buf = _read_bytes(self._file, byte_offset, byte_count)
if self._local_cache is not None:
assert self._url is not None, "Unexpected: url is None but local_cache is not None"
self._local_cache.put_remote_chunk(
url=self._url,
offset=byte_offset,
size=byte_count,
data=buf
)
try:
self._local_cache.put_remote_chunk(
url=self._url,
offset=byte_offset,
size=byte_count,
data=buf
)
except ChunkTooLargeError:
print(f"Warning: Unable to store chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})")
return buf

def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str):
Expand Down
7 changes: 5 additions & 2 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests
from zarr.storage import Store as ZarrStore

from ..LocalCache.LocalCache import LocalCache
from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache


class LindiReferenceFileSystemStore(ZarrStore):
Expand Down Expand Up @@ -141,7 +141,10 @@ def __getitem__(self, key: str):
return x
val = _read_bytes_from_url_or_path(url, offset, length)
if self.local_cache is not None:
self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val)
try:
self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val)
except ChunkTooLargeError:
print(f'Warning: unable to cache chunk of size {length} on LocalCache (key: {key})')
return val
else:
# should not happen given checks in __init__, but self.rfs is mutable
Expand Down
3 changes: 2 additions & 1 deletion lindi/LindiRemfile/LindiRemfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,10 +217,11 @@ def _load_chunk(self, chunk_index: int) -> bytes:
if self._local_cache is None:
self._memory_chunks[chunk_index] = x
if self._local_cache is not None:
size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size)
self._local_cache.put_remote_chunk(
url=self._url,
offset=chunk_index * self._min_chunk_size,
size=min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size),
size=size,
data=x
)
self._memory_chunk_indices.append(chunk_index)
Expand Down
9 changes: 9 additions & 0 deletions lindi/LocalCache/LocalCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes):
self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data)


class ChunkTooLargeError(Exception):
    """Raised when a blob is too large to be stored in the LocalCache SQLite database."""


class LocalCacheSQLiteClient:
def __init__(self, *, db_fname: str):
import sqlite3
Expand Down Expand Up @@ -58,6 +62,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int):
return row[0]

def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes):
if size >= 1000 * 1000 * 900:
# This is a sqlite limitation/configuration
# https://www.sqlite.org/limits.html
# For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine
raise ChunkTooLargeError("Cannot store blobs larger than 900 MB in LocalCache")
self._cursor.execute(
"""
INSERT OR REPLACE INTO remote_chunks (url, offset, size, data) VALUES (?, ?, ?, ?)
Expand Down
2 changes: 1 addition & 1 deletion lindi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401
from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401
from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401
from .LocalCache.LocalCache import LocalCache # noqa: F401
from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401
from .File.File import File # noqa: F401
28 changes: 27 additions & 1 deletion tests/test_local_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,31 @@ def test_remote_data_1():
assert elapsed_1 < elapsed_0 * 0.3 # type: ignore


def test_put_local_cache():
    """Round-trip a near-limit blob through LocalCache, then verify that a
    blob at the size limit is rejected with ChunkTooLargeError."""
    with tempfile.TemporaryDirectory() as tmpdir:
        cache = lindi.LocalCache(cache_dir=tmpdir + '/local_cache')
        # Largest accepted payload: one byte under the 900 MB sqlite blob limit.
        ok_blob = b'x' * (1000 * 1000 * 900 - 1)
        cache.put_remote_chunk(url='dummy', offset=0, size=len(ok_blob), data=ok_blob)
        fetched = cache.get_remote_chunk(url='dummy', offset=0, size=len(ok_blob))
        assert fetched == ok_blob
        # A payload of exactly 900 MB must be refused by the cache.
        rejected_blob = b'x' * (1000 * 1000 * 900)
        with pytest.raises(lindi.ChunkTooLargeError):
            cache.put_remote_chunk(
                url='dummy',
                offset=0,
                size=len(rejected_blob),
                data=rejected_blob
            )


if __name__ == "__main__":
    # Allow running this test module directly as a script.
    for _test_fn in (test_remote_data_1, test_put_local_cache):
        _test_fn()