
Commit

Merge pull request #69 from NeurodataWithoutBorders/local-cache-blob-size

LocalCache: limit allowed blob size
magland authored May 21, 2024
2 parents 797869d + 1535202 commit 197a388
Showing 6 changed files with 54 additions and 12 deletions.
17 changes: 10 additions & 7 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -20,7 +20,7 @@
 from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
 from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
 from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore
-from ..LocalCache.LocalCache import LocalCache
+from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache
 from ..LindiRemfile.LindiRemfile import LindiRemfile
 from .LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts

@@ -346,12 +346,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str):
         buf = _read_bytes(self._file, byte_offset, byte_count)
         if self._local_cache is not None:
             assert self._url is not None, "Unexpected: url is None but local_cache is not None"
-            self._local_cache.put_remote_chunk(
-                url=self._url,
-                offset=byte_offset,
-                size=byte_count,
-                data=buf
-            )
+            try:
+                self._local_cache.put_remote_chunk(
+                    url=self._url,
+                    offset=byte_offset,
+                    size=byte_count,
+                    data=buf
+                )
+            except ChunkTooLargeError:
+                print(f"Warning: Unable to store chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})")
         return buf

     def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str):
7 changes: 5 additions & 2 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
@@ -4,7 +4,7 @@
 import requests
 from zarr.storage import Store as ZarrStore

-from ..LocalCache.LocalCache import LocalCache
+from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache


 class LindiReferenceFileSystemStore(ZarrStore):
@@ -141,7 +141,10 @@ def __getitem__(self, key: str):
                     return x
             val = _read_bytes_from_url_or_path(url, offset, length)
             if self.local_cache is not None:
-                self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val)
+                try:
+                    self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val)
+                except ChunkTooLargeError:
+                    print(f'Warning: unable to cache chunk of size {length} on LocalCache (key: {key})')
             return val
         else:
             # should not happen given checks in __init__, but self.rfs is mutable
3 changes: 2 additions & 1 deletion lindi/LindiRemfile/LindiRemfile.py
@@ -217,10 +217,11 @@ def _load_chunk(self, chunk_index: int) -> bytes:
         if self._local_cache is None:
             self._memory_chunks[chunk_index] = x
         if self._local_cache is not None:
+            size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size)
             self._local_cache.put_remote_chunk(
                 url=self._url,
                 offset=chunk_index * self._min_chunk_size,
-                size=min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size),
+                size=size,
                 data=x
             )
         self._memory_chunk_indices.append(chunk_index)
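For reference, the size expression factored into a variable above is the standard fixed-stride chunking computation: every chunk starts at chunk_index * min_chunk_size, and the final chunk is truncated to the bytes remaining in the file. A minimal standalone sketch (the helper name is hypothetical, not part of LindiRemfile):

def chunk_byte_range(chunk_index: int, min_chunk_size: int, file_length: int):
    # Chunks are laid out at a fixed stride; the last chunk is truncated
    # so that offset + size never runs past the end of the file.
    offset = chunk_index * min_chunk_size
    size = min(min_chunk_size, file_length - offset)
    return offset, size

# With 100 MB chunks over a 250 MB file, the third chunk holds the final 50 MB:
# chunk_byte_range(2, 100_000_000, 250_000_000) == (200_000_000, 50_000_000)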
9 changes: 9 additions & 0 deletions lindi/LocalCache/LocalCache.py
@@ -21,6 +21,10 @@ def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes):
         self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data)


+class ChunkTooLargeError(Exception):
+    pass
+
+
 class LocalCacheSQLiteClient:
     def __init__(self, *, db_fname: str):
         import sqlite3
@@ -58,6 +62,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int):
         return row[0]

     def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes):
+        if size >= 1000 * 1000 * 900:
+            # This is a sqlite limitation/configuration
+            # https://www.sqlite.org/limits.html
+            # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine
+            raise ChunkTooLargeError("Cannot store blobs larger than 900 MB in LocalCache")
         self._cursor.execute(
             """
             INSERT OR REPLACE INTO remote_chunks (url, offset, size, data) VALUES (?, ?, ?, ?)
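For context on the 900 MB cap: SQLite's maximum string/blob length (SQLITE_MAX_LENGTH) defaults to 1,000,000,000 bytes, and an insert that exceeds it fails with SQLITE_TOOBIG (https://www.sqlite.org/limits.html). A minimal sketch of that failure mode using Python's sqlite3 module; it lowers the per-connection limit to 1 MB via Connection.setlimit (Python 3.11+) so the demo doesn't have to allocate a ~1 GB blob, and the table name is illustrative:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE blobs (data BLOB)")

# Lower the runtime length limit from the ~1 GB default to 1 MB
# so an over-limit insert can be demonstrated cheaply.
conn.setlimit(sqlite3.SQLITE_LIMIT_LENGTH, 1000 * 1000)

try:
    conn.execute("INSERT INTO blobs VALUES (?)", (b"x" * (1000 * 1000 + 1),))
except sqlite3.Error as e:
    print("rejected:", e)  # SQLite refuses blobs over the configured limit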
2 changes: 1 addition & 1 deletion lindi/__init__.py
@@ -1,5 +1,5 @@
 from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401
 from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401
 from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401
-from .LocalCache.LocalCache import LocalCache # noqa: F401
+from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401
 from .File.File import File # noqa: F401
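With ChunkTooLargeError now exported at the package level, downstream code can apply the same best-effort pattern the stores use above: attempt the cache write and continue uncached if it fails. A short sketch against the public API (the cache directory and URL are placeholders):

import tempfile
import lindi

with tempfile.TemporaryDirectory() as tmpdir:
    cache = lindi.LocalCache(cache_dir=tmpdir + '/local_cache')
    data = b'x' * 1000
    try:
        # Caching is best-effort: an oversized blob is skipped, not fatal
        cache.put_remote_chunk(url='https://example.org/data', offset=0, size=len(data), data=data)
    except lindi.ChunkTooLargeError:
        print('Warning: chunk too large for LocalCache; continuing without caching')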
28 changes: 27 additions & 1 deletion tests/test_local_cache.py
@@ -45,5 +45,31 @@ def test_remote_data_1():
     assert elapsed_1 < elapsed_0 * 0.3  # type: ignore


+def test_put_local_cache():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        local_cache = lindi.LocalCache(cache_dir=tmpdir + '/local_cache')
+        data = b'x' * (1000 * 1000 * 900 - 1)
+        local_cache.put_remote_chunk(
+            url='dummy',
+            offset=0,
+            size=len(data),
+            data=data
+        )
+        data2 = local_cache.get_remote_chunk(
+            url='dummy',
+            offset=0,
+            size=len(data)
+        )
+        assert data == data2
+        data_too_large = b'x' * (1000 * 1000 * 900)
+        with pytest.raises(lindi.ChunkTooLargeError):
+            local_cache.put_remote_chunk(
+                url='dummy',
+                offset=0,
+                size=len(data_too_large),
+                data=data_too_large
+            )
+
+
 if __name__ == "__main__":
-    test_remote_data_1()
+    test_remote_data_1()
+    test_put_local_cache()