diff --git a/.gitignore b/.gitignore index 9a71519..c55f5c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -*.lindi.json +*.lindi.json* *.nwb .coverage diff --git a/lindi/File/File.py b/lindi/File/File.py new file mode 100644 index 0000000..59e0f4c --- /dev/null +++ b/lindi/File/File.py @@ -0,0 +1,34 @@ +from typing import Literal +import os +import h5py +from ..LindiH5pyFile.LindiH5pyFile import LindiH5pyFile +from ..LindiStagingStore.StagingArea import StagingArea +from ..LocalCache.LocalCache import LocalCache + + +class File(h5py.File): + """ + A drop-in replacement for h5py.File that is either a lindi.LindiH5pyFile or + h5py.File depending on whether the file name ends with .lindi.json or not. + """ + def __new__(cls, name, mode: Literal['r', 'r+', 'w', 'w-', 'x', 'a'] = 'r', **kwds): + if isinstance(name, str) and name.endswith('.lindi.json'): + # should we raise exceptions on select unsupported kwds? or just go with the flow? + if mode != 'r': + staging_area = StagingArea.create(dir=name + '.d') + else: + staging_area = None + local_cache_dir = os.environ.get('LINDI_LOCAL_CACHE_DIR', None) + if local_cache_dir is not None: + local_cache = LocalCache(cache_dir=local_cache_dir) + else: + local_cache = None + + return LindiH5pyFile.from_lindi_file( + name, + mode=mode, + staging_area=staging_area, + local_cache=local_cache + ) + else: + return h5py.File(name, mode=mode, **kwds) diff --git a/lindi/File/__init__.py b/lindi/File/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 82373eb..99bd1ac 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -403,7 +403,12 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): ) if h5_item.chunks is not None: # Get the byte range in the file for the chunk. - byte_offset, byte_count = _get_chunk_byte_range(h5_item, chunk_coords) + try: + byte_offset, byte_count = _get_chunk_byte_range(h5_item, chunk_coords) + except Exception as e: + raise Exception( + f"Error getting byte range for chunk {key_parent}/{key_name}. Shape: {h5_item.shape}, Chunks: {h5_item.chunks}, Chunk coords: {chunk_coords}: {e}" + ) else: # In this case (contiguous dataset), we need to check that the chunk # coordinates are (0, 0, 0, ...) diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 780b143..39c29cb 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -52,6 +52,9 @@ def from_lindi_file(url_or_path: str, *, mode: LindiFileMode = "r", staging_area For a description of parameters, see from_reference_file_system(). """ + if local_file_path is None: + if not url_or_path.startswith("http://") and not url_or_path.startswith("https://"): + local_file_path = url_or_path return LindiH5pyFile.from_reference_file_system(url_or_path, mode=mode, staging_area=staging_area, local_cache=local_cache, local_file_path=local_file_path) @staticmethod @@ -166,7 +169,7 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo store = LindiReferenceFileSystemStore(rfs, local_cache=local_cache) if staging_area: store = LindiStagingStore(base_store=store, staging_area=staging_area) - return LindiH5pyFile.from_zarr_store(store, mode=mode, local_file_path=local_file_path) + return LindiH5pyFile.from_zarr_store(store, mode=mode, local_file_path=local_file_path, local_cache=local_cache) else: raise Exception(f"Unhandled type for rfs: {type(rfs)}") @@ -279,7 +282,7 @@ def upload( _write_rfs_to_file(rfs=rfs, output_file_name=rfs_fname) return on_upload_main(rfs_fname) - def write_lindi_file(self, filename: str): + def write_lindi_file(self, filename: str, *, generation_metadata: Union[dict, None] = None): """ Write the reference file system to a .lindi.json file. @@ -287,10 +290,16 @@ def write_lindi_file(self, filename: str): ---------- filename : str The filename to write to. It must end with '.lindi.json'. + generation_metadata : Union[dict, None], optional + The optional generation metadata to include in the reference file + system, by default None. This information dict is simply set to the + 'generationMetadata' key in the reference file system. """ if not filename.endswith(".lindi.json"): raise Exception("Filename must end with '.lindi.json'") rfs = self.to_reference_file_system() + if generation_metadata is not None: + rfs['generationMetadata'] = generation_metadata _write_rfs_to_file(rfs=rfs, output_file_name=filename) @property diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index 825e162..8fea9f8 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -49,8 +49,7 @@ class LindiReferenceFileSystemStore(ZarrStore): We also support the use of templates as in fsspec, but do not support the full jinja2 templating. There may be an optional "templates" key in the - dictionary, which is a dictionary of template strings. For example, - { + dictionary, which is a dictionary of template strings. For example, { "templates": {"u1": "https://some/url", "u2": "https://some/other/url"}, "refs": { ... "/some/key/0": [ @@ -58,13 +57,16 @@ class LindiReferenceFileSystemStore(ZarrStore): ], ... } - } - In this case, the "{{u1}}" will be replaced with the value of the "u1" + } In this case, the "{{u1}}" will be replaced with the value of the "u1" template string. + Optionally, the reference file system may contain a "generationMetadata" + key, which is a dictionary of metadata about the generation of the reference + file system. This metadata is not used by this class, but could be by other + software. See LindiH5pyFile.write_lindi_file(...) + It is okay for rfs to be modified outside of this class, and the changes - will be reflected immediately in the store. This can be used by experimental - tools such as lindi-cloud. + will be reflected immediately in the store. """ def __init__(self, rfs: dict, *, mode: Literal["r", "r+"] = "r+", local_cache: Union[LocalCache, None] = None): """ diff --git a/lindi/LindiStagingStore/StagingArea.py b/lindi/LindiStagingStore/StagingArea.py index b9f41d3..460bc9b 100644 --- a/lindi/LindiStagingStore/StagingArea.py +++ b/lindi/LindiStagingStore/StagingArea.py @@ -1,3 +1,4 @@ +from typing import Union import os import random import string @@ -21,17 +22,27 @@ def __init__(self, *, _directory: str) -> None: self._directory = os.path.abspath(_directory) @staticmethod - def create(base_dir: str) -> 'StagingArea': + def create(*, base_dir: Union[str, None] = None, dir: Union[str, None] = None) -> 'StagingArea': """ - Create a new staging area. + Create a new staging area. Provide either `base_dir` or `dir`, but not + both. Parameters ---------- - base_dir : str - The base directory where the staging area will be created. The - staging directory will be a subdirectory of this directory. + base_dir : str or None + If provided, the base directory where the staging area will be + created. The staging directory will be a subdirectory of this + directory. + dir : str or None + If provided, the exact directory where the staging area will be + created. It is okay if this directory already exists. """ - dir = os.path.join(base_dir, _create_random_id()) + if base_dir is not None and dir is not None: + raise ValueError("Provide either base_dir or dir, but not both") + if base_dir is not None: + dir = os.path.join(base_dir, _create_random_id()) + if dir is None: + raise ValueError("Provide either base_dir or dir") return StagingArea(_directory=dir) def cleanup(self) -> None: diff --git a/lindi/__init__.py b/lindi/__init__.py index f50d436..628432e 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -2,3 +2,4 @@ from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401 from .LocalCache.LocalCache import LocalCache # noqa: F401 +from .File.File import File # noqa: F401 diff --git a/pyproject.toml b/pyproject.toml index 842e667..e7449e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "lindi" -version = "0.3.0" +version = "0.3.1" description = "" authors = [ "Jeremy Magland ", diff --git a/tests/test_lindi_file.py b/tests/test_lindi_file.py new file mode 100644 index 0000000..720579a --- /dev/null +++ b/tests/test_lindi_file.py @@ -0,0 +1,23 @@ +import tempfile +import numpy as np +import h5py +import lindi + + +def test_lindi_file(): + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/test.lindi.json' + with lindi.File(fname, 'w') as f: + f.create_dataset('data', data=np.arange(500000, dtype=np.uint32), chunks=(100000,)) + + with lindi.File(fname, 'r') as f: + ds = f['data'] + assert isinstance(ds, h5py.Dataset) + assert ds.shape == (500000,) + assert ds.chunks == (100000,) + assert ds.dtype == np.uint32 + assert np.all(ds[:] == np.arange(500000, dtype=np.uint32)) + + +if __name__ == '__main__': + test_lindi_file() diff --git a/tests/test_staging_area.py b/tests/test_staging_area.py index fbe4758..763cb45 100644 --- a/tests/test_staging_area.py +++ b/tests/test_staging_area.py @@ -7,7 +7,7 @@ def test_staging_area(): with tempfile.TemporaryDirectory() as tmpdir: - staging_area = lindi.StagingArea.create(tmpdir + '/staging_area') + staging_area = lindi.StagingArea.create(base_dir=tmpdir + '/staging_area') client = lindi.LindiH5pyFile.from_reference_file_system(None, mode='r+', staging_area=staging_area) X = np.random.randn(1000, 1000).astype(np.float32) client.create_dataset('large_array', data=X, chunks=(400, 400))