diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 780bd54..79bc35f 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -13,6 +13,8 @@ jobs: - name: Install packages needed for tests # pyright 1.1.336 can produce annoying errors run: pip install pytest pytest-cov pyright==1.1.335 + - name: Install pynwb needed for tests + run: pip install pynwb - name: Run pyright run: cd lindi && pyright - name: Run tests and collect coverage diff --git a/.vscode/tasks/quick_test.sh b/.vscode/tasks/quick_test.sh index 6d8a84a..5e8873e 100755 --- a/.vscode/tasks/quick_test.sh +++ b/.vscode/tasks/quick_test.sh @@ -2,6 +2,10 @@ set -ex # black --check . + +cd lindi flake8 . # pyright -pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow" tests/ +cd .. + +pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow and not network" tests/ diff --git a/.vscode/tasks/test.sh b/.vscode/tasks/test.sh index 0e09ca0..c87184e 100755 --- a/.vscode/tasks/test.sh +++ b/.vscode/tasks/test.sh @@ -2,6 +2,10 @@ set -ex # black --check . + +cd lindi flake8 . pyright +cd .. + pytest --cov=lindi --cov-report=xml --cov-report=term tests/ diff --git a/README.md b/README.md index 4fa55da..dfc7476 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,15 @@ # LINDI - Linked Data Interface -:warning: Please note, LINDI is currently under development and not yet operational. +:warning: Please note, LINDI is currently under development and should not yet be used in practice. -LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner that minimizes the need for excessive data downloads. Our goal is to enable composition of NWB files by integrating data from multiple sources. +LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner, especially when dealing with large datasets on remote servers. The goal is to enable composition of NWB files by integrating data from multiple sources without the need to copy or move large datasets. -Key features include: +LINDI features include: -- A read-only [Zarr storage backend](https://zarr.readthedocs.io/en/stable/) for HDF5 files, including NWB. In other words, a Zarr wrapper for HDF5. *(in progress)* -- Generation of a relatively small JSON file representing the NWB Zarr store, inspired by [kerchunk](https://github.com/fsspec/kerchunk). -- An [h5py](https://www.h5py.org/)-like interface for accessing NWB Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/). In other words an h5py-like wrapper for Zarr. This feature is only partially functional. -- The ability to assemble composite NWB files that draw from multiple sources. Also not yet functional. - -Why a Zarr wrapper of HDF5, and then an h5py-like wrapper of Zarr? It's because we need a Zarr representation of existing files HDF5 NWB files, but we want to still leverage tools that utilize h5py. +- A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets. +- A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore). This involves pointers to remote files for remote data chunks. +- A function for generating a reference file system .zarr.json file from a Zarr store. This is inspired by [kerchunk](https://github.com/fsspec/kerchunk). 
+- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/). This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/), [remfile](https://github.com/magland/remfile), [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), and [numcodecs](https://numcodecs.readthedocs.io/en/stable/). @@ -27,36 +25,61 @@ pip install -e . ## Example usage ```python -import lindi -import pynwb +# examples/example1.py -# Here's an example DANDI NWB file - -# https://neurosift.app/?p=/nwb&dandisetId=000717&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/ -h5_url = "https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/" +import json +import pynwb +import lindi +# Define the URL for a remote NWB file +h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" -# Create the read-only Zarr store for the h5 file -store = lindi.LindiH5Store.from_file(h5_url) +# Create a read-only Zarr store as a wrapper for the h5 file +store = lindi.LindiH5ZarrStore.from_file(h5_url) -# Create the reference file system object +# Generate a reference file system rfs = store.to_reference_file_system() -# For reference, save it to a file +# Save it to a file for later use with open("example.zarr.json", "w") as f: json.dump(rfs, f, indent=2) -# Create the h5py-like client from the reference file system -client = lindi.LindiClient.from_reference_file_system(rfs) +# Create an h5py-like client from the reference file system +client = lindi.LindiH5pyFile.from_reference_file_system(rfs) + +# Open using pynwb +with pynwb.NWBHDF5IO(file=client, mode="r") as io: + nwbfile = io.read() + print(nwbfile) +``` + +Or if you already have a .zarr.json file prepared (loading is much faster) + +```python +# examples/example2.py + +import pynwb +import lindi -# Try to read using pynwb -# (This part does not work yet) +# Define the URL for a remote .zarr.json file +url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json' + +# Load the h5py-like client from the reference file system +client = lindi.LindiH5pyFile.from_reference_file_system(url) + +# Open using pynwb with pynwb.NWBHDF5IO(file=client, mode="r") as io: nwbfile = io.read() print(nwbfile) ``` -The idea is that you would then be able to augment, mix, and merge the reference file system JSON files, creating NWB files that can involve chunk data from multiple sources. +## Mixing and matching data from multiple sources + +Once we have NWB files represented by relatively small reference file systems (e.g., .zarr.json files), we can begin to mix and match data from multiple sources. More on this to come. 
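As a rough sketch of the mechanics (not yet a supported or documented workflow), the reference file systems produced by `to_reference_file_system()` are plain JSON-serializable dicts, so references from one store can be grafted into another without copying any chunk data. The file names and the `imported` group name below are hypothetical:

```python
import json
import lindi

# Load two previously generated reference file systems (hypothetical file names)
with open("session1.zarr.json") as f:
    rfs1 = json.load(f)
with open("session2.zarr.json") as f:
    rfs2 = json.load(f)

# Graft the second file's "processing" group into the first under a new
# (hypothetical) group name. Each reference is either inline data or a
# [url, byte_offset, byte_count] pointer, so no large data is moved.
rfs1["refs"]["imported/.zgroup"] = json.dumps({"zarr_format": 2})
for key, value in rfs2["refs"].items():
    if key.startswith("processing/"):
        rfs1["refs"]["imported/" + key] = value

# The combined reference file system can be loaded like any other
client = lindi.LindiH5pyFile.from_reference_file_system(rfs1)
```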
+ +## For developers + +[Special Zarr annotations used by LINDI](docs/special_zarr_annotations.md) ## License diff --git a/devel/old_tests/old_tests.py b/devel/old_tests/old_tests.py deleted file mode 100644 index 6ea621f..0000000 --- a/devel/old_tests/old_tests.py +++ /dev/null @@ -1,117 +0,0 @@ -import json -import tempfile -import numpy as np -import h5py -import zarr -import kerchunk.hdf # type: ignore -from lindi import LindiH5Store -from fsspec.implementations.reference import ReferenceFileSystem - - -def test_scalar_dataset(): - for val in ["abc", b"abc", 1, 3.6]: - print(f"Testing scalar {val} of type {type(val)}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=val) - zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename) - val_kerchunk = zarr_kerchunk["X"][0] - zarr_lindi, store_lindi = _get_lindi_zarr(filename) - try: - val_lindi = zarr_lindi["X"][0] - if val_kerchunk != val: - print(f"WARNING: val_kerchunk={val_kerchunk} != val={val}") - if val_lindi != val: - print(f"WARNING: val_lindi={val_lindi} != val={val}") - if type(val_kerchunk) is not type(val): - print( - "WARNING: type mismatch for kerchunk:", - type(val), - type(val_kerchunk), - ) - if type(val_lindi) is not type(val): - print("WARNING: type mismatch for lindi:", type(val), type(val_lindi)) - print("") - x = store_lindi.to_reference_file_system() # noqa: F841 - finally: - store_lindi.close() - - -def test_numpy_array(): - print("Testing numpy array") - X1 = (np.arange(60).reshape(3, 20), (3, 7)) - X2 = (np.arange(60).reshape(3, 20), None) - for array, chunks in [X1, X2]: - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=array, chunks=chunks) - zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename) - array_kerchunk = zarr_kerchunk["X"][:] - assert isinstance(array_kerchunk, np.ndarray) - zarr_lindi, store_lindi = _get_lindi_zarr(filename) - array_lindi = zarr_lindi["X"][:] - assert isinstance(array_lindi, np.ndarray) - if not np.array_equal(array_kerchunk, array): - print("WARNING: array_kerchunk does not match array") - print(array_kerchunk) - print(array) - if not np.array_equal(array_lindi, array): - print("WARNING: array_lindi does not match array") - print(array_lindi) - print(array) - x = store_lindi.to_reference_file_system() # noqa: F841 - - -def test_numpy_array_of_strings(): - print("Testing numpy array of strings") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=["abc", "def", "ghi"]) - zarr_kerchunk, store_kerchunk = _get_kerchunk_zarr(filename) - array_kerchunk = zarr_kerchunk["X"][:] - assert isinstance(array_kerchunk, np.ndarray) - zarr_lindi, store_lindi = _get_lindi_zarr(filename) - array_lindi = zarr_lindi["X"][:] - assert isinstance(array_lindi, np.ndarray) - if not np.array_equal(array_kerchunk, ["abc", "def", "ghi"]): - print("WARNING: array_kerchunk does not match array") - print(array_kerchunk) - print(["abc", "def", "ghi"]) - if not np.array_equal(array_lindi, ["abc", "def", "ghi"]): - print("WARNING: array_lindi does not match array") - print(array_lindi) - print(["abc", "def", "ghi"]) - x = store_lindi.to_reference_file_system() # noqa: F841 - - -def _get_lindi_zarr(filename): - store = LindiH5Store.from_file(filename, url='.') # use url='.' 
so that a reference file system can be created - root = zarr.open(store) - return root, store - - -def _get_kerchunk_zarr(filename): - with h5py.File(filename, "r") as f: - h5chunks = kerchunk.hdf.SingleHdf5ToZarr( - f, - url=filename, - hdmf_mode=True, - num_chunks_per_dataset_threshold=1000, - max_num_items=1000, - ) - a = h5chunks.translate() - with open("test_example.zarr.json", "w") as store: - json.dump(a, store, indent=2) - fs = ReferenceFileSystem(a) - store0 = fs.get_mapper(root="/", check=False) - root = zarr.open(store0) - return root, store0 - - -if __name__ == "__main__": - test_scalar_dataset() - test_numpy_array() - test_numpy_array_of_strings() diff --git a/devel/old_tests/test_lindi_client.py b/devel/old_tests/test_lindi_client.py deleted file mode 100644 index 2fa62de..0000000 --- a/devel/old_tests/test_lindi_client.py +++ /dev/null @@ -1,36 +0,0 @@ -from lindi import LindiClient, LindiGroup, LindiDataset - - -def test_lindi_client(): - client = LindiClient.from_file("example_0.zarr.json") - - for k, v in client.attrs.items(): - print(f"{k}: {v}") - - for k in client.keys(): - print(k) - - acquisition = client["acquisition"] - assert isinstance(acquisition, LindiGroup) - for k in acquisition.keys(): - print(k) - - aa = client["acquisition/ElectricalSeriesAp"] - assert isinstance(aa, LindiGroup) - x = aa["data"] - assert isinstance(x, LindiDataset) - - print(x.shape) - print(x[:5]) - - general = client["general"] - assert isinstance(general, LindiGroup) - for k in general.keys(): - a = general[k] - if isinstance(a, LindiDataset): - print(f"{k}: {a.shape}") - print(a[()]) - - -if __name__ == "__main__": - test_lindi_client() diff --git a/devel/old_tests/test_with_real_data.py b/devel/old_tests/test_with_real_data.py deleted file mode 100644 index 9de0291..0000000 --- a/devel/old_tests/test_with_real_data.py +++ /dev/null @@ -1,226 +0,0 @@ -import sys -import numpy as np -import zarr -import h5py -import remfile -from lindi import LindiH5Store - -examples = [] - -# example 0 -# This one seems to load properly -# https://neurosift.app/?p=/nwb&dandisetId=000717&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/ -examples.append( - { - "h5_url": "https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/", - } -) - -# example 1 -# https://neurosift.app/?p=/nwb&dandisetId=000776&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/ -# Exception: Not yet implemented (3): dataset /processing/CalciumActivity/CalciumSeriesSegmentation/Aligned_neuron_coordinates/voxel_mask with -# dtype [('x', ' 1 else 0 - do_compare(example_num) diff --git a/devel/old_tests/tests.py b/devel/old_tests/tests.py deleted file mode 100644 index 8359136..0000000 --- a/devel/old_tests/tests.py +++ /dev/null @@ -1,156 +0,0 @@ -import tempfile -import numpy as np -import h5py -from lindi import LindiH5Store, LindiClient, LindiGroup, LindiDataset - - -def test_scalar_dataset(): - for val in ["abc", b"abc", 1, 3.6]: - print(f"Testing scalar {val} of type {type(val)}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=val) - with LindiH5Store.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) - h5f = h5py.File(filename, 
"r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - if not _check_equal(X1[()], X2[()]): - print(f"WARNING: {X1} ({type(X1)}) != {X2} ({type(X2)})") - - -def _check_equal(a, b): - # allow comparison of bytes and strings - if isinstance(a, str): - a = a.encode() - if isinstance(b, str): - b = b.encode() - - if type(a) != type(b): # noqa: E721 - return False - - if isinstance(a, np.ndarray): - assert isinstance(b, np.ndarray) - return _check_arrays_equal(a, b) - - return a == b - - -def _check_arrays_equal(a: np.ndarray, b: np.ndarray): - # If it's an array of strings, we convert to an array of bytes - if a.dtype == object: - # need to modify all the entries - a = np.array([x.encode() if type(x) is str else x for x in a.ravel()]).reshape( - a.shape - ) - if b.dtype == object: - b = np.array([x.encode() if type(x) is str else x for x in b.ravel()]).reshape( - b.shape - ) - # if this is numeric data we need to use allclose so that we can handle NaNs - if np.issubdtype(a.dtype, np.number): - return np.allclose(a, b, equal_nan=True) - else: - return np.array_equal(a, b) - - -def test_numpy_array(): - X1 = ("1", np.arange(60).reshape(3, 20), (3, 7)) - X2 = ("2", np.arange(60).reshape(3, 20), None) - for label, array, chunks in [X1, X2]: - print(f"Testing numpy array {label}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=array, chunks=chunks) - with LindiH5Store.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) - h5f = h5py.File(filename, "r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - if not _check_equal(X1[:], X2[:]): - print("WARNING. Arrays are not equal") - print(X1[:]) - print(X2[:]) - - -def test_numpy_array_of_strings(): - print("Testing numpy array of strings") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=["abc", "def", "ghi"]) - h5f = h5py.File(filename, "r") - with LindiH5Store.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - if not _check_equal(X1[:], X2[:]): - print("WARNING. Arrays are not equal") - print(X1[:]) - print(X2[:]) - - -def test_attributes(): - print("Testing attributes") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["foo"] = "bar" - f["X"].attrs["baz"] = 3.14 - f["X"].attrs["qux"] = [1, 2, 3] - f["X"].attrs["quux"] = {"a": 1, "b": 2, "c": 3} - f["X"].attrs["corge"] = np.int32(5) - f.create_group("group") - f["group"].attrs["foo"] = "bar2" - f["group"].attrs["baz"] = 3.15 - h5f = h5py.File(filename, "r") - with LindiH5Store.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - if X1.attrs["foo"] != X2.attrs["foo"]: - print("WARNING. 
Attributes are not equal") - print(X1.attrs["foo"]) - print(X2.attrs["foo"]) - if X1.attrs["baz"] != X2.attrs["baz"]: - print("WARNING. Attributes are not equal") - print(X1.attrs["baz"]) - print(X2.attrs["baz"]) - group1 = h5f["group"] - assert isinstance(group1, h5py.Group) - group2 = client["group"] - assert isinstance(group2, LindiGroup) - if group1.attrs["foo"] != group2.attrs["foo"]: - print("WARNING. Attributes are not equal") - print(group1.attrs["foo"]) - print(group2.attrs["foo"]) - if group1.attrs["baz"] != group2.attrs["baz"]: - print("WARNING. Attributes are not equal") - print(group1.attrs["baz"]) - print(group2.attrs["baz"]) - - -if __name__ == "__main__": - test_scalar_dataset() - test_numpy_array() - test_numpy_array_of_strings() - test_attributes() diff --git a/docs/special_zarr_annotations.md b/docs/special_zarr_annotations.md new file mode 100644 index 0000000..7f64dc3 --- /dev/null +++ b/docs/special_zarr_annotations.md @@ -0,0 +1,53 @@ +# Special Zarr Annotations in LINDI + +LINDI defines a set of special Zarr annotations to correspond with HDF5 features that are not natively supported in Zarr. + +## Scalar Datasets + +### `_SCALAR = True` + +In HDF5, datasets can be scalar, but Zarr does not natively support scalar arrays. To overcome this limitation, LINDI represents scalar datasets as Zarr arrays with a shape of `(1,)` and marks them with the `_SCALAR = True` attribute. + +## Soft Links + +### `_SOFT_LINK = { 'path': '...' }` + +Soft links in HDF5 are pointers to other groups within the file. LINDI utilizes the `_SOFT_LINK` attribute on a Zarr group to represent this relationship. The `path` key within the attribute specifies the target group within the Zarr structure. Soft link groups in Zarr should be otherwise empty, serving only as a reference to another location in the dataset. + +Note that we do not currently support external links. + +## References + +```json +{ + "_REFERENCE": { + "source": ".", + "path": "...", + "object_id": "...", + "source_object_id": "...", + } +} +``` + +- `source`: Always `.` for now, indicating that the reference is to an object within the same Zarr. +- `path`: Path to the target object within the Zarr. +- `object_id`: The object_id attribute of the target object (for validation). +- `source_object_id`: The object_id attribute of the source object (for validation). + +The largely follows the [convention used by hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/storage.html#storing-object-references-in-attributes). + +HDF5 references can appear within both attributes and datasets. For attributes, the value of the attribute is a dict in the above form. For datasets, the value of an item within the dataset is a dict in the above form. + +**Note**: Region references are not supported at this time and are planned for future implementation. + +## Compound Data Types + +### `_COMPOUND_DTYPE: [['x', 'int32'], ['y', 'float64'], ...]` + +Zarr arrays can represent compound data types from HDF5 datasets. The `_COMPOUND_DTYPE` attribute on a Zarr array indicates this, listing each field's name and data type. The array data should be JSON encoded, aligning with the specified compound structure. The `h5py.Reference` type is also supported within these structures, enabling references within compound data types. 
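As a sketch, a one-dimensional compound dataset with fields `x`, `y`, and `weight` (the field names and dtypes here are only an example) would carry an annotation like the following, and each element of the array would be JSON encoded as a list in the same field order, e.g. `[12, 15, 0.75]`:

```json
{
  "_COMPOUND_DTYPE": [["x", "uint32"], ["y", "uint32"], ["weight", "float32"]]
}
```

A field holding an `h5py.Reference` is stored as a `_REFERENCE` object as described above.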
+ +## External Array Links + +### `_EXTERNAL_ARRAY_LINK = {'link_type': 'hdf5_dataset', 'url': '...', 'name': '...'}` + +For datasets with an extensive number of chunks such that inclusion in the Zarr or reference file system is impractical, LINDI uses the `_EXTERNAL_ARRAY_LINK` attribute on a Zarr array. This attribute points to an external HDF5 file, specifying the `url` for remote access (or local path) and the `name` of the target dataset within that file. When slicing that dataset, the `LindiH5pyClient` will handle data retrieval, leveraging `h5py` and `remfile` for remote access. \ No newline at end of file diff --git a/examples/_check_equal.py b/examples/_check_equal.py deleted file mode 100644 index 6cdf4c5..0000000 --- a/examples/_check_equal.py +++ /dev/null @@ -1,52 +0,0 @@ -import numpy as np - - -def _check_equal(a, b): - # allow comparison of bytes and strings - if isinstance(a, str): - a = a.encode() - if isinstance(b, str): - b = b.encode() - - # allow comparison of numpy scalars with python scalars - if np.issubdtype(type(a), np.floating): - a = float(a) - if np.issubdtype(type(b), np.floating): - b = float(b) - if np.issubdtype(type(a), np.integer): - a = int(a) - if np.issubdtype(type(b), np.integer): - b = int(b) - - # allow comparison of numpy arrays to python lists - if isinstance(a, list): - a = np.array(a) - if isinstance(b, list): - b = np.array(b) - - if type(a) != type(b): # noqa: E721 - return False - - if isinstance(a, np.ndarray): - assert isinstance(b, np.ndarray) - return _check_arrays_equal(a, b) - - return a == b - - -def _check_arrays_equal(a: np.ndarray, b: np.ndarray): - # If it's an array of strings, we convert to an array of bytes - if a.dtype == object: - # need to modify all the entries - a = np.array([x.encode() if type(x) is str else x for x in a.ravel()]).reshape( - a.shape - ) - if b.dtype == object: - b = np.array([x.encode() if type(x) is str else x for x in b.ravel()]).reshape( - b.shape - ) - # if this is numeric data we need to use allclose so that we can handle NaNs - if np.issubdtype(a.dtype, np.number): - return np.allclose(a, b, equal_nan=True) - else: - return np.array_equal(a, b) diff --git a/examples/example1.py b/examples/example1.py index b22de17..f5c1b6b 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -1,95 +1,24 @@ -import numpy as np -import h5py -import tempfile +import json +import pynwb import lindi -from lindi import LindiH5Store, LindiClient, LindiDataset -from _check_equal import _check_equal +# Define the URL for a remote NWB file +h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" -def example1(): - # In this example, we create a temporary hdf5 file, with some sample - # datasets, groups, and attributes. We then load that file using - # LindiH5Store which is a zarr storage backend providing read-only view of - # that hdf5 file. We then create a reference file system and use that to - # create a LindiClient, which mimics the h5py API. We then compare the - # datasets, groups, and attributes of the original hdf5 file with those of - # the LindiClient. 
+# Create a read-only Zarr store as a wrapper for the h5 file +store = lindi.LindiH5ZarrStore.from_file(h5_url) - with tempfile.TemporaryDirectory() as tmpdir: - print("Creating an example hdf5 file") - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.attrs['top_level_attr'] = "top_level_value" - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["foo"] = "bar" - f["X"].attrs["baz"] = 3.14 - f["X"].attrs["qux"] = [1, 2, 3] - f["X"].attrs["corge"] = np.int32(5) - f.create_dataset("scalar_dataset", data=42, dtype="int32") - f.create_group("group") - f["group"].attrs["foo"] = "bar2" - f["group"].attrs["baz"] = 3.15 - h5f = h5py.File(filename, "r") +# Generate a reference file system +rfs = store.to_reference_file_system() - print("Creating a LindiH5Store from the hdf5 file") - # We set url to filename so that the references can point to a local file - # but normally, this would be a remote URL - with LindiH5Store.from_file(filename, url=filename) as store: - print("Creating a reference file system from the LindiH5Store") - rfs_fname = f"{tmpdir}/example.zarr.json" - store.to_file(rfs_fname) +# Save it to a file for later use +with open("example.zarr.json", "w") as f: + json.dump(rfs, f, indent=2) - print("Creating a LindiClient from the reference file system") - client = LindiClient.from_file(rfs_fname) +# Create an h5py-like client from the reference file system +client = lindi.LindiH5pyFile.from_reference_file_system(rfs) - print("Comparing dataset: X") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - assert len(X1) == len(X2) - assert X1.shape == X2.shape - assert X1.dtype == X2.dtype - assert X1.size == X2.size - assert X1.nbytes == X2.nbytes - assert X1[0] == X2[0] - assert X1[1] == X2[1] - assert X1[2] == X2[2] - assert _check_equal(X1[0:2], X2[0:2]) - assert _check_equal(X1[1:], X2[1:]) - assert _check_equal(X1[:], X2[:]) - assert _check_equal(X1[...], X2[...]) - for k, v in X2.attrs.items(): - assert k in X1.attrs - assert _check_equal(v, X1.attrs[k]) - for k, v in X1.attrs.items(): - assert k in X2.attrs - assert _check_equal(v, X2.attrs[k]) - - print("Comparing scalar dataset: scalar_dataset") - scalar_dataset1 = h5f["scalar_dataset"] - assert isinstance(scalar_dataset1, h5py.Dataset) - scalar_dataset2 = client["scalar_dataset"] - assert isinstance(scalar_dataset2, LindiDataset) - assert scalar_dataset1[()] == scalar_dataset2[()] - - print("Comparing group: group") - G1 = h5f["group"] - G2 = client["group"] - for k, v in G1.attrs.items(): - if not isinstance(G2, lindi.LindiReference): - assert k in G2.attrs - assert _check_equal(v, G2.attrs[k]) - - print("Comparing root group") - for k, v in h5f.attrs.items(): - assert k in h5f.attrs - assert _check_equal(v, h5f.attrs[k]) - for k in client.keys(): - assert k in h5f.keys() - for k in h5f.keys(): - assert k in client.keys() - - -if __name__ == "__main__": - example1() +# Open using pynwb +with pynwb.NWBHDF5IO(file=client, mode="r") as io: + nwbfile = io.read() + print(nwbfile) diff --git a/examples/example2.py b/examples/example2.py new file mode 100644 index 0000000..915085e --- /dev/null +++ b/examples/example2.py @@ -0,0 +1,13 @@ +import pynwb +import lindi + +# Define the URL for a remote .zarr.json file +url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json' + +# Load the h5py-like client from the reference file system +client = lindi.LindiH5pyFile.from_reference_file_system(url) + 
+# Open using pynwb +with pynwb.NWBHDF5IO(file=client, mode="r") as io: + nwbfile = io.read() + print(nwbfile) diff --git a/examples/try_pynwb.py b/examples/try_pynwb.py deleted file mode 100644 index 610b66c..0000000 --- a/examples/try_pynwb.py +++ /dev/null @@ -1,31 +0,0 @@ -import json -import lindi -import pynwb - - -# https://neurosift.app/?p=/nwb&dandisetId=000717&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/ -h5_url = "https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/" - - -def try_pynwb(): - # Create the read-only store for the h5 file - store = lindi.LindiH5Store.from_file(h5_url) - - # Create the reference file system object - rfs = store.to_reference_file_system() - - # For reference, save it to a file - with open("example.zarr.json", "w") as f: - json.dump(rfs, f, indent=2) - - # Create the client from the reference file system - client = lindi.LindiClient.from_reference_file_system(rfs) - - # Try to read using pynwb - with pynwb.NWBHDF5IO(file=client, mode="r") as io: - nwbfile = io.read() - print(nwbfile) - - -if __name__ == "__main__": - try_pynwb() diff --git a/lindi/LindiClient/LindiAttributes.py b/lindi/LindiClient/LindiAttributes.py deleted file mode 100644 index 0b9682d..0000000 --- a/lindi/LindiClient/LindiAttributes.py +++ /dev/null @@ -1,34 +0,0 @@ -from typing import Union -import zarr - - -class LindiAttributes: - def __init__(self, *, _object: Union[zarr.Group, zarr.Array]): - self._object = _object - - def get(self, key, default=None): - return self._object.attrs.get(key, default) - - def __getitem__(self, key): - return self._object.attrs[key] - - def __setitem__(self, key, value): - raise KeyError("Cannot set attributes on read-only object") - - def __delitem__(self, key): - raise KeyError("Cannot delete attributes on read-only object") - - def __iter__(self): - return iter(self._object.attrs) - - def items(self): - return self._object.attrs.items() - - def __len__(self): - return len(self._object.attrs) - - def __repr__(self): - return repr(self._object.attrs) - - def __str__(self): - return str(self._object.attrs) diff --git a/lindi/LindiClient/LindiClient.py b/lindi/LindiClient/LindiClient.py deleted file mode 100644 index 152105f..0000000 --- a/lindi/LindiClient/LindiClient.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Union -import json -import tempfile -from typing import Literal -from fsspec import FSMap -import zarr -import urllib.request -from fsspec.implementations.reference import ReferenceFileSystem -from zarr.storage import Store -from .LindiGroup import LindiGroup - - -class LindiClient(LindiGroup): - def __init__( - self, - *, - _zarr_group: zarr.Group, - ) -> None: - self._zarr_group = _zarr_group - super().__init__(_zarr_group=self._zarr_group) - - @staticmethod - def from_zarr_store(zarr_store: Union[Store, FSMap]) -> "LindiClient": - zarr_group = zarr.open(store=zarr_store, mode="r") - assert isinstance(zarr_group, zarr.Group) - return LindiClient.from_zarr_group(zarr_group) - - @staticmethod - def from_file( - json_file: str, file_type: Literal["zarr.json"] = "zarr.json" - ) -> "LindiClient": - if file_type == "zarr.json": - if json_file.startswith("http") or json_file.startswith("https"): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/temp.zarr.json" - _download_file(json_file, filename) - with open(filename, "r") as f: - data = json.load(f) - return LindiClient.from_reference_file_system(data) - else: - 
with open(json_file, "r") as f: - data = json.load(f) - return LindiClient.from_reference_file_system(data) - else: - raise ValueError(f"Unknown file_type: {file_type}") - - @staticmethod - def from_zarr_group(zarr_group: zarr.Group) -> "LindiClient": - return LindiClient(_zarr_group=zarr_group) - - @staticmethod - def from_reference_file_system(data: dict) -> "LindiClient": - fs = ReferenceFileSystem(data).get_mapper(root="/") - return LindiClient.from_zarr_store(fs) - - -def _download_file(url: str, filename: str) -> None: - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" - } - req = urllib.request.Request(url, headers=headers) - with urllib.request.urlopen(req) as response: - with open(filename, "wb") as f: - f.write(response.read()) diff --git a/lindi/LindiClient/LindiDataset.py b/lindi/LindiClient/LindiDataset.py deleted file mode 100644 index 019c90e..0000000 --- a/lindi/LindiClient/LindiDataset.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Dict -import zarr -import h5py -import remfile -from .LindiAttributes import LindiAttributes - - -class LindiDataset: - def __init__(self, *, _zarr_array: zarr.Array): - self._zarr_array = _zarr_array - self._is_scalar = self._zarr_array.attrs.get("_SCALAR", False) - - self._external_hdf5_clients: Dict[str, h5py.File] = {} - - @property - def name(self): - return self._zarr_array.name - - @property - def attrs(self): - """Attributes attached to this object""" - return LindiAttributes(_object=self._zarr_array) - - @property - def ndim(self): - if self._is_scalar: - return 0 - return self._zarr_array.ndim - - @property - def shape(self): - if self._is_scalar: - return () - return self._zarr_array.shape - - @property - def dtype(self): - return self._zarr_array.dtype - - @property - def size(self): - if self._is_scalar: - return 1 - return self._zarr_array.size - - @property - def nbytes(self): - return self._zarr_array.nbytes - - def __len__(self): - """We conform to h5py, which is the number of elements in the first dimension, TypeError if scalar""" - if self.ndim == 0: - raise TypeError("Scalar dataset") - return self.shape[0] # type: ignore - - def __iter__(self): - """We conform to h5py, which is: Iterate over the first axis. 
TypeError if scalar.""" - shape = self.shape - if len(shape) == 0: - raise TypeError("Can't iterate over a scalar dataset") - for i in range(shape[0]): - yield self[i] - - def __getitem__(self, selection): - # First check whether this is an external array link - external_array_link = self._zarr_array.attrs.get("_EXTERNAL_ARRAY_LINK", None) - if external_array_link and isinstance(external_array_link, dict): - link_type = external_array_link.get("link_type", None) - if link_type == 'hdf5_dataset': - url = external_array_link.get("url", None) - name = external_array_link.get("name", None) - if url is not None and name is not None: - client = self._get_external_hdf5_client(url) - dataset = client[name] - assert isinstance(dataset, h5py.Dataset) - return dataset[selection] - # We use zarr's slicing, except in the case of a scalar dataset - if self.ndim == 0: - # make sure selection is () - if selection != (): - raise TypeError(f'Cannot slice a scalar dataset with {selection}') - return self._zarr_array[0] - return self._zarr_array[selection] - - def _get_external_hdf5_client(self, url: str) -> h5py.File: - if url not in self._external_hdf5_clients: - remf = remfile.File(url) - self._external_hdf5_clients[url] = h5py.File(remf, 'r') - return self._external_hdf5_clients[url] diff --git a/lindi/LindiClient/LindiGroup.py b/lindi/LindiClient/LindiGroup.py deleted file mode 100644 index d4b5bf2..0000000 --- a/lindi/LindiClient/LindiGroup.py +++ /dev/null @@ -1,66 +0,0 @@ -import zarr -from .LindiAttributes import LindiAttributes -from .LindiDataset import LindiDataset - - -class LindiGroup: - def __init__(self, *, _zarr_group: zarr.Group): - self._zarr_group = _zarr_group - - @property - def attrs(self): - """Attributes attached to this object""" - return LindiAttributes(_object=self._zarr_group) - - def keys(self): - return self._zarr_group.keys() - - @property - def name(self): - return self._zarr_group.name - - def get(self, key, default=None): - try: - return self[key] - except KeyError: - return default - - def __getitem__(self, key): - if isinstance(key, dict): - # might be a reference - if "_REFERENCE" in key: - return LindiReference(key['_REFERENCE']) - if not isinstance(key, str): - raise Exception(f'Cannot use key "{key}" of type "{type(key)}" to index into a LindiGroup, at path "{self._zarr_group.name}"') - if key in self._zarr_group.keys(): - x = self._zarr_group[key] - if isinstance(x, zarr.Group): - return LindiGroup(_zarr_group=x) - elif isinstance(x, zarr.Array): - return LindiDataset(_zarr_array=x) - else: - raise Exception(f"Unknown type: {type(x)}") - else: - raise KeyError(f'Key "{key}" not found in group "{self._zarr_group.name}"') - - def __iter__(self): - for k in self.keys(): - yield k - - -class LindiReference: - def __init__(self, obj: dict): - self._object_id = obj["object_id"] - self._path = obj["path"] - self._source = obj["source"] - self._source_object_id = obj["source_object_id"] - - @property - def name(self): - return self._path - - def __repr__(self): - return f"LindiReference({self._source}, {self._path})" - - def __str__(self): - return f"LindiReference({self._source}, {self._path})" diff --git a/lindi/LindiClient/__init__.py b/lindi/LindiClient/__init__.py deleted file mode 100644 index 6d8a5e0..0000000 --- a/lindi/LindiClient/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .LindiClient import LindiClient # noqa: F401 -from .LindiGroup import LindiGroup # noqa: F401 -from .LindiDataset import LindiDataset # noqa: F401 -from .LindiAttributes import LindiAttributes 
# noqa: F401 -from .LindiGroup import LindiReference # noqa: F401 diff --git a/lindi/LindiH5Store/__init__.py b/lindi/LindiH5Store/__init__.py deleted file mode 100644 index 858de62..0000000 --- a/lindi/LindiH5Store/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .LindiH5Store import LindiH5Store, LindiH5StoreOpts # noqa: F401 diff --git a/lindi/LindiH5Store/LindiH5Store.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py similarity index 68% rename from lindi/LindiH5Store/LindiH5Store.py rename to lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 4d519f5..5f87aa1 100644 --- a/lindi/LindiH5Store/LindiH5Store.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -1,4 +1,5 @@ import json +import base64 from typing import Union, List, IO, Any, Dict, Literal from dataclasses import dataclass import numpy as np @@ -9,7 +10,6 @@ from zarr.errors import ReadOnlyError from ._zarr_info_for_h5_dataset import _zarr_info_for_h5_dataset from ._util import ( - _get_h5_item, _read_bytes, _get_chunk_byte_range, _get_byte_range_for_contiguous_dataset, @@ -18,22 +18,31 @@ @dataclass -class LindiH5StoreOpts: +class LindiH5ZarrStoreOpts: + """ + Options for the LindiH5ZarrStore class. + + Attributes: + num_dataset_chunks_threshold (Union[int, None]): For each dataset in the + HDF5 file, if the number of chunks is greater than this threshold, then + the dataset will be represented as an external array link. If None, then + the threshold is not used. Default is 1000. + """ num_dataset_chunks_threshold: Union[int, None] = 1000 -class LindiH5Store(Store): +class LindiH5ZarrStore(Store): """A zarr store that provides a read-only view of an HDF5 file. Do not call the constructor directly. Instead do one of the following: - store = LindiH5Store.from_file(hdf5_file_name) + store = LindiH5ZarrStore.from_file(hdf5_file_name) # do stuff with store store.close() or - with LindiH5Store.from_file(hdf5_file_name) as store: + with LindiH5ZarrStore.from_file(hdf5_file_name) as store: # do stuff with store """ @@ -41,7 +50,7 @@ def __init__( self, *, _file: Union[IO, Any], - _opts: LindiH5StoreOpts, + _opts: LindiH5ZarrStoreOpts, _url: Union[str, None] = None, ): """ @@ -61,32 +70,21 @@ def __init__( self._external_array_links: Dict[str, Union[dict, None]] = {} - def close(self): - # overrides the base class method - # now a context manager can be used - if hasattr(self, "_h5f") and self._h5f: - self._h5f.close() - self._h5f = None - if hasattr(self, "_file") and self._file: - if not isinstance(self._file, remfile.File): - self._file.close() - self._file = None - @staticmethod def from_file( hdf5_file_name_or_url: str, *, - opts: LindiH5StoreOpts = LindiH5StoreOpts(), + opts: LindiH5ZarrStoreOpts = LindiH5ZarrStoreOpts(), url: Union[str, None] = None, ): """ - Create a LindiH5Store from a file or url. + Create a LindiH5ZarrStore from a file or url pointing to an HDF5 file. Parameters ---------- hdf5_file_name_or_url : str The name of the HDF5 file or a URL to the HDF5 file. - opts : LindiH5StoreOpts + opts : LindiH5ZarrStoreOpts Options for the store. 
url : str or None If hdf5_file_name_or_url is a local file name, then this can @@ -100,10 +98,20 @@ def from_file( "http://" ) or hdf5_file_name_or_url.startswith("https://"): remf = remfile.File(hdf5_file_name_or_url, verbose=False) - return LindiH5Store(_file=remf, _url=hdf5_file_name_or_url, _opts=opts) + return LindiH5ZarrStore(_file=remf, _url=hdf5_file_name_or_url, _opts=opts) else: f = open(hdf5_file_name_or_url, "rb") - return LindiH5Store(_file=f, _url=url, _opts=opts) + return LindiH5ZarrStore(_file=f, _url=url, _opts=opts) + + def close(self): + """Close the store.""" + if hasattr(self, "_h5f") and self._h5f: + self._h5f.close() + self._h5f = None + if hasattr(self, "_file") and self._file: + if not isinstance(self._file, remfile.File): + self._file.close() + self._file = None def __getitem__(self, key): """Get an item from the store (required by base class).""" @@ -138,7 +146,7 @@ def __contains__(self, key): key_name = parts[-1] key_parent = "/".join(parts[:-1]) if key_name == ".zattrs": - h5_item = _get_h5_item(self._h5f, key_parent) + h5_item = self._h5f.get('/' + key_parent, None) if not h5_item: return False # We always return True here even if the attributes are going to be @@ -146,18 +154,18 @@ def __contains__(self, key): # write out the ref file system, we exclude it there. return isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset) elif key_name == ".zgroup": - h5_item = _get_h5_item(self._h5f, key_parent) + h5_item = self._h5f.get('/' + key_parent, None) if not h5_item: return False return isinstance(h5_item, h5py.Group) elif key_name == ".zarray": - h5_item = _get_h5_item(self._h5f, key_parent) + h5_item = self._h5f.get('/' + key_parent, None) if not h5_item: return False return isinstance(h5_item, h5py.Dataset) else: # a chunk file - h5_item = _get_h5_item(self._h5f, key_parent) + h5_item = self._h5f.get('/' + key_parent, None) if not h5_item: return False if not isinstance(h5_item, h5py.Dataset): @@ -193,10 +201,27 @@ def _get_zattrs_bytes(self, parent_key: str): """Get the attributes of a group or dataset""" if self._h5f is None: raise Exception("Store is closed") - h5_item = _get_h5_item(self._h5f, parent_key) - assert isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset), ( - f"Item {parent_key} is not a group or dataset in _get_zattrs_bytes" - ) + h5_item = self._h5f.get('/' + parent_key, None) + if h5_item is None: + raise KeyError(parent_key) + if not isinstance(h5_item, h5py.Group) and not isinstance(h5_item, h5py.Dataset): + raise Exception(f"Item {parent_key} is not a group or dataset. It is {type(h5_item)}") + + # Check whether this is a soft link + if isinstance(h5_item, h5py.Group) and parent_key != '': + link = self._h5f.get('/' + parent_key, getlink=True) + if isinstance(link, h5py.ExternalLink): + print(f'WARNING: External links not supported: {parent_key}') + elif isinstance(link, h5py.SoftLink): + # if it's a soft link, we return a special attribute and ignore + # the rest of the attributes because they should be stored in + # the target of the soft link + return _reformat_json(json.dumps({ + "_SOFT_LINK": { + "path": link.path + } + }).encode("utf-8")) + # We create a dummy zarr group and copy the attributes to it. That way # we know that zarr has accepted them and they are serialized in the # correct format. 
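The soft-link handling added above in `_get_zattrs_bytes` relies on h5py returning the link object itself (rather than the link target) when `getlink=True` is passed. A minimal standalone sketch of that pattern, assuming a hypothetical file `example.h5` in which `/linked_group` is a soft link:

```python
import json
import h5py

with h5py.File("example.h5", "r") as f:
    link = f.get("/linked_group", getlink=True)
    if isinstance(link, h5py.SoftLink):
        # This dict is what the store serializes as the group's .zattrs content
        print(json.dumps({"_SOFT_LINK": {"path": link.path}}))
```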
@@ -209,6 +234,13 @@ def _get_zattrs_bytes(self, parent_key: str): if isinstance(h5_item, h5py.Dataset): if h5_item.ndim == 0: dummy_group.attrs["_SCALAR"] = True + if h5_item.dtype.kind == "V": # compound type + compound_dtype = [ + [name, str(h5_item.dtype[name])] + for name in h5_item.dtype.names + ] + # For example: [['x', 'uint32'], ['y', 'uint32'], ['weight', 'float32']] + dummy_group.attrs["_COMPOUND_DTYPE"] = compound_dtype external_array_link = self._get_external_array_link(parent_key, h5_item) if external_array_link is not None: dummy_group.attrs["_EXTERNAL_ARRAY_LINK"] = external_array_link @@ -223,7 +255,7 @@ def _get_zgroup_bytes(self, parent_key: str): """Get the .zgroup JSON text for a group""" if self._h5f is None: raise Exception("Store is closed") - h5_item = _get_h5_item(self._h5f, parent_key) + h5_item = self._h5f.get('/' + parent_key, None) if not isinstance(h5_item, h5py.Group): raise Exception(f"Item {parent_key} is not a group") # We create a dummy zarr group and then get the .zgroup JSON text @@ -236,7 +268,7 @@ def _get_zarray_bytes(self, parent_key: str): """Get the .zarray JSON text for a dataset""" if self._h5f is None: raise Exception("Store is closed") - h5_item = _get_h5_item(self._h5f, parent_key) + h5_item = self._h5f.get('/' + parent_key, None) if not isinstance(h5_item, h5py.Dataset): raise Exception(f"Item {parent_key} is not a dataset") # get the shape, chunks, dtype, and filters from the h5 dataset @@ -282,7 +314,7 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str): def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): if self._h5f is None: raise Exception("Store is closed") - h5_item = _get_h5_item(self._h5f, key_parent) + h5_item = self._h5f.get('/' + key_parent, None) if not isinstance(h5_item, h5py.Dataset): raise Exception(f"Item {key_parent} is not a dataset") @@ -303,6 +335,10 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): f"Chunk name {key_name} does not match dataset dimensions" ) + # Check whether we have inline data for this array. This would be set + # when we created the .zarray JSON text for the dataset. Note that this + # means that the .zarray file needs to be read before the chunk files, + # which should always be the case (I assume). if key_parent in self._inline_data_for_arrays: x = self._inline_data_for_arrays[key_parent] if isinstance(x, bytes): @@ -314,6 +350,7 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): f"Inline data for dataset {key_parent} is not bytes or str. It is {type(x)}" ) + # If this is a scalar, then the data should have been inline if h5_item.ndim == 0: raise Exception(f"No inline data for scalar dataset {key_parent}") @@ -328,12 +365,10 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): f"Chunk coordinates {chunk_coords} out of range for dataset {key_parent}" ) if h5_item.chunks is not None: - # Get the byte range in the file for the chunk. We do it this way - # rather than reading from the h5py dataset Because we want to - # ensure that we are reading the exact bytes. + # Get the byte range in the file for the chunk. byte_offset, byte_count = _get_chunk_byte_range(h5_item, chunk_coords) else: - # in this case (contiguous dataset), we need to check that the chunk + # In this case (contiguous dataset), we need to check that the chunk # coordinates are (0, 0, 0, ...) 
if chunk_coords != (0,) * h5_item.ndim: raise Exception( @@ -344,12 +379,15 @@ def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): return byte_offset, byte_count, None def _get_external_array_link(self, parent_key: str, h5_item: h5py.Dataset): + # First check the memory cache if parent_key in self._external_array_links: return self._external_array_links[parent_key] - self._external_array_links[parent_key] = ( - None # important to set to None so that we don't keep checking - ) + # Important to set it to None so that we don't keep checking it + self._external_array_links[parent_key] = None if h5_item.chunks and self._opts.num_dataset_chunks_threshold is not None: + # We compute the expected number of chunks using the shape and chunks + # and compare it to the threshold. If it's greater, then we create an + # external array link. shape = h5_item.shape chunks = h5_item.chunks chunk_coords_shape = [ @@ -370,27 +408,37 @@ def _get_external_array_link(self, parent_key: str, h5_item: h5py.Dataset): return self._external_array_links[parent_key] def listdir(self, path: str = "") -> List[str]: + # This function is used by zarr. We need to return the names of the + # subdirectories of the given path in the store. We should not return + # the names of files. if self._h5f is None: raise Exception("Store is closed") try: - item = _get_h5_item(self._h5f, path) + item = self._h5f['/' + path] except KeyError: return [] if isinstance(item, h5py.Group): + # check whether it's a soft link + link = self._h5f.get('/' + path, getlink=True) if path != '' else None + if isinstance(link, h5py.SoftLink): + # in this case we don't return any keys because the keys should + # be in the target of the soft link + return [] + # We will have one subdir for each key in the group ret = [] for k in item.keys(): ret.append(k) return ret elif isinstance(item, h5py.Dataset): - ret = [] - return ret + # Datasets do not have subdirectories + return [] else: return [] def to_file(self, file_name: str, *, file_type: Literal["zarr.json"] = "zarr.json"): """Write a reference file system cooresponding to this store to a file. - This can then be loaded using LindiClient.from_file(fname) + This can then be loaded using LindiH5pyFile.from_reference_file_system(file_name) """ if file_type != "zarr.json": raise Exception(f"Unsupported file type: {file_type}") @@ -402,8 +450,7 @@ def to_file(self, file_name: str, *, file_type: Literal["zarr.json"] = "zarr.jso def to_reference_file_system(self) -> dict: """Create a reference file system cooresponding to this store. - This can then be loaded using - LindiClient.from_reference_file_system(obj) + This can then be loaded using LindiH5pyFile.from_reference_file_system(obj) """ if self._h5f is None: raise Exception("Store is closed") @@ -411,69 +458,98 @@ def to_reference_file_system(self) -> dict: raise Exception("You must specify a url to create a reference file system") ret = {"refs": {}, "version": 1} + # TODO: use templates to decrease the size of the JSON + def _add_ref(key: str, content: Union[bytes, None]): if content is None: raise Exception(f"Unable to get content for key {key}") - try: - ret["refs"][key] = content.decode("ascii") - except UnicodeDecodeError: - import base64 - - # from kerchunk + if content.startswith(b"base64:"): + # This is the rare case where the content actually starts with "base64:" + # which is confusing. Not sure when this would happen, but it could. 
+ # TODO: needs a unit test ret["refs"][key] = (b"base64:" + base64.b64encode(content)).decode( "ascii" ) + else: + # This is the usual case. It will raise a UnicodeDecodeError if the + # content is not valid ASCII, in which case the content will be + # base64 encoded. + try: + ret["refs"][key] = content.decode("ascii") + except UnicodeDecodeError: + # If the content is not valid ASCII, then we base64 encode it. The + # reference file system reader will know what to do with it. + ret["refs"][key] = (b"base64:" + base64.b64encode(content)).decode( + "ascii" + ) def _process_group(key, item: h5py.Group): if isinstance(item, h5py.Group): + # Add the .zattrs and .zgroup files for the group zattrs_bytes = self.get(_join(key, ".zattrs")) if zattrs_bytes != b"{}": # don't include empty zattrs _add_ref(_join(key, ".zattrs"), self.get(_join(key, ".zattrs"))) _add_ref(_join(key, ".zgroup"), self.get(_join(key, ".zgroup"))) + # check if this is a soft link + link = item.file.get('/' + key, getlink=True) if key != '' else None + if isinstance(link, h5py.SoftLink): + # if it's a soft link, we don't include the keys because + # they should be in the target of the soft link + return for k in item.keys(): subitem = item[k] if isinstance(subitem, h5py.Group): + # recursively process subgroups _process_group(_join(key, k), subitem) elif isinstance(subitem, h5py.Dataset): - subkey = _join(key, k) - zattrs_bytes = self.get(f"{subkey}/.zattrs") - assert zattrs_bytes is not None - _add_ref(f"{subkey}/.zattrs", zattrs_bytes) - zattrs_dict = json.loads(zattrs_bytes.decode("utf-8")) - external_array_link = zattrs_dict.get( - "_EXTERNAL_ARRAY_LINK", None - ) - zarray_bytes = self.get(f"{subkey}/.zarray") - assert zarray_bytes is not None - _add_ref(f"{subkey}/.zarray", zarray_bytes) - zarray_dict = json.loads(zarray_bytes.decode("utf-8")) - if ( - external_array_link is None - ): # only process chunks for non-external array links - shape = zarray_dict["shape"] - chunks = zarray_dict.get("chunks", None) - chunk_coords_shape = [ - (shape[i] + chunks[i] - 1) // chunks[i] - for i in range(len(shape)) - ] - chunk_names = _get_chunk_names_for_dataset( - chunk_coords_shape - ) - for chunk_name in chunk_names: - byte_offset, byte_count, inline_data = ( - self._get_chunk_file_bytes_data(subkey, chunk_name) - ) - if inline_data is not None: - _add_ref(f"{subkey}/{chunk_name}", inline_data) - else: - assert byte_offset is not None - assert byte_count is not None - ret["refs"][f"{subkey}/{chunk_name}"] = [ - self._url, - byte_offset, - byte_count, - ] - + _process_dataset(_join(key, k)) + + def _process_dataset(key): + # Add the .zattrs and .zarray files for the dataset + zattrs_bytes = self.get(f"{key}/.zattrs") + assert zattrs_bytes is not None + if zattrs_bytes != b"{}": # don't include empty zattrs + _add_ref(f"{key}/.zattrs", zattrs_bytes) + zattrs_dict = json.loads(zattrs_bytes.decode("utf-8")) + external_array_link = zattrs_dict.get( + "_EXTERNAL_ARRAY_LINK", None + ) + zarray_bytes = self.get(f"{key}/.zarray") + assert zarray_bytes is not None + _add_ref(f"{key}/.zarray", zarray_bytes) + zarray_dict = json.loads(zarray_bytes.decode("utf-8")) + + if external_array_link is None: + # Only add chunk references for datasets without an external array link + shape = zarray_dict["shape"] + chunks = zarray_dict.get("chunks", None) + chunk_coords_shape = [ + (shape[i] + chunks[i] - 1) // chunks[i] + for i in range(len(shape)) + ] + # For example, chunk_names could be ['0', '1', '2', ...] + # or ['0.0', '0.1', '0.2', ...] 
+ chunk_names = _get_chunk_names_for_dataset( + chunk_coords_shape + ) + for chunk_name in chunk_names: + byte_offset, byte_count, inline_data = ( + self._get_chunk_file_bytes_data(key, chunk_name) + ) + if inline_data is not None: + # The data is inline for this chunk + _add_ref(f"{key}/{chunk_name}", inline_data) + else: + # In this case we reference a chunk of data in a separate file + assert byte_offset is not None + assert byte_count is not None + ret["refs"][f"{key}/{chunk_name}"] = [ + self._url, + byte_offset, + byte_count, + ] + + # Process the groups recursively starting with the root group _process_group("", self._h5f) return ret @@ -486,7 +562,11 @@ def _join(a: str, b: str) -> str: def _get_chunk_names_for_dataset(chunk_coords_shape: List[int]) -> List[str]: - """Get the chunk names for a dataset with the given chunk coords shape.""" + """Get the chunk names for a dataset with the given chunk coords shape. + + For example: _get_chunk_names_for_dataset([1, 2, 3]) returns + ['0.0.0', '0.0.1', '0.0.2', '0.1.0', '0.1.1', '0.1.2'] + """ ndim = len(chunk_coords_shape) if ndim == 0: return ["0"] @@ -506,7 +586,7 @@ def _reformat_json(x: Union[bytes, None]) -> Union[bytes, None]: if x is None: return None a = json.loads(x.decode("utf-8")) - return json.dumps(a, cls=FloatJSONEncoder).encode("utf-8") + return json.dumps(a, cls=FloatJSONEncoder, separators=(",", ":")).encode("utf-8") # From https://github.com/rly/h5tojson/blob/b162ff7f61160a48f1dc0026acb09adafdb422fa/h5tojson/h5tojson.py#L121-L156 diff --git a/lindi/LindiH5ZarrStore/__init__.py b/lindi/LindiH5ZarrStore/__init__.py new file mode 100644 index 0000000..6972f78 --- /dev/null +++ b/lindi/LindiH5ZarrStore/__init__.py @@ -0,0 +1,6 @@ +from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts + +__all__ = [ + "LindiH5ZarrStore", + "LindiH5ZarrStoreOpts", +] diff --git a/lindi/LindiH5Store/_h5_attr_to_zarr_attr.py b/lindi/LindiH5ZarrStore/_h5_attr_to_zarr_attr.py similarity index 74% rename from lindi/LindiH5Store/_h5_attr_to_zarr_attr.py rename to lindi/LindiH5ZarrStore/_h5_attr_to_zarr_attr.py index 26530b6..054b290 100644 --- a/lindi/LindiH5Store/_h5_attr_to_zarr_attr.py +++ b/lindi/LindiH5ZarrStore/_h5_attr_to_zarr_attr.py @@ -14,20 +14,34 @@ def _h5_attr_to_zarr_attr(attr: Any, *, label: str = '', h5f: h5py.File): Otherwise, raise NotImplementedError """ - if isinstance(attr, bytes): - return attr.decode('utf-8') # is this reversible? 
+ + # first disallow special strings + special_strings = ['NaN', 'Infinity', '-Infinity'] + if isinstance(attr, str) and attr in special_strings: + raise ValueError(f"Special string {attr} not allowed in attribute value at {label}") + if isinstance(attr, bytes) and attr in [x.encode('utf-8') for x in special_strings]: + raise ValueError(f"Special string {attr} not allowed in attribute value at {label}") + + if attr is None: + return None + elif isinstance(attr, bytes): + return attr.decode('utf-8') elif isinstance(attr, (int, float, str)): return attr + elif np.issubdtype(type(attr), np.integer): + return int(attr) + elif np.issubdtype(type(attr), np.floating): + return float(attr) + elif np.issubdtype(type(attr), np.bool_): + return bool(attr) + elif type(attr) is np.bytes_: + return attr.tobytes().decode('utf-8') + elif isinstance(attr, h5py.Reference): + return _h5_ref_to_zarr_attr(attr, label=label + '._REFERENCE', h5f=h5f) elif isinstance(attr, list): return [_h5_attr_to_zarr_attr(x, label=label, h5f=h5f) for x in attr] elif isinstance(attr, dict): return {k: _h5_attr_to_zarr_attr(v, label=label, h5f=h5f) for k, v in attr.items()} - elif isinstance(attr, h5py.Reference): - return _h5_ref_to_zarr_attr(attr, label=label, h5f=h5f) - elif np.issubdtype(type(attr), np.integer): - return int(attr) # possible loss of precision? - elif np.issubdtype(type(attr), np.floating): - return float(attr) # possible loss of precision? elif isinstance(attr, np.ndarray): return _h5_attr_to_zarr_attr(attr.tolist(), label=label, h5f=h5f) else: @@ -62,13 +76,15 @@ def _h5_ref_to_zarr_attr(ref: h5py.Reference, *, label: str = '', h5f: h5py.File # is to do an initial pass through the file and build a map of object IDs to # paths. This would need to happen elsewhere in the code. deref_objname = h5py.h5r.get_name(ref, file_id) + if deref_objname is None: + raise ValueError(f"Could not dereference object with reference {ref}") deref_objname = deref_objname.decode("utf-8") dref_obj = h5f[deref_objname] object_id = dref_obj.attrs.get("object_id", None) # Here we assume that the file has a top-level attribute called "object_id". - # This will be the case for files created by the LindiH5Store class. + # This will be the case for files created by the LindiH5ZarrStore class. 
file_object_id = h5f.attrs.get("object_id", None) # See https://hdmf-zarr.readthedocs.io/en/latest/storage.html#storing-object-references-in-attributes @@ -89,6 +105,8 @@ def _h5_ref_to_zarr_attr(ref: h5py.Reference, *, label: str = '', h5f: h5py.File # "value": value # } - return { + # important to run it through _h5_attr_to_zarr_attr to handle object IDs of + # type bytes + return _h5_attr_to_zarr_attr({ "_REFERENCE": value - } + }, label=label, h5f=h5f) diff --git a/lindi/LindiH5Store/_h5_filters_to_codecs.py b/lindi/LindiH5ZarrStore/_h5_filters_to_codecs.py similarity index 100% rename from lindi/LindiH5Store/_h5_filters_to_codecs.py rename to lindi/LindiH5ZarrStore/_h5_filters_to_codecs.py diff --git a/lindi/LindiH5Store/_util.py b/lindi/LindiH5ZarrStore/_util.py similarity index 93% rename from lindi/LindiH5Store/_util.py rename to lindi/LindiH5ZarrStore/_util.py index ece2544..2976dbc 100644 --- a/lindi/LindiH5Store/_util.py +++ b/lindi/LindiH5ZarrStore/_util.py @@ -3,11 +3,6 @@ import h5py -def _get_h5_item(h5f: h5py.File, key: str): - """Get an item from the h5 file, given its key.""" - return h5f.get('/' + key, None) - - def _read_bytes(file: IO, offset: int, count: int): """Read a range of bytes from a file-like object.""" file.seek(offset) diff --git a/lindi/LindiH5Store/_zarr_info_for_h5_dataset.py b/lindi/LindiH5ZarrStore/_zarr_info_for_h5_dataset.py similarity index 63% rename from lindi/LindiH5Store/_zarr_info_for_h5_dataset.py rename to lindi/LindiH5ZarrStore/_zarr_info_for_h5_dataset.py index 568079d..de1105c 100644 --- a/lindi/LindiH5Store/_zarr_info_for_h5_dataset.py +++ b/lindi/LindiH5ZarrStore/_zarr_info_for_h5_dataset.py @@ -6,6 +6,7 @@ import numcodecs import h5py from numcodecs.abc import Codec +from ._h5_attr_to_zarr_attr import _h5_ref_to_zarr_attr from ._h5_filters_to_codecs import _h5_filters_to_codecs @@ -23,7 +24,7 @@ class ZarrInfoForH5Dataset: def _zarr_info_for_h5_dataset(h5_dataset: h5py.Dataset) -> ZarrInfoForH5Dataset: """Get the information needed to create a zarr dataset from an h5py dataset. - This is the main workhorse function for LindiH5Store. It takes an h5py + This is the main workhorse function for LindiH5ZarrStore. It takes an h5py dataset and returns a ZarrInfoForH5Dataset object. It handles the following cases: @@ -32,16 +33,16 @@ def _zarr_info_for_h5_dataset(h5_dataset: h5py.Dataset) -> ZarrInfoForH5Dataset: is in the hdf5 file. The hdf5 filters are translated into zarr filters using the _h5_filters_to_codecs function. - If it is a non-scalar object array, then the inline_data will be a JSON string and the - JSON codec will be used. + If it is a non-scalar object array, then the inline_data will be a JSON + string and the JSON codec will be used. When the shape is (), we have a scalar dataset. Since zarr doesn't support - scalar datasets, we make an array of shape (1,). The _ARRAY_DIMENSIONS - attribute will be set to [] elsewhere to indicate that it is actually a - scalar. The inline_data attribute will be set. In the case of a numeric - scalar, it will be a bytes object with the binary representation of the - value. In the case of an object, the inline_data will be a JSON string and - the JSON codec will be used. + scalar datasets, we make an array of shape (1,). The _SCALAR attribute will + be set to True elsewhere to indicate that it is actually a scalar. The + inline_data attribute will be set. In the case of a numeric scalar, it will + be a bytes object with the binary representation of the value. 
In the case + of an object, the inline_data will be a JSON string and the JSON codec will + be used. """ shape = h5_dataset.shape dtype = h5_dataset.dtype @@ -80,7 +81,7 @@ def _zarr_info_for_h5_dataset(h5_dataset: h5py.Dataset) -> ZarrInfoForH5Dataset: filters=None, fill_value=' ', object_codec=numcodecs.JSON(), - inline_data=json.dumps([value, '|O', [1]]).encode('utf-8') + inline_data=json.dumps([value, '|O', [1]], separators=(',', ':')).encode('utf-8') ) else: raise Exception(f'Not yet implemented (1): object scalar dataset with value {value} and dtype {dtype}') @@ -111,20 +112,16 @@ def _zarr_info_for_h5_dataset(h5_dataset: h5py.Dataset) -> ZarrInfoForH5Dataset: object_codec = numcodecs.JSON() data = h5_dataset[:] data_vec_view = data.ravel() - _warning_reference_in_dataset_printed = False for i, val in enumerate(data_vec_view): if isinstance(val, bytes): data_vec_view[i] = val.decode() elif isinstance(val, str): data_vec_view[i] = val - elif isinstance(val, h5py.h5r.Reference): - if not _warning_reference_in_dataset_printed: - print(f'Warning: reference in dataset {h5_dataset.name} not handled') - _warning_reference_in_dataset_printed = True - data_vec_view[i] = None + elif isinstance(val, h5py.Reference): + data_vec_view[i] = _h5_ref_to_zarr_attr(val, label=f'{h5_dataset.name}[{i}]', h5f=h5_dataset.file) else: raise Exception(f'Cannot handle dataset {h5_dataset.name} with dtype {dtype} and shape {shape}') - inline_data = json.dumps(data.tolist() + ['|O', list(shape)]).encode('utf-8') + inline_data = json.dumps(data.tolist() + ['|O', list(shape)], separators=(',', ':')).encode('utf-8') return ZarrInfoForH5Dataset( shape=shape, chunks=shape, # be explicit about chunks @@ -136,10 +133,59 @@ def _zarr_info_for_h5_dataset(h5_dataset: h5py.Dataset) -> ZarrInfoForH5Dataset: ) elif dtype.kind in 'SU': # byte string or unicode string raise Exception(f'Not yet implemented (2): dataset {h5_dataset.name} with dtype {dtype} and shape {shape}') + elif dtype.kind == 'V': # void (i.e. compound) + if h5_dataset.ndim == 1: + # for now we only handle the case of a 1D compound dataset + data = h5_dataset[:] + # Create an array that would be for example like this + # dtype = np.dtype([('x', np.float64), ('y', np.int32), ('weight', np.float64)]) + # array_list = [[3, 4, 5.3], [2, 1, 7.1], ...] 
+ # where the first entry corresponds to x in the example above, the second to y, and the third to weight + # This is a more compact representation than [{'x': ...}] + # The _COMPOUND_DTYPE attribute will be set on the dataset in the zarr store + # which will be used to interpret the data + array_list = [ + [ + _json_serialize(data[name][i], dtype[name], h5_dataset) + for name in dtype.names + ] + for i in range(h5_dataset.shape[0]) + ] + object_codec = numcodecs.JSON() + inline_data = array_list + ['|O', list(shape)] + return ZarrInfoForH5Dataset( + shape=shape, + chunks=shape, # be explicit about chunks + dtype='object', + filters=None, + fill_value=' ', # not sure what to put here + object_codec=object_codec, + inline_data=json.dumps(inline_data, separators=(',', ':')).encode('utf-8') + ) + else: + raise Exception(f'More than one dimension not supported for compound dataset {h5_dataset.name} with dtype {dtype} and shape {shape}') else: + print(dtype.kind) raise Exception(f'Not yet implemented (3): dataset {h5_dataset.name} with dtype {dtype} and shape {shape}') +def _json_serialize(val: Any, dtype: np.dtype, h5_dataset: h5py.Dataset) -> Any: + if dtype.kind in ['i', 'u']: # integer, unsigned integer + return int(val) + elif dtype.kind == 'f': # float + return float(val) + elif dtype.kind == 'b': # boolean + return bool(val) + elif dtype.kind == 'S': # byte string + return val.decode() + elif dtype.kind == 'U': # unicode string + return val + elif dtype == h5py.Reference: + return _h5_ref_to_zarr_attr(val, label=f'{h5_dataset.name}', h5f=h5_dataset.file) + else: + raise Exception(f'Cannot serialize item {val} with dtype {dtype} when serializing dataset {h5_dataset.name} with compound dtype.') + + def _get_numeric_format_str(dtype: Any) -> Union[str, None]: """Get the format string for a numeric dtype. 
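To make the compound-dataset encoding above concrete, here is a minimal sketch (not part of the diff; the field names and values are made up) of how a 1D compound dataset without references gets serialized to inline JSON, mirroring the logic in `_zarr_info_for_h5_dataset`:

```python
import json
import numpy as np

# A small 1D compound dataset (hypothetical fields)
dt = np.dtype([("x", np.float64), ("y", np.int32)])
data = np.array([(3.0, 4), (2.5, 1)], dtype=dt)

# One row per element, one entry per field, in field order
array_list = [[float(row["x"]), int(row["y"])] for row in data]

# Same trailing markers and compact separators as used in the diff
inline_data = json.dumps(array_list + ["|O", [len(data)]], separators=(",", ":")).encode("utf-8")
print(inline_data)  # b'[[3.0,4],[2.5,1],"|O",[2]]'
```

The `_COMPOUND_DTYPE` attribute written alongside the dataset is what allows the h5py-like layer to recover the field names and dtypes when reading the data back.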
diff --git a/lindi/LindiH5pyFile/LindiH5pyAttributes.py b/lindi/LindiH5pyFile/LindiH5pyAttributes.py new file mode 100644 index 0000000..235fb03 --- /dev/null +++ b/lindi/LindiH5pyFile/LindiH5pyAttributes.py @@ -0,0 +1,101 @@ +from typing import Literal +from .LindiH5pyReference import LindiH5pyReference + +_special_attribute_keys = [ + "_SCALAR", + "_COMPOUND_DTYPE", + "_REFERENCE", + "_EXTERNAL_ARRAY_LINK", + "_SOFT_LINK", +] + + +class LindiH5pyAttributes: + def __init__(self, attrs, attrs_type: Literal["h5py", "zarr"]): + self._attrs = attrs + self._attrs_type = attrs_type + + def get(self, key, default=None): + if self._attrs_type == "h5py": + return self._attrs.get(key, default) + elif self._attrs_type == "zarr": + try: + if key in _special_attribute_keys: + raise KeyError + return self[key] + except KeyError: + return default + else: + raise ValueError(f"Unknown attrs_type: {self._attrs_type}") + + def __contains__(self, key): + if self._attrs_type == "h5py": + return key in self._attrs + elif self._attrs_type == "zarr": + return key in self._attrs + else: + raise ValueError(f"Unknown attrs_type: {self._attrs_type}") + + def __getitem__(self, key): + val = self._attrs[key] + if self._attrs_type == "h5py": + return val + elif self._attrs_type == "zarr": + if isinstance(val, dict) and "_REFERENCE" in val: + return LindiH5pyReference(val["_REFERENCE"]) + + # Convert special float values to actual floats (NaN, Inf, -Inf) + # Note that string versions of these values are not supported + val = _decode_nan_inf_ninf_in_attr_val(val) + + return val + else: + raise ValueError(f"Unknown attrs_type: {self._attrs_type}") + + def __setitem__(self, key, value): + raise KeyError("Cannot set attributes on read-only object") + + def __delitem__(self, key): + raise KeyError("Cannot delete attributes on read-only object") + + def __iter__(self): + if self._attrs_type == "h5py": + return self._attrs.__iter__() + elif self._attrs_type == "zarr": + # Do not return special zarr attributes during iteration + for k in self._attrs: + if k not in _special_attribute_keys: + yield k + else: + raise ValueError(f"Unknown attrs_type: {self._attrs_type}") + + def items(self): + for k in self: + yield k, self[k] + + def __len__(self): + ct = 0 + for _ in self: + ct += 1 + return ct + + def __repr__(self): + return repr(self._attrs) + + def __str__(self): + return str(self._attrs) + + +def _decode_nan_inf_ninf_in_attr_val(val): + if isinstance(val, list): + return [_decode_nan_inf_ninf_in_attr_val(v) for v in val] + elif isinstance(val, dict): + return {k: _decode_nan_inf_ninf_in_attr_val(v) for k, v in val.items()} + elif val == 'NaN': + return float('nan') + elif val == 'Infinity': + return float('inf') + elif val == '-Infinity': + return float('-inf') + else: + return val diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py new file mode 100644 index 0000000..eba5cd4 --- /dev/null +++ b/lindi/LindiH5pyFile/LindiH5pyDataset.py @@ -0,0 +1,274 @@ +from typing import TYPE_CHECKING, Union, Any, Dict +import numpy as np +import h5py +import zarr +import remfile + +from .LindiH5pyAttributes import LindiH5pyAttributes +from .LindiH5pyReference import LindiH5pyReference + + +if TYPE_CHECKING: + from .LindiH5pyFile import LindiH5pyFile + + +class LindiH5pyDatasetId: + def __init__(self, _h5py_dataset_id): + self._h5py_dataset_id = _h5py_dataset_id + + +# This is a global list of external hdf5 clients, which are used by +# possibly multiple LindiH5pyFile objects. 
The key is the URL of the +# external hdf5 file, and the value is the h5py.File object. +# TODO: figure out how to close these clients +_external_hdf5_clients: Dict[str, h5py.File] = {} + + +class LindiH5pyDataset(h5py.Dataset): + def __init__(self, _dataset_object: Union[h5py.Dataset, zarr.Array], _file: "LindiH5pyFile"): + self._dataset_object = _dataset_object + self._file = _file + + # See if we have the _COMPOUND_DTYPE attribute, which signifies that + # this is a compound dtype + if isinstance(_dataset_object, zarr.Array): + compound_dtype_obj = _dataset_object.attrs.get("_COMPOUND_DTYPE", None) + if compound_dtype_obj is not None: + # If we have a compound dtype, then create the numpy dtype + self._compound_dtype = np.dtype( + [(compound_dtype_obj[i][0], compound_dtype_obj[i][1]) for i in range(len(compound_dtype_obj))] + ) + else: + self._compound_dtype = None + else: + self._compound_dtype = None + + # Check whether this is a scalar dataset + if isinstance(_dataset_object, zarr.Array): + self._is_scalar = self._dataset_object.attrs.get("_SCALAR", False) + else: + self._is_scalar = self._dataset_object.ndim == 0 + + @property + def id(self): + if isinstance(self._dataset_object, h5py.Dataset): + return LindiH5pyDatasetId(self._dataset_object.id) + else: + return LindiH5pyDatasetId(None) + + @property + def shape(self): # type: ignore + if self._is_scalar: + return () + return self._dataset_object.shape + + @property + def size(self): + if self._is_scalar: + return 1 + return self._dataset_object.size + + @property + def dtype(self): + if self._compound_dtype is not None: + return self._compound_dtype + ret = self._dataset_object.dtype + if ret.kind == 'O': + if not ret.metadata: + # The following correction is needed because of + # this code in hdmf/backends/hdf5/h5tools.py: + # def _check_str_dtype(self, h5obj): + # dtype = h5obj.dtype + # if dtype.kind == 'O': + # if dtype.metadata.get('vlen') == str and H5PY_3: + # return StrDataset(h5obj, None) + # return h5obj + # We cannot have a dtype with kind 'O' and no metadata + ret = np.dtype(str(ret), metadata={}) + return ret + + @property + def nbytes(self): + return self._dataset_object.nbytes + + @property + def file(self): + return self._file + + @property + def name(self): + return self._dataset_object.name + + @property + def maxshape(self): + # not sure what to return here, so let's return self.shape rather than self._h5py_dataset.maxshape + # return self._h5py_dataset.maxshape + return self.shape + + @property + def ndim(self): + if self._is_scalar: + return 0 + return self._dataset_object.ndim + + @property + def attrs(self): # type: ignore + if isinstance(self._dataset_object, h5py.Dataset): + attrs_type = 'h5py' + elif isinstance(self._dataset_object, zarr.Array): + attrs_type = 'zarr' + else: + raise Exception(f'Unexpected dataset object type: {type(self._dataset_object)}') + return LindiH5pyAttributes(self._dataset_object.attrs, attrs_type=attrs_type) + + def __getitem__(self, args, new_dtype=None): + if isinstance(self._dataset_object, h5py.Dataset): + ret = self._dataset_object.__getitem__(args, new_dtype) + elif isinstance(self._dataset_object, zarr.Array): + if new_dtype is not None: + raise Exception("new_dtype is not supported for zarr.Array") + ret = self._get_item_for_zarr(self._dataset_object, args) + ret = _resolve_references(ret) + else: + raise Exception(f"Unexpected type: {type(self._dataset_object)}") + return ret + + def _get_item_for_zarr(self, zarr_array: zarr.Array, selection: Any): + # First check 
whether this is an external array link + external_array_link = zarr_array.attrs.get("_EXTERNAL_ARRAY_LINK", None) + if external_array_link and isinstance(external_array_link, dict): + link_type = external_array_link.get("link_type", None) + if link_type == 'hdf5_dataset': + url = external_array_link.get("url", None) + name = external_array_link.get("name", None) + if url is not None and name is not None: + client = self._get_external_hdf5_client(url) + dataset = client[name] + assert isinstance(dataset, h5py.Dataset) + return dataset[selection] + if self._compound_dtype is not None: + # Compound dtype + # In this case we index into the compound dtype using the name of the field + # For example, if the dtype is [('x', 'f4'), ('y', 'f4')], then we can do + # dataset['x'][0] to get the first x value + assert self._compound_dtype.names is not None + if isinstance(selection, str): + # Find the index of this field in the compound dtype + ind = self._compound_dtype.names.index(selection) + # Get the dtype of this field + dt = self._compound_dtype[ind] + if dt == 'object': + dtype = h5py.Reference + else: + dtype = np.dtype(dt) + # Return a new object that can be sliced further + # It's important that the return type is Any here, because otherwise we get linter problems + ret = LindiH5pyDatasetCompoundFieldSelection( + dataset=self, ind=ind, dtype=dtype + ) + return ret + else: + raise TypeError( + f"Compound dataset {self.name} does not support selection with {selection}" + ) + + # We use zarr's slicing, except in the case of a scalar dataset + if self.ndim == 0: + # make sure selection is () + if selection != (): + raise TypeError(f'Cannot slice a scalar dataset with {selection}') + return zarr_array[0] + return zarr_array[selection] + + def _get_external_hdf5_client(self, url: str) -> h5py.File: + if url not in _external_hdf5_clients: + if url.startswith("http://") or url.startswith("https://"): + ff = remfile.File(url) + else: + ff = open(url, "rb") # this never gets closed + _external_hdf5_clients[url] = h5py.File(ff, "r") + return _external_hdf5_clients[url] + + +def _resolve_references(x: Any): + if isinstance(x, dict): + # x should only be a dict when x represents a converted reference + if '_REFERENCE' in x: + return LindiH5pyReference(x['_REFERENCE']) + else: # pragma: no cover + raise Exception(f"Unexpected dict in selection: {x}") + elif isinstance(x, list): + # Replace any references in the list with the resolved ref in-place + for i, v in enumerate(x): + x[i] = _resolve_references(v) + elif isinstance(x, np.ndarray): + if x.dtype == object or x.dtype is None: + # Replace any references in the object array with the resolved ref in-place + view_1d = x.reshape(-1) + for i in range(len(view_1d)): + view_1d[i] = _resolve_references(view_1d[i]) + return x + + +class LindiH5pyDatasetCompoundFieldSelection: + """ + This class is returned when a compound dataset is indexed with a field name. + For example, if the dataset has dtype [('x', 'f4'), ('y', 'f4')], then we + can do dataset['x'][0] to get the first x value. The dataset['x'] returns an + object of this class. 
+    """
+    def __init__(self, *, dataset: LindiH5pyDataset, ind: int, dtype: np.dtype):
+        self._dataset = dataset  # The parent dataset
+        self._ind = ind  # The index of the field in the compound dtype
+        self._dtype = dtype  # The dtype of the field
+        if self._dataset.ndim != 1:
+            # For now we only support 1D datasets
+            raise TypeError(
+                f"Compound field selection only implemented for 1D datasets, not {self._dataset.ndim}D"
+            )
+        if not isinstance(self._dataset._dataset_object, zarr.Array):
+            raise TypeError(
+                f"Compound field selection only implemented for zarr.Array, not {type(self._dataset._dataset_object)}"
+            )
+        za = self._dataset._dataset_object
+        self._zarr_array = za
+        # Prepare the data in memory
+        d = [za[i][self._ind] for i in range(len(za))]
+        if self._dtype == h5py.Reference:
+            # Convert to LindiH5pyReference
+            # Every element in the selection should be a reference dict
+            d = [LindiH5pyReference(x['_REFERENCE']) for x in d]
+        self._data = np.array(d, dtype=self._dtype)
+
+    def __len__(self):
+        """We conform to h5py, which is the number of elements in the first dimension. TypeError if scalar"""
+        if self.ndim == 0:
+            raise TypeError("Scalar dataset")
+        return self.shape[0]  # type: ignore
+
+    def __iter__(self):
+        """We conform to h5py, which is: Iterate over the first axis. TypeError if scalar."""
+        shape = self.shape
+        if len(shape) == 0:
+            raise TypeError("Can't iterate over a scalar dataset")
+        for i in range(shape[0]):
+            yield self[i]
+
+    @property
+    def ndim(self):
+        return self._zarr_array.ndim
+
+    @property
+    def shape(self):
+        return self._zarr_array.shape
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def size(self):
+        return self._data.size
+
+    def __getitem__(self, selection):
+        return self._data[selection]
diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py
new file mode 100644
index 0000000..93b39b7
--- /dev/null
+++ b/lindi/LindiH5pyFile/LindiH5pyFile.py
@@ -0,0 +1,218 @@
+from typing import Union
+import json
+import tempfile
+import urllib.request
+import h5py
+import zarr
+from zarr.storage import Store as ZarrStore
+from fsspec.implementations.reference import ReferenceFileSystem
+from fsspec import FSMap
+
+from .LindiH5pyGroup import LindiH5pyGroup
+from .LindiH5pyDataset import LindiH5pyDataset
+from .LindiH5pyAttributes import LindiH5pyAttributes
+from .LindiH5pyReference import LindiH5pyReference
+
+
+class LindiH5pyFile(h5py.File):
+    def __init__(self, _file_object: Union[h5py.File, zarr.Group]):
+        """
+        Do not use this constructor directly. Instead, use:
+        from_reference_file_system, from_zarr_store, from_zarr_group,
+        or from_h5py_file
+        """
+        self._file_object = _file_object
+        self._the_group = LindiH5pyGroup(_file_object, self)
+
+    @staticmethod
+    def from_reference_file_system(rfs: Union[dict, str]):
+        """
+        Create a LindiH5pyFile from a reference file system.
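+
+        rfs may be a dict (an already-loaded reference file system), a path to
+        a local .zarr.json file, or an http(s) URL pointing to such a file.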
+ """ + if isinstance(rfs, str): + if rfs.startswith("http") or rfs.startswith("https"): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/temp.zarr.json" + _download_file(rfs, filename) + with open(filename, "r") as f: + data = json.load(f) + assert isinstance(data, dict) # prevent infinite recursion + return LindiH5pyFile.from_reference_file_system(data) + else: + with open(rfs, "r") as f: + data = json.load(f) + assert isinstance(data, dict) # prevent infinite recursion + return LindiH5pyFile.from_reference_file_system(data) + elif isinstance(rfs, dict): + fs = ReferenceFileSystem(rfs).get_mapper(root="") + return LindiH5pyFile.from_zarr_store(fs) + else: + raise Exception(f"Unhandled type for rfs: {type(rfs)}") + + @staticmethod + def from_zarr_store(zarr_store: Union[ZarrStore, FSMap]): + """ + Create a LindiH5pyFile from a zarr store. + """ + zarr_group = zarr.open(store=zarr_store, mode="r") + assert isinstance(zarr_group, zarr.Group) + return LindiH5pyFile.from_zarr_group(zarr_group) + + @staticmethod + def from_zarr_group(zarr_group: zarr.Group): + """ + Create a LindiH5pyFile from a zarr group. + """ + return LindiH5pyFile(zarr_group) + + @staticmethod + def from_h5py_file(h5py_file: h5py.File): + """ + Create a LindiH5pyFile from an h5py file. + """ + return LindiH5pyFile(h5py_file) + + @property + def attrs(self): # type: ignore + if isinstance(self._file_object, h5py.File): + attrs_type = 'h5py' + elif isinstance(self._file_object, zarr.Group): + attrs_type = 'zarr' + else: + raise Exception(f'Unexpected file object type: {type(self._file_object)}') + return LindiH5pyAttributes(self._file_object.attrs, attrs_type=attrs_type) + + @property + def filename(self): + # This is not a string, but this is what h5py seems to do + if isinstance(self._file_object, h5py.File): + return self._file_object.filename + elif isinstance(self._file_object, zarr.Group): + return '' + else: + raise Exception(f"Unhandled type for file object: {type(self._file_object)}") + + @property + def driver(self): + raise Exception("Getting driver is not allowed") + + # @property + # def mode(self): + # if isinstance(self._file_object, h5py.File): + # return self._file_object.mode + # elif isinstance(self._file_object, zarr.Group): + # # hard-coded to read-only + # return "r" + # else: + # raise Exception(f"Unhandled type: {type(self._file_object)}") + + @property + def libver(self): + raise Exception("Getting libver is not allowed") + + @property + def userblock_size(self): + raise Exception("Getting userblock_size is not allowed") + + @property + def meta_block_size(self): + raise Exception("Getting meta_block_size is not allowed") + + def swmr_mode(self, value): # type: ignore + raise Exception("Getting swmr_mode is not allowed") + + def close(self): + # return self._file_object.close() + pass + + def flush(self): + if isinstance(self._file_object, h5py.File): + return self._file_object.flush() + + def __enter__(self): # type: ignore + return self + + def __exit__(self, *args): + self.close() + + def __repr__(self): + return f'' + + # Group methods + def __getitem__(self, name): # type: ignore + return self._get_item(name) + + def _get_item(self, name, getlink=False, default=None): + if isinstance(name, LindiH5pyReference) and isinstance(self._file_object, zarr.Group): + if getlink: + raise Exception("Getting link is not allowed for references") + zarr_group = self._file_object + if name._source != '.': + raise Exception(f'For now, source of reference must be ".", got 
"{name._source}"') + if name._source_object_id is not None: + if name._source_object_id != zarr_group.attrs.get("object_id"): + raise Exception(f'Mismatch in source object_id: "{name._source_object_id}" and "{zarr_group.attrs.get("object_id")}"') + target = self[name._path] + if name._object_id is not None: + if name._object_id != target.attrs.get("object_id"): + raise Exception(f'Mismatch in object_id: "{name._object_id}" and "{target.attrs.get("object_id")}"') + return target + elif isinstance(name, h5py.Reference) and isinstance(self._file_object, h5py.File): + if getlink: + raise Exception("Getting link is not allowed for references") + x = self._file_object[name] + if isinstance(x, h5py.Group): + return LindiH5pyGroup(x, self) + elif isinstance(x, h5py.Dataset): + return LindiH5pyDataset(x, self) + else: + raise Exception(f"Unexpected type for resolved reference at path {name}: {type(x)}") + # if it contains slashes, it's a path + if isinstance(name, str) and "/" in name: + parts = name.split("/") + x = self._the_group + for i, part in enumerate(parts): + if i == len(parts) - 1: + assert isinstance(x, LindiH5pyGroup) + x = x.get(part, default=default, getlink=getlink) + else: + assert isinstance(x, LindiH5pyGroup) + x = x.get(part) + return x + return self._the_group.get(name, default=default, getlink=getlink) + + def get(self, name, default=None, getclass=False, getlink=False): + if getclass: + raise Exception("Getting class is not allowed") + return self._get_item(name, getlink=getlink, default=default) + + def __iter__(self): + return self._the_group.__iter__() + + def __reversed__(self): + return self._the_group.__reversed__() + + def __contains__(self, name): + return self._the_group.__contains__(name) + + @property + def id(self): + return self._the_group.id + + @property + def file(self): + return self._the_group.file + + @property + def name(self): + return self._the_group.name + + +def _download_file(url: str, filename: str) -> None: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req) as response: + with open(filename, "wb") as f: + f.write(response.read()) diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py new file mode 100644 index 0000000..48b3d90 --- /dev/null +++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py @@ -0,0 +1,139 @@ +from typing import TYPE_CHECKING, Union +import h5py +import zarr + +from .LindiH5pyDataset import LindiH5pyDataset +from .LindiH5pyLink import LindiH5pyHardLink, LindiH5pySoftLink +from .LindiH5pyAttributes import LindiH5pyAttributes + + +if TYPE_CHECKING: + from .LindiH5pyFile import LindiH5pyFile # pragma: no cover + + +class LindiH5pyGroupId: + def __init__(self, _h5py_group_id): + self._h5py_group_id = _h5py_group_id + + +class LindiH5pyGroup(h5py.Group): + def __init__(self, _group_object: Union[h5py.Group, zarr.Group], _file: "LindiH5pyFile"): + self._group_object = _group_object + self._file = _file + + def __getitem__(self, name): + if isinstance(self._group_object, h5py.Group): + if isinstance(name, (bytes, str)): + x = self._group_object[name] + else: + raise TypeError( + "Accessing a group is done with bytes or str, " + "not {}".format(type(name)) + ) + if isinstance(x, h5py.Group): + return LindiH5pyGroup(x, self._file) + elif isinstance(x, h5py.Dataset): + return LindiH5pyDataset(x, self._file) + else: + raise 
Exception(f"Unknown type: {type(x)}") + elif isinstance(self._group_object, zarr.Group): + if isinstance(name, (bytes, str)): + x = self._group_object[name] + else: + raise TypeError( + "Accessing a group is done with bytes or str, " + "not {}".format(type(name)) + ) + if isinstance(x, zarr.Group): + # follow the link if this is a soft link + soft_link = x.attrs.get('_SOFT_LINK', None) + if soft_link is not None: + link_path = soft_link['path'] + target_grp = self._file.get(link_path) + if not isinstance(target_grp, LindiH5pyGroup): + raise Exception( + f"Expected a group at {link_path} but got {type(x)}" + ) + return target_grp + return LindiH5pyGroup(x, self._file) + elif isinstance(x, zarr.Array): + return LindiH5pyDataset(x, self._file) + else: + raise Exception(f"Unknown type: {type(x)}") + else: + raise Exception(f"Unhandled type: {type(self._group_object)}") + + def get(self, name, default=None, getclass=False, getlink=False): + if not (getclass or getlink): + try: + return self[name] + except KeyError: + return default + + if name not in self: + return default + elif getclass and not getlink: + raise Exception("Getting class is not allowed") + elif getlink and not getclass: + if isinstance(self._group_object, h5py.Group): + x = self._group_object.get(name, default=default, getlink=True) + if isinstance(x, h5py.HardLink): + return LindiH5pyHardLink() + elif isinstance(x, h5py.SoftLink): + return LindiH5pySoftLink(x.path) + else: + raise Exception( + f"Unhandled type for get with getlink at {self.name} {name}: {type(x)}" + ) + elif isinstance(self._group_object, zarr.Group): + x = self._group_object.get(name, default=default) + if x is None: + return default + soft_link = x.attrs.get('_SOFT_LINK', None) + if isinstance(x, zarr.Group) and soft_link is not None: + return LindiH5pySoftLink(soft_link['path']) + else: + return LindiH5pyHardLink() + else: + raise Exception(f"Unhandled type: {type(self._group_object)}") + else: + raise Exception("Impossible") + + @property + def name(self): + return self._group_object.name + + def keys(self): # type: ignore + return self._group_object.keys() + + def __iter__(self): + return self._group_object.__iter__() + + def __reversed__(self): + raise Exception("Not implemented: __reversed__") + + def __contains__(self, name): + return self._group_object.__contains__(name) + + @property + def id(self): + if isinstance(self._group_object, h5py.Group): + return LindiH5pyGroupId(self._group_object.id) + elif isinstance(self._group_object, zarr.Group): + return LindiH5pyGroupId(None) + else: + raise Exception(f'Unexpected group object type: {type(self._group_object)}') + + @property + def file(self): + return self._file + + @property + def attrs(self): # type: ignore + if isinstance(self._group_object, h5py.Group): + attrs_type = 'h5py' + elif isinstance(self._group_object, zarr.Group): + attrs_type = 'zarr' + else: + raise Exception(f'Unexpected group object type: {type(self._group_object)}') + return LindiH5pyAttributes(self._group_object.attrs, attrs_type=attrs_type) diff --git a/lindi/LindiH5pyFile/LindiH5pyLink.py b/lindi/LindiH5pyFile/LindiH5pyLink.py new file mode 100644 index 0000000..f50576f --- /dev/null +++ b/lindi/LindiH5pyFile/LindiH5pyLink.py @@ -0,0 +1,15 @@ +import h5py + + +class LindiH5pyHardLink(h5py.HardLink): + def __init__(self): + pass + + +class LindiH5pySoftLink(h5py.SoftLink): + def __init__(self, path: str): + self._path = path + + @property + def path(self): + return self._path diff --git 
a/lindi/LindiH5pyFile/LindiH5pyReference.py b/lindi/LindiH5pyFile/LindiH5pyReference.py new file mode 100644 index 0000000..e1bda22 --- /dev/null +++ b/lindi/LindiH5pyFile/LindiH5pyReference.py @@ -0,0 +1,15 @@ +import h5py + + +class LindiH5pyReference(h5py.h5r.Reference): + def __init__(self, obj: dict): + self._object_id = obj["object_id"] + self._path = obj["path"] + self._source = obj["source"] + self._source_object_id = obj["source_object_id"] + + def __repr__(self): + return f"LindiH5pyReference({self._object_id}, {self._path})" + + def __str__(self): + return f"LindiH5pyReference({self._object_id}, {self._path})" diff --git a/lindi/LindiH5pyFile/__init__.py b/lindi/LindiH5pyFile/__init__.py new file mode 100644 index 0000000..4d95cea --- /dev/null +++ b/lindi/LindiH5pyFile/__init__.py @@ -0,0 +1,12 @@ +from .LindiH5pyFile import LindiH5pyFile +from .LindiH5pyDataset import LindiH5pyDataset +from .LindiH5pyGroup import LindiH5pyGroup +from .LindiH5pyLink import LindiH5pySoftLink, LindiH5pyHardLink + +__all__ = [ + "LindiH5pyFile", + "LindiH5pyDataset", + "LindiH5pyGroup", + "LindiH5pySoftLink", + "LindiH5pyHardLink", +] diff --git a/lindi/__init__.py b/lindi/__init__.py index 7d82117..3c9754a 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -1,2 +1,2 @@ -from .LindiClient import LindiClient, LindiGroup, LindiDataset, LindiAttributes, LindiReference # noqa: F401 -from .LindiH5Store import LindiH5Store, LindiH5StoreOpts # noqa: F401 +from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401 +from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 diff --git a/pyproject.toml b/pyproject.toml index 6ec534d..0bde7d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,10 +12,15 @@ zarr = "^2.16.1" h5py = "^3.10.0" remfile = "^0.1.9" fsspec = "^2023.12.2" +aiohttp = "^3.9.3" +requests = "^2.31.0" -[tool.poetry.dev-dependencies] +[tool.poetry.group.dev.dependencies] +pynwb = "^2.6.0" pytest = "^7.4.4" pytest-cov = "^4.1.0" +ruff = "^0.3.3" +pyright = "1.1.335" [build-system] requires = ["poetry-core"] diff --git a/tests/test_core.py b/tests/test_core.py index ca4fb86..614c6de 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,41 +1,209 @@ +import pytest import numpy as np import h5py import tempfile -from lindi import LindiH5Store, LindiClient, LindiDataset, LindiGroup -import pytest +import lindi +from lindi import LindiH5ZarrStore -def test_scalar_datasets(): - for val in ["abc", b"abc", 1, 3.6]: - print(f"Testing scalar {val} of type {type(val)}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - ds = f.create_dataset("X", data=val) - ds.attrs["foo"] = "bar" - with LindiH5Store.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) - h5f = h5py.File(filename, "r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - if not _check_equal(X1[()], X2[()]): - print(f"WARNING: {X1} ({type(X1)}) != {X2} ({type(X2)})") - raise ValueError("Scalar datasets are not equal") - assert '.zgroup' in store - assert '.zarray' not in rfs['refs'] - assert '.zarray' not in store - assert '.zattrs' in store # it's in the store but not in the ref file system -- see notes in LindiH5Store source code - assert '.zattrs' 
not in rfs['refs'] - assert 'X/.zgroup' not in store - assert 'X/.zattrs' in store # foo is set to bar - assert store['X/.zattrs'] == rfs['refs']['X/.zattrs'].encode() - assert 'X/.zarray' in rfs['refs'] - assert store['X/.zarray'] == rfs['refs']['X/.zarray'].encode() +def test_variety(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("dataset1", data=[1, 2, 3]) + f.create_group("group1") + f.attrs["int1"] = 1 + f.attrs["float1"] = 3.14 + f.attrs["str1"] = "abc" + f.attrs["bytes1"] = b"def" + f.attrs["list1"] = [1, 2, 3] + f.attrs["tuple1"] = (3, 4, 5) + f.attrs["array1"] = np.arange(10) + f.attrs["dataset1_ref"] = f["dataset1"].ref + f.attrs["group1_ref"] = f["group1"].ref + f["dataset1"].attrs["test_attr1"] = "attribute-of-dataset1" + f["group1"].attrs["test_attr2"] = "attribute-of-group1" + h5f = h5py.File(filename, "r") + h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f) + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs) + for h5f_2 in [h5f_rfs, h5f_wrapped]: + assert h5f_2.attrs["int1"] == h5f.attrs["int1"] + assert h5f_2.attrs["float1"] == h5f.attrs["float1"] + assert h5f_2.attrs["str1"] == h5f.attrs["str1"] + assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"] + assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"]) + assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"]) + assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"]) + assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"] # type: ignore + assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()]) # type: ignore + assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"] # type: ignore + target_1 = h5f[h5f.attrs["dataset1_ref"]] + target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]] + assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"] # type: ignore + target_1 = h5f[h5f.attrs["group1_ref"]] + target_2 = h5f_2[h5f_2.attrs["group1_ref"]] + assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"] # type: ignore + + +def test_soft_links(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + g = f.create_group('group_target') + g.attrs['foo'] = 'bar' + g.create_dataset('dataset1', data=[5, 6, 7]) + f['soft_link'] = h5py.SoftLink('/group_target') + h5f = h5py.File(filename, "r") + h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f) + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs) + for h5f_2 in [h5f_rfs, h5f_wrapped]: + g1 = h5f['group_target'] + assert isinstance(g1, h5py.Group) + g2 = h5f_2['group_target'] + assert isinstance(g2, h5py.Group) + assert g1.attrs['foo'] == g2.attrs['foo'] # type: ignore + with pytest.raises(TypeError): + g1[np.array([0, 1, 2])] + h1 = h5f['soft_link'] + assert isinstance(h1, h5py.Group) + h2 = h5f_2['soft_link'] + assert isinstance(h2, h5py.Group) + assert h1.attrs['foo'] == h2.attrs['foo'] # type: ignore + # this is tricky: it seems that with h5py, the name of the soft link + # is the source name. So the following assertion will fail. 
+ # assert h1.name == h2.name + k1 = h5f.get('soft_link', getlink=True) + k2 = h5f_2.get('soft_link', getlink=True) + assert isinstance(k1, h5py.SoftLink) + assert isinstance(k2, h5py.SoftLink) + ds1 = h5f['soft_link']['dataset1'] # type: ignore + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['soft_link']['dataset1'] # type: ignore + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + ds1 = h5f['soft_link/dataset1'] + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['soft_link/dataset1'] + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + ds1 = h5f['group_target/dataset1'] + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['group_target/dataset1'] + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + + +def test_arrays_of_compound_dtype(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + dt = np.dtype([("x", "i4"), ("y", "f8")]) + f.create_dataset("dataset1", data=[(1, 3.14), (2, 6.28)], dtype=dt) + dt = np.dtype([("a", "i4"), ("b", "f8"), ("c", "S10")]) + f.create_dataset("dataset2", data=[(1, 3.14, "abc"), (2, 6.28, "def")], dtype=dt) + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + ds1_1 = h5f['dataset1'] + assert isinstance(ds1_1, h5py.Dataset) + ds1_2 = h5f_2['dataset1'] + assert isinstance(ds1_2, h5py.Dataset) + assert ds1_1.dtype == ds1_2.dtype + assert _arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore + assert _arrays_are_equal(ds1_1['y'][()], ds1_2['y'][()]) # type: ignore + ds2_1 = h5f['dataset2'] + assert isinstance(ds2_1, h5py.Dataset) + ds2_2 = h5f_2['dataset2'] + assert isinstance(ds2_2, h5py.Dataset) + assert ds2_1.dtype == ds2_2.dtype + assert _arrays_are_equal(ds2_1['a'][()], ds2_2['a'][()]) # type: ignore + assert _arrays_are_equal(ds2_1['b'][()], ds2_2['b'][()]) # type: ignore + assert _arrays_are_equal(ds2_1['c'][()], ds2_2['c'][()]) # type: ignore + + +def test_arrays_of_compound_dtype_with_references(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + dt = np.dtype([("x", "i4"), ("y", h5py.special_dtype(ref=h5py.Reference))]) + Y_ds = f.create_dataset("Y", data=[1, 2, 3]) + f.create_dataset("dataset1", data=[(1, Y_ds.ref), (2, Y_ds.ref)], dtype=dt) + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + ds1_1 = h5f['dataset1'] + assert isinstance(ds1_1, h5py.Dataset) + ds1_2 = h5f_2['dataset1'] + assert isinstance(ds1_2, h5py.Dataset) + assert ds1_1.dtype == ds1_2.dtype + assert _arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore + ref1 = ds1_1['y'][0] + ref2 = ds1_2['y'][0] + assert isinstance(ref1, h5py.Reference) + assert isinstance(ref2, h5py.Reference) + target1 = h5f[ref1] + assert isinstance(target1, h5py.Dataset) + target2 = h5f_2[ref2] + assert isinstance(target2, h5py.Dataset) + assert _arrays_are_equal(target1[()], target2[()]) + + +def test_scalar_arrays(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("X", data=1) + f.create_dataset("Y", data=3.14) + f.create_dataset("Z", data="abc") + 
f.create_dataset("W", data=b"def") + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f['X'] + assert isinstance(X1, h5py.Dataset) + X2 = h5f_2['X'] + assert isinstance(X2, h5py.Dataset) + assert X1[()] == X2[()] + Y1 = h5f['Y'] + assert isinstance(Y1, h5py.Dataset) + Y2 = h5f_2['Y'] + assert isinstance(Y2, h5py.Dataset) + assert Y1[()] == Y2[()] + Z1 = h5f['Z'] + assert isinstance(Z1, h5py.Dataset) + Z2 = h5f_2['Z'] + assert isinstance(Z2, h5py.Dataset) + # Note that encode is needed because Z1[()] is a bytes + assert Z1[()] == Z2[()].encode() # type: ignore + W1 = h5f['W'] + assert isinstance(W1, h5py.Dataset) + W2 = h5f_2['W'] + assert isinstance(W2, h5py.Dataset) + # Note that encode is needed because W2[()] is a str + assert W1[()] == W2[()].encode() # type: ignore + + +def test_arrays_of_strings(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("X", data=["abc", "def", "ghi"]) + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f['X'] + assert isinstance(X1, h5py.Dataset) + X2 = h5f_2['X'] + assert isinstance(X2, h5py.Dataset) + assert _lists_are_equal(X1[:].tolist(), [x.encode() for x in X2[:]]) # type: ignore def test_numpy_arrays(): @@ -48,16 +216,16 @@ def test_numpy_arrays(): filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: f.create_dataset("X", data=array, chunks=chunks) - with LindiH5Store.from_file( + with LindiH5ZarrStore.from_file( filename, url=filename ) as store: # set url so that a reference file system can be created rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) + client = lindi.LindiH5pyFile.from_reference_file_system(rfs) h5f = h5py.File(filename, "r") X1 = h5f["X"] assert isinstance(X1, h5py.Dataset) X2 = client["X"] - assert isinstance(X2, LindiDataset) + assert isinstance(X2, lindi.LindiH5pyDataset) assert X1.shape == X2.shape assert X1.dtype == X2.dtype @@ -65,182 +233,90 @@ def test_numpy_arrays(): assert X1.nbytes == X2.nbytes assert len(X1) == len(X2) - # iterate over the first axis - count = 0 - for aa in X2: - assert _check_equal(aa[:], X1[count][:]) - count += 1 - - if not _check_equal(X1[:], X2[:]): - print("WARNING. 
Arrays are not equal") - print(X1[:]) - print(X2[:]) - raise ValueError("Arrays are not equal") - -def test_numpy_array_of_strings(): - print("Testing numpy array of strings") +def test_nan_inf_attributes(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - f.create_dataset("X", data=["abc", "def", "ghi"]) + f.create_dataset("X", data=[1, 2, 3]) + f["X"].attrs["nan"] = np.nan + f["X"].attrs["inf"] = np.inf + f["X"].attrs["ninf"] = -np.inf + f["X"].attrs['float_list'] = [np.nan, np.inf, -np.inf, 23] h5f = h5py.File(filename, "r") - with LindiH5Store.from_file(filename, url=filename) as store: + with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) + client = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f["X"] assert isinstance(X1, h5py.Dataset) X2 = client["X"] - assert isinstance(X2, LindiDataset) - if not _check_equal(X1[:], X2[:]): - print("WARNING. Arrays are not equal") - print(X1[:]) - print(X2[:]) - raise ValueError("Arrays are not equal") + assert isinstance(X2, lindi.LindiH5pyDataset) + nanval = X1.attrs["nan"] + assert isinstance(nanval, float) and np.isnan(nanval) + assert X1.attrs["inf"] == np.inf + assert X1.attrs["ninf"] == -np.inf + assert _lists_are_equal(X1.attrs['float_list'], [np.nan, np.inf, -np.inf, 23]) -def test_attributes(): - print("Testing attributes") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["foo"] = "bar" - f["X"].attrs["baz"] = 3.14 - f["X"].attrs["qux"] = [1, 2, 3] - f["X"].attrs["corge"] = np.int32(5) - f.create_group("group") - f["group"].attrs["foo"] = "bar2" - f["group"].attrs["baz"] = 3.15 - h5f = h5py.File(filename, "r") - with LindiH5Store.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) + nanval = X2.attrs["nan"] + assert isinstance(nanval, float) and np.isnan(nanval) + assert X2.attrs["inf"] == np.inf + assert X2.attrs["ninf"] == -np.inf + assert _lists_are_equal(X2.attrs['float_list'], [np.nan, np.inf, -np.inf, 23]) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - - with pytest.raises(KeyError): - X2.attrs["a"] = 1 # cannot set attributes on read-only object - with pytest.raises(KeyError): - X2.attrs["b"] # non-existent attribute - with pytest.raises(KeyError): - del X2.attrs["foo"] # cannot delete attributes on read-only object - - for k, v in X2.attrs.items(): - if not _check_equal(v, X1.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {X1.attrs[k]} ({type(X1.attrs[k])})") - print(f" zarr: {v} ({type(v)})") - raise ValueError("Attribute mismatch") - for k, v in X1.attrs.items(): - if not _check_equal(v, X2.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {v} ({type(v)})") - print(f" zarr: {X2.attrs[k]} ({type(X2.attrs[k])})") - raise ValueError("Attribute mismatch") - for k in X2.attrs: - assert k in X1.attrs - assert len(X2.attrs) == len(X1.attrs) - assert str(X2.attrs) # for coverage - assert repr(X2.attrs) # for coverage - - group1 = h5f["group"] - assert isinstance(group1, h5py.Group) - group2 = client["group"] - assert isinstance(group2, LindiGroup) - - for k, v in group2.attrs.items(): - if not _check_equal(v, group1.attrs[k]): - 
print(f"WARNING: {k} attribute mismatch") - print(f" h5: {group1.attrs[k]} ({type(group1.attrs[k])})") - print(f" zarr: {v} ({type(v)})") - raise ValueError("Attribute mismatch") - for k, v in group1.attrs.items(): - if not _check_equal(v, group2.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {v} ({type(v)})") - print(f" zarr: {group2.attrs[k]} ({type(group2.attrs[k])})") - raise ValueError("Attribute mismatch") - - -def test_nan_inf_attr(): - print("Testing NaN, Inf, and -Inf attributes") + for test_string in ["NaN", "Infinity", "-Infinity", "Not-illegal"]: + filename = f"{tmpdir}/illegal_string.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("X", data=[1, 2, 3]) + f["X"].attrs["test_string"] = test_string + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + if test_string in ["NaN", "Infinity", "-Infinity"]: + with pytest.raises(Exception): + rfs = store.to_reference_file_system() + else: + rfs = store.to_reference_file_system() + client = lindi.LindiH5pyFile.from_reference_file_system(rfs) + assert client["X"].attrs["test_string"] == test_string # type: ignore + + +def test_reference_file_system_to_file(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["nan"] = np.nan - f["X"].attrs["inf"] = np.inf - f["X"].attrs["ninf"] = -np.inf - h5f = h5py.File(filename, "r") - with LindiH5Store.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - client = LindiClient.from_reference_file_system(rfs) + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs_fname = f'{tmpdir}/test.zarr.json' + store.to_file(rfs_fname) + client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname) + X = client["X"] + assert isinstance(X, lindi.LindiH5pyDataset) + assert _lists_are_equal(X[()], [1, 2, 3]) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiDataset) - - assert X2.attrs["nan"] == 'NaN' - assert X2.attrs["inf"] == 'Infinity' - assert X2.attrs["ninf"] == '-Infinity' - - -def _check_equal(a, b): - # allow comparison of bytes and strings - if isinstance(a, str): - a = a.encode() - if isinstance(b, str): - b = b.encode() - - # allow comparison of numpy scalars with python scalars - if np.issubdtype(type(a), np.floating): - a = float(a) - if np.issubdtype(type(b), np.floating): - b = float(b) - if np.issubdtype(type(a), np.integer): - a = int(a) - if np.issubdtype(type(b), np.integer): - b = int(b) - - # allow comparison of numpy arrays to python lists - if isinstance(a, list): - a = np.array(a) - if isinstance(b, list): - b = np.array(b) - - if type(a) != type(b): # noqa: E721 + +def _lists_are_equal(a, b): + if len(a) != len(b): return False + for aa, bb in zip(a, b): + if aa != bb: + if np.isnan(aa) and np.isnan(bb): + # nan != nan, but we want to consider them equal + continue + return False + return True - if isinstance(a, np.ndarray): - assert isinstance(b, np.ndarray) - return _check_arrays_equal(a, b) - - # test for NaNs (we need to use np.isnan because NaN != NaN in python) - if isinstance(a, float) and isinstance(b, float): - if np.isnan(a) and np.isnan(b): - return True - - return a == b - - -def _check_arrays_equal(a: np.ndarray, b: np.ndarray): - # If it's an array of strings, we convert to an array of bytes - if a.dtype == object: - # need to modify all the entries - a = np.array([x.encode() if type(x) is str else x for x 
in a.ravel()]).reshape( - a.shape - ) - if b.dtype == object: - b = np.array([x.encode() if type(x) is str else x for x in b.ravel()]).reshape( - b.shape - ) + +def _arrays_are_equal(a, b): + if a.shape != b.shape: + return False + if a.dtype != b.dtype: + return False # if this is numeric data we need to use allclose so that we can handle NaNs if np.issubdtype(a.dtype, np.number): return np.allclose(a, b, equal_nan=True) else: return np.array_equal(a, b) + + +if __name__ == '__main__': + test_scalar_arrays() diff --git a/tests/test_external_array_link.py b/tests/test_external_array_link.py new file mode 100644 index 0000000..57e689b --- /dev/null +++ b/tests/test_external_array_link.py @@ -0,0 +1,27 @@ +import tempfile +import numpy as np +import h5py +import lindi + + +def test_external_array_link(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + X = np.random.randn(50, 12) + with h5py.File(filename, "w") as f: + f.create_dataset("dataset1", data=X, chunks=(10, 6)) + with lindi.LindiH5ZarrStore.from_file( + filename, + url=filename, + opts=lindi.LindiH5ZarrStoreOpts( + num_dataset_chunks_threshold=4 + ) + ) as store: + rfs = store.to_reference_file_system() + client = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X2 = client["dataset1"][:] # type: ignore + assert np.array_equal(X, X2) + + +if __name__ == "__main__": + test_external_array_link() diff --git a/tests/test_pynwb.py b/tests/test_pynwb.py new file mode 100644 index 0000000..c08e3c9 --- /dev/null +++ b/tests/test_pynwb.py @@ -0,0 +1,114 @@ +from typing import Any +import tempfile +import lindi + + +def test_pynwb(): + from datetime import datetime + from uuid import uuid4 + + import numpy as np + from dateutil.tz import tzlocal + + from pynwb import NWBHDF5IO, NWBFile + from pynwb.ecephys import LFP, ElectricalSeries + + nwbfile: Any = NWBFile( + session_description="my first synthetic recording", + identifier=str(uuid4()), + session_start_time=datetime.now(tzlocal()), + experimenter=[ + "Baggins, Bilbo", + ], + lab="Bag End Laboratory", + institution="University of Middle Earth at the Shire", + experiment_description="I went on an adventure to reclaim vast treasures.", + session_id="LONELYMTN001", + ) + + device = nwbfile.create_device( + name="array", description="the best array", manufacturer="Probe Company 9000" + ) + + nwbfile.add_electrode_column(name="label", description="label of electrode") + + nshanks = 4 + nchannels_per_shank = 3 + electrode_counter = 0 + + for ishank in range(nshanks): + # create an electrode group for this shank + electrode_group = nwbfile.create_electrode_group( + name="shank{}".format(ishank), + description="electrode group for shank {}".format(ishank), + device=device, + location="brain area", + ) + # add electrodes to the electrode table + for ielec in range(nchannels_per_shank): + nwbfile.add_electrode( + group=electrode_group, + label="shank{}elec{}".format(ishank, ielec), + location="brain area", + ) + electrode_counter += 1 + + all_table_region = nwbfile.create_electrode_table_region( + region=list(range(electrode_counter)), # reference row indices 0 to N-1 + description="all electrodes", + ) + + raw_data = np.random.randn(50, 12) + raw_electrical_series = ElectricalSeries( + name="ElectricalSeries", + data=raw_data, + electrodes=all_table_region, + starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time + rate=20000.0, # in Hz + ) + + nwbfile.add_acquisition(raw_electrical_series) + + lfp_data = 
np.random.randn(50, 12) + lfp_electrical_series = ElectricalSeries( + name="ElectricalSeries", + data=lfp_data, + electrodes=all_table_region, + starting_time=0.0, + rate=200.0, + ) + + lfp = LFP(electrical_series=lfp_electrical_series) + + ecephys_module = nwbfile.create_processing_module( + name="ecephys", description="processed extracellular electrophysiology data" + ) + ecephys_module.add(lfp) + + nwbfile.add_unit_column(name="quality", description="sorting quality") + + firing_rate = 20 + n_units = 10 + res = 1000 + duration = 20 + for n_units_per_shank in range(n_units): + spike_times = ( + np.where(np.random.rand((res * duration)) < (firing_rate / res))[0] / res + ) + nwbfile.add_unit(spike_times=spike_times, quality="good") + + with tempfile.TemporaryDirectory() as tmpdir: + nwb_fname = f"{tmpdir}/ecephys_tutorial.nwb" + with NWBHDF5IO(path=nwb_fname, mode="w") as io: + io.write(nwbfile) # type: ignore + # h5f = h5py.File(nwb_fname, "r") + with lindi.LindiH5ZarrStore.from_file(nwb_fname, url=nwb_fname) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + with NWBHDF5IO(file=h5f_2, mode="r") as io: + nwbfile_2 = io.read() + print(nwbfile_2) + + +if __name__ == "__main__": + test_pynwb() diff --git a/tests/test_remote_data.py b/tests/test_remote_data.py new file mode 100644 index 0000000..27af554 --- /dev/null +++ b/tests/test_remote_data.py @@ -0,0 +1,45 @@ +import json +import pytest +import lindi + + +@pytest.mark.network +def test_remote_data_1(): + import pynwb + + # Define the URL for a remote NWB file + h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" + + # Create a read-only Zarr store as a wrapper for the h5 file + store = lindi.LindiH5ZarrStore.from_file(h5_url) + + # Generate a reference file system + rfs = store.to_reference_file_system() + + # Save it to a file for later use + with open("example.zarr.json", "w") as f: + json.dump(rfs, f, indent=2) + + # Create an h5py-like client from the reference file system + client = lindi.LindiH5pyFile.from_reference_file_system(rfs) + + # Open using pynwb + with pynwb.NWBHDF5IO(file=client, mode="r") as io: + nwbfile = io.read() + print(nwbfile) + + +@pytest.mark.network +def test_remote_data_2(): + import pynwb + + # Define the URL for a remote .zarr.json file + url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json' + + # Load the h5py-like client from the reference file system + client = lindi.LindiH5pyFile.from_reference_file_system(url) + + # Open using pynwb + with pynwb.NWBHDF5IO(file=client, mode="r") as io: + nwbfile = io.read() + print(nwbfile) diff --git a/tests/test_store.py b/tests/test_store.py new file mode 100644 index 0000000..e7688f5 --- /dev/null +++ b/tests/test_store.py @@ -0,0 +1,58 @@ +import h5py +import tempfile +import lindi + + +def test_store(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("dataset1", data=[1, 2, 3]) + group1 = f.create_group("group1") + group1.create_group("group2") + group1.create_dataset("dataset2", data=[4, 5, 6]) + with lindi.LindiH5ZarrStore.from_file(filename, url=filename) as store: + store.to_file(f"{tmpdir}/test.zarr.json") # for coverage + a = store.listdir('') + assert _lists_are_equal(a, ['dataset1', 'group1'], ordered=False) + b = store.listdir('group1') + assert _lists_are_equal(b, ['group2', 'dataset2'], 
ordered=False) + c = store.listdir('group1/group2') + assert _lists_are_equal(c, [], ordered=False) + assert '.zattrs' in store + assert '.zgroup' in store + assert 'dataset1' not in store + assert 'dataset1/.zattrs' in store + assert 'dataset1/.zarray' in store + assert 'dataset1/.zgroup' not in store + assert 'dataset1/0' in store + assert 'group1' not in store + assert 'group1/.zattrs' in store + assert 'group1/.zgroup' in store + assert 'group1/.zarray' not in store + assert 'group1/group2' not in store + assert 'group1/group2/.zattrs' in store + assert 'group1/group2/.zgroup' in store + assert 'group1/group2/.zarray' not in store + assert 'group1/dataset2' not in store + assert 'group1/dataset2/.zattrs' in store + assert 'group1/dataset2/.zarray' in store + assert 'group1/dataset2/.zgroup' not in store + assert 'group1/dataset2/0' in store + client = lindi.LindiH5pyFile.from_zarr_store(store) + X = client["dataset1"][:] # type: ignore + assert _lists_are_equal(X, [1, 2, 3], ordered=True) + Y = client["group1/dataset2"][:] # type: ignore + assert _lists_are_equal(Y, [4, 5, 6], ordered=True) + + +def _lists_are_equal(a, b, ordered: bool): + if ordered: + if len(a) != len(b): + return False + for i in range(len(a)): + if a[i] != b[i]: + return False + return True + else: + return set(a) == set(b) diff --git a/tests/test_with_real_data.py b/tests/test_with_real_data.py deleted file mode 100644 index ff68502..0000000 --- a/tests/test_with_real_data.py +++ /dev/null @@ -1,302 +0,0 @@ -import tempfile -import numpy as np -import zarr -import h5py -import remfile -from lindi import LindiH5Store, LindiClient -import lindi -import pytest - -examples = [] - -# example 0 -# This one seems to load properly -# https://neurosift.app/?p=/nwb&dandisetId=000717&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/ -examples.append( - { - "h5_url": "https://api.dandiarchive.org/api/assets/3d12a902-139a-4c1a-8fd0-0a7faf2fb223/download/", - } -) - -# example 1 -# https://neurosift.app/?p=/nwb&dandisetId=000776&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/54895119-f739-4544-973e-a9341a5c66ad/download/ -# Exception: Not yet implemented (3): dataset /processing/CalciumActivity/CalciumSeriesSegmentation/Aligned_neuron_coordinates/voxel_mask with -# dtype [('x', '