From 015552bf0df6c701fcee1f0293266289b01922e7 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 4 Apr 2024 11:53:47 -0400 Subject: [PATCH 1/9] implement group/dataset id as a string --- examples/example1.py | 6 ++++++ lindi/LindiH5pyFile/LindiH5pyDataset.py | 14 +++++-------- lindi/LindiH5pyFile/LindiH5pyFile.py | 6 +++++- lindi/LindiH5pyFile/LindiH5pyGroup.py | 28 ++++++++++++------------- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/examples/example1.py b/examples/example1.py index f5c1b6b..cb5fe6f 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -22,3 +22,9 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io: nwbfile = io.read() print(nwbfile) + + print('Electrode group at shank0:') + print(nwbfile.electrode_groups["shank0"]) # type: ignore + + print('Electrode group at index 0:') + print(nwbfile.electrodes.group[0]) # type: ignore diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py index 598279a..83c540a 100644 --- a/lindi/LindiH5pyFile/LindiH5pyDataset.py +++ b/lindi/LindiH5pyFile/LindiH5pyDataset.py @@ -14,11 +14,6 @@ from .LindiH5pyFile import LindiH5pyFile # pragma: no cover -class LindiH5pyDatasetId: - def __init__(self, _h5py_dataset_id): - self._h5py_dataset_id = _h5py_dataset_id - - # This is a global list of external hdf5 clients, which are used by # possibly multiple LindiH5pyFile objects. The key is the URL of the # external hdf5 file, and the value is the h5py.File object. @@ -32,6 +27,9 @@ def __init__(self, _dataset_object: Union[h5py.Dataset, zarr.Array], _file: "Lin self._file = _file self._readonly = _file.mode not in ['r+'] + # see comment in LindiH5pyGroup + self._id = f'{id(self._file)}/{self._dataset_object.name}' + # See if we have the _COMPOUND_DTYPE attribute, which signifies that # this is a compound dtype if isinstance(_dataset_object, zarr.Array): @@ -74,10 +72,8 @@ def __init__(self, _dataset_object: Union[h5py.Dataset, zarr.Array], _file: "Lin @property def id(self): - if isinstance(self._dataset_object, h5py.Dataset): - return LindiH5pyDatasetId(self._dataset_object.id) - else: - return LindiH5pyDatasetId(None) + # see comment in LindiH5pyGroup + return self._id @property def shape(self): # type: ignore diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index d25f2fb..da72de0 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -25,6 +25,9 @@ def __init__(self, _file_object: Union[h5py.File, zarr.Group], *, _zarr_store: U self._mode: Literal['r', 'r+'] = _mode self._the_group = LindiH5pyGroup(_file_object, self) + # see comment in LindiH5pyGroup + self._id = f'{id(self._file_object)}/' + @staticmethod def from_reference_file_system(rfs: Union[dict, str], mode: Literal["r", "r+"] = "r"): """ @@ -298,7 +301,8 @@ def __contains__(self, name): @property def id(self): - return self._the_group.id + # see comment in LindiH5pyGroup + return self._id @property def file(self): diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py index 1fbe8be..fda4dcb 100644 --- a/lindi/LindiH5pyFile/LindiH5pyGroup.py +++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py @@ -11,17 +11,23 @@ from .LindiH5pyFile import LindiH5pyFile # pragma: no cover -class LindiH5pyGroupId: - def __init__(self, _h5py_group_id): - self._h5py_group_id = _h5py_group_id - - class LindiH5pyGroup(h5py.Group): def __init__(self, _group_object: Union[h5py.Group, zarr.Group], _file: "LindiH5pyFile"): self._group_object = _group_object self._file = _file self._readonly = _file.mode not in ['r+'] + # In h5py, the id property is an object that exposes low-level + # operations specific to the HDF5 library. LINDI aims to override the + # high-level methods such that the low-level operations on id are not + # needed. However, sometimes packages (e.g., pynwb) use the id as a + # unique identifier for purposes of caching. Therefore, we make the id + # to be a string that is unique for each object. If any of the low-level + # operations are attempted on this id string, then an exception will be + # raised, which will usually indicate that one of the high-level methods + # should be overridden. + self._id = f'{id(self._file)}/{self._group_object.name}' + # The self._write object handles all the writing operations from .writers.LindiH5pyGroupWriter import LindiH5pyGroupWriter # avoid circular import if self._readonly: @@ -131,16 +137,8 @@ def __repr__(self): @property def id(self): - if isinstance(self._group_object, h5py.Group): - return LindiH5pyGroupId(self._group_object.id) - elif isinstance(self._group_object, zarr.Group): - # This is commented out for now because pynwb gets the id of a group - # in at least one place. But that could be avoided in the future, at - # which time, we could uncomment this. - # print('WARNING: Accessing low-level id of LindiH5pyGroup. This should be avoided.') - return LindiH5pyGroupId('') - else: - raise Exception(f'Unexpected group object type: {type(self._group_object)}') + # see comment above + return self._id @property def file(self): From 7dc223a364dac7b5608b7969fe9111f49ef608cb Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 4 Apr 2024 14:04:38 -0400 Subject: [PATCH 2/9] float type checking --- lindi/conversion/attr_conversion.py | 2 +- lindi/conversion/nan_inf_ninf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lindi/conversion/attr_conversion.py b/lindi/conversion/attr_conversion.py index 15416fa..b6335b3 100644 --- a/lindi/conversion/attr_conversion.py +++ b/lindi/conversion/attr_conversion.py @@ -23,7 +23,7 @@ def h5_to_zarr_attr(attr: Any, *, label: str = '', h5f: Union[h5py.File, None]): raise Exception(f"Unexpected h5 attribute: None at {label}") elif type(attr) in [int, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64]: return int(attr) - elif type(attr) in [float, np.float16, np.float32, np.float64]: + elif isinstance(attr, (float, np.floating)): return encode_nan_inf_ninf(float(attr)) elif isinstance(attr, complex) or (isinstance(attr, np.ndarray) and np.issubdtype(attr.dtype, np.complexfloating)): raise Exception(f"Complex number is not supported at {label}") diff --git a/lindi/conversion/nan_inf_ninf.py b/lindi/conversion/nan_inf_ninf.py index 3ab012b..cc7595d 100644 --- a/lindi/conversion/nan_inf_ninf.py +++ b/lindi/conversion/nan_inf_ninf.py @@ -21,7 +21,7 @@ def encode_nan_inf_ninf(val): return [encode_nan_inf_ninf(v) for v in val] elif isinstance(val, dict): return {k: encode_nan_inf_ninf(v) for k, v in val.items()} - elif type(val) in [float, np.float16, np.float32, np.float64]: + elif isinstance(val, (float, np.floating)): if np.isnan(val): return 'NaN' elif val == float('inf'): From 2e023540380118092336f9544eadb9331b965958 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 4 Apr 2024 14:08:28 -0400 Subject: [PATCH 3/9] complex type check --- lindi/conversion/attr_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lindi/conversion/attr_conversion.py b/lindi/conversion/attr_conversion.py index b6335b3..a3f0a19 100644 --- a/lindi/conversion/attr_conversion.py +++ b/lindi/conversion/attr_conversion.py @@ -25,7 +25,7 @@ def h5_to_zarr_attr(attr: Any, *, label: str = '', h5f: Union[h5py.File, None]): return int(attr) elif isinstance(attr, (float, np.floating)): return encode_nan_inf_ninf(float(attr)) - elif isinstance(attr, complex) or (isinstance(attr, np.ndarray) and np.issubdtype(attr.dtype, np.complexfloating)): + elif isinstance(attr, (complex, np.complexfloating)): raise Exception(f"Complex number is not supported at {label}") elif type(attr) in [bool, np.bool_]: return bool(attr) From fc1421f30965c8ef251362aaec652091a2a6c456 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 4 Apr 2024 14:55:54 -0400 Subject: [PATCH 4/9] incr version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ae41614..a6db70d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "lindi" -version = "0.1.1" +version = "0.2.0" description = "" authors = [] readme = "README.md" From 7fbc03edfabae056245c073454d68f8b8e2b4613 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Fri, 5 Apr 2024 07:37:16 -0400 Subject: [PATCH 5/9] add authors to pyproject.toml --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a6db70d..53d88c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,11 @@ name = "lindi" version = "0.2.0" description = "" -authors = [] +authors = [ + { name = "Jeremy Magland", email = "jmagland@flatironinstitute.org" }, + { name = "Ryan Ly", email = "rly@lbl.gov" }, + { name = "Oliver Ruebel", email = "oruebel@lbl.gov" } +] readme = "README.md" [tool.poetry.dependencies] From 8f360f9a18dacb48cc471bb8bfea544bb0bc88d5 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Fri, 5 Apr 2024 07:49:12 -0400 Subject: [PATCH 6/9] fix pyproject.toml formatting --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 53d88c3..88d37fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,9 @@ name = "lindi" version = "0.2.0" description = "" authors = [ - { name = "Jeremy Magland", email = "jmagland@flatironinstitute.org" }, - { name = "Ryan Ly", email = "rly@lbl.gov" }, - { name = "Oliver Ruebel", email = "oruebel@lbl.gov" } + "Jeremy Magland ", + "Ryan Ly ", + "Oliver Ruebel " ] readme = "README.md" From d1352a8678099d69afb75f4a887a591ed33062ec Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Tue, 9 Apr 2024 16:20:50 -0400 Subject: [PATCH 7/9] update integration_tests workflow --- .github/workflows/integration_tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index bf85664..00f1071 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -1,6 +1,9 @@ name: integration tests -on: [push, pull_request] +on: + - push + - pull_request + - workflow_dispatch: # Allows manual triggering of the workflow jobs: build: From 9ff8f8d5c828310f5ee59e0bb9e0cef090cc1260 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Tue, 9 Apr 2024 16:22:46 -0400 Subject: [PATCH 8/9] update workflow --- .github/workflows/integration_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 00f1071..c130856 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -1,9 +1,9 @@ name: integration tests on: - - push - - pull_request - - workflow_dispatch: # Allows manual triggering of the workflow + push: + pull_request: + workflow_dispatch: jobs: build: From 96a896b6d089ffdd97fe0101f18d107b56bce33d Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Tue, 9 Apr 2024 16:41:29 -0400 Subject: [PATCH 9/9] fix tests: raise KeyError --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 6 ++++-- tests/test_store.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index d1d1a20..0ca71eb 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -268,7 +268,8 @@ def _get_zgroup_bytes(self, parent_key: str): raise Exception("Store is closed") h5_item = self._h5f.get('/' + parent_key, None) if not isinstance(h5_item, h5py.Group): - raise Exception(f"Item {parent_key} is not a group") + # Important to raise a KeyError here because that's what zarr expects + raise KeyError(f"Item {parent_key} is not a group") # We create a dummy zarr group and then get the .zgroup JSON text # from it. memory_store = MemoryStore() @@ -287,7 +288,8 @@ def _get_zarray_bytes(self, parent_key: str): raise Exception("Store is closed") h5_item = self._h5f.get('/' + parent_key, None) if not isinstance(h5_item, h5py.Dataset): - raise Exception(f"Item {parent_key} is not a dataset") + # Important to raise a KeyError here because that's what zarr expects + raise KeyError(f"Item {parent_key} is not a dataset") # get the shape, chunks, dtype, and filters from the h5 dataset inline_array = self._get_inline_array(parent_key, h5_item) if inline_array.is_inline: diff --git a/tests/test_store.py b/tests/test_store.py index aff4667..c2ed53b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -49,3 +49,7 @@ def test_store(): def _lists_are_equal_as_sets(a, b): return set(a) == set(b) + + +if __name__ == "__main__": + test_store()