From a8d894ba9b46573993b4fd9ebd9e39696489e588 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Sat, 16 Mar 2024 08:26:07 -0400 Subject: [PATCH 1/2] exclude zattrs when empty --- lindi/LindiH5Store/LindiH5Store.py | 32 +++++++++++++++++++++++------- lindi/LindiH5Store/_util.py | 2 +- tests/test_core.py | 13 +++++++++++- tests/test_with_real_data.py | 2 +- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/lindi/LindiH5Store/LindiH5Store.py b/lindi/LindiH5Store/LindiH5Store.py index ba58b08..7a4d7e7 100644 --- a/lindi/LindiH5Store/LindiH5Store.py +++ b/lindi/LindiH5Store/LindiH5Store.py @@ -54,10 +54,9 @@ def __init__( self._opts = _opts # Some datasets do not correspond to traditional chunked datasets. For - # those datasets, we need to store the inline data so that we can - # return it when the chunk is requested. We store the inline data in a - # dictionary with the dataset name as the key. The values are of type - # bytes. + # those datasets, we need to store the inline data so that we can return + # it when the chunk is requested. We store the inline data in a + # dictionary with the dataset name as the key. The values are the bytes. self._inline_data_for_arrays: Dict[str, bytes] = {} self._external_array_links: Dict[str, Union[dict, None]] = {} @@ -114,7 +113,9 @@ def __getitem__(self, key): key_name = parts[-1] key_parent = "/".join(parts[:-1]) if key_name == ".zattrs": - # Get the attributes of a group or dataset + # Get the attributes of a group or dataset. We return this even if + # it is empty, but we exclude it when writing out the reference file + # system. return self._get_zattrs_bytes(key_parent) elif key_name == ".zgroup": # Get the .zgroup JSON text for a group @@ -138,16 +139,27 @@ def __contains__(self, key): key_parent = "/".join(parts[:-1]) if key_name == ".zattrs": h5_item = _get_h5_item(self._h5f, key_parent) - return isinstance(h5_item, h5py.Group) + if not h5_item: + return False + # We always return True here even if the attributes are going to be + # empty, because it's not worth including the logic. But when we + # write out the ref file system, we exclude it there. + return isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset) elif key_name == ".zgroup": h5_item = _get_h5_item(self._h5f, key_parent) + if not h5_item: + return False return isinstance(h5_item, h5py.Group) elif key_name == ".zarray": h5_item = _get_h5_item(self._h5f, key_parent) + if not h5_item: + return False return isinstance(h5_item, h5py.Dataset) else: # a chunk file h5_item = _get_h5_item(self._h5f, key_parent) + if not h5_item: + return False if not isinstance(h5_item, h5py.Dataset): return False external_array_link = self._get_external_array_link(key_parent, h5_item) @@ -182,6 +194,9 @@ def _get_zattrs_bytes(self, parent_key: str): if self._h5f is None: raise Exception("Store is closed") h5_item = _get_h5_item(self._h5f, parent_key) + assert isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset), ( + f"Item {parent_key} is not a group or dataset in _get_zattrs_bytes" + ) # We create a dummy zarr group and copy the attributes to it. That way # we know that zarr has accepted them and they are serialized in the # correct format. @@ -411,7 +426,9 @@ def _add_ref(key: str, content: Union[bytes, None]): def _process_group(key, item: h5py.Group): if isinstance(item, h5py.Group): - _add_ref(_join(key, ".zattrs"), self.get(_join(key, ".zattrs"))) + zattrs_bytes = self.get(_join(key, ".zattrs")) + if zattrs_bytes != b"{}": # don't include empty zattrs + _add_ref(_join(key, ".zattrs"), self.get(_join(key, ".zattrs"))) _add_ref(_join(key, ".zgroup"), self.get(_join(key, ".zgroup"))) for k in item.keys(): subitem = item[k] @@ -485,6 +502,7 @@ def _get_chunk_names_for_dataset(chunk_coords_shape: List[int]) -> List[str]: def _reformat_json(x: Union[bytes, None]) -> Union[bytes, None]: + """Reformat to not include whitespace and to encode NaN, Inf, and -Inf as strings.""" if x is None: return None a = json.loads(x.decode("utf-8")) diff --git a/lindi/LindiH5Store/_util.py b/lindi/LindiH5Store/_util.py index ab8d3cf..ece2544 100644 --- a/lindi/LindiH5Store/_util.py +++ b/lindi/LindiH5Store/_util.py @@ -5,7 +5,7 @@ def _get_h5_item(h5f: h5py.File, key: str): """Get an item from the h5 file, given its key.""" - return h5f['/' + key] + return h5f.get('/' + key, None) def _read_bytes(file: IO, offset: int, count: int): diff --git a/tests/test_core.py b/tests/test_core.py index ad83cd1..40bfa5a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,7 +11,8 @@ def test_scalar_datasets(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - f.create_dataset("X", data=val) + ds = f.create_dataset("X", data=val) + ds.attrs["foo"] = "bar" with LindiH5Store.from_file( filename, url=filename ) as store: # set url so that a reference file system can be created @@ -25,6 +26,16 @@ def test_scalar_datasets(): if not _check_equal(X1[()], X2[()]): print(f"WARNING: {X1} ({type(X1)}) != {X2} ({type(X2)})") raise ValueError("Scalar datasets are not equal") + assert '.zgroup' in store + assert '.zarray' not in rfs['refs'] + assert '.zarray' not in store + assert '.zattrs' in store # it's in the store but not in the ref file system -- see notes in LindiH5Store source code + assert '.zattrs' not in rfs['refs'] + assert 'X/.zgroup' not in store + assert 'X/.zattrs' in store # foo is set to bar + assert store['X/.zattrs'] == rfs['refs']['X/.zattrs'].encode() + assert 'X/.zarray' in rfs['refs'] + assert store['X/.zarray'] == rfs['refs']['X/.zarray'].encode() def test_numpy_arrays(): diff --git a/tests/test_with_real_data.py b/tests/test_with_real_data.py index a3d88a5..ff68502 100644 --- a/tests/test_with_real_data.py +++ b/tests/test_with_real_data.py @@ -298,5 +298,5 @@ def test_with_real_data(): for k in top_level_keys: assert k in top_level_keys_2 - root = zarr.open(store) + root = zarr.open(store, mode="r") _hdf5_visit_items(h5f, lambda key, item: _compare_item_2(item, root[key])) From e977cee050d1ec3fb5dcc98c0957eab831abc1b6 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Sat, 16 Mar 2024 21:24:50 -0400 Subject: [PATCH 2/2] Update lindi/LindiH5Store/LindiH5Store.py Co-authored-by: Ryan Ly --- lindi/LindiH5Store/LindiH5Store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lindi/LindiH5Store/LindiH5Store.py b/lindi/LindiH5Store/LindiH5Store.py index 7a4d7e7..4d519f5 100644 --- a/lindi/LindiH5Store/LindiH5Store.py +++ b/lindi/LindiH5Store/LindiH5Store.py @@ -113,8 +113,8 @@ def __getitem__(self, key): key_name = parts[-1] key_parent = "/".join(parts[:-1]) if key_name == ".zattrs": - # Get the attributes of a group or dataset. We return this even if - # it is empty, but we exclude it when writing out the reference file + # Get the attributes of a group or dataset. If it is empty, we still + # return it, but we exclude it when writing out the reference file # system. return self._get_zattrs_bytes(key_parent) elif key_name == ".zgroup":