From a8d894ba9b46573993b4fd9ebd9e39696489e588 Mon Sep 17 00:00:00 2001
From: Jeremy Magland <jmagland@flatironinstitute.org>
Date: Sat, 16 Mar 2024 08:26:07 -0400
Subject: [PATCH 1/2] exclude zattrs when empty

---
 lindi/LindiH5Store/LindiH5Store.py | 32 +++++++++++++++++++++++-------
 lindi/LindiH5Store/_util.py        |  2 +-
 tests/test_core.py                 | 13 +++++++++++-
 tests/test_with_real_data.py       |  2 +-
 4 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/lindi/LindiH5Store/LindiH5Store.py b/lindi/LindiH5Store/LindiH5Store.py
index ba58b08..7a4d7e7 100644
--- a/lindi/LindiH5Store/LindiH5Store.py
+++ b/lindi/LindiH5Store/LindiH5Store.py
@@ -54,10 +54,9 @@ def __init__(
         self._opts = _opts
 
         # Some datasets do not correspond to traditional chunked datasets. For
-        # those datasets, we need to store the inline data so that we can
-        # return it when the chunk is requested. We store the inline data in a
-        # dictionary with the dataset name as the key. The values are of type
-        # bytes.
+        # those datasets, we need to store the inline data so that we can return
+        # it when the chunk is requested. We store the inline data in a
+        # dictionary with the dataset name as the key. The values are the bytes.
         self._inline_data_for_arrays: Dict[str, bytes] = {}
 
         self._external_array_links: Dict[str, Union[dict, None]] = {}
@@ -114,7 +113,9 @@ def __getitem__(self, key):
         key_name = parts[-1]
         key_parent = "/".join(parts[:-1])
         if key_name == ".zattrs":
-            # Get the attributes of a group or dataset
+            # Get the attributes of a group or dataset. We return this even if
+            # it is empty, but we exclude it when writing out the reference file
+            # system.
             return self._get_zattrs_bytes(key_parent)
         elif key_name == ".zgroup":
             # Get the .zgroup JSON text for a group
@@ -138,16 +139,27 @@ def __contains__(self, key):
         key_parent = "/".join(parts[:-1])
         if key_name == ".zattrs":
             h5_item = _get_h5_item(self._h5f, key_parent)
-            return isinstance(h5_item, h5py.Group)
+            if not h5_item:
+                return False
+            # We always return True here even if the attributes are going to be
+            # empty, because it's not worth including the logic. But when we
+            # write out the ref file system, we exclude it there.
+            return isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset)
         elif key_name == ".zgroup":
             h5_item = _get_h5_item(self._h5f, key_parent)
+            if not h5_item:
+                return False
             return isinstance(h5_item, h5py.Group)
         elif key_name == ".zarray":
             h5_item = _get_h5_item(self._h5f, key_parent)
+            if not h5_item:
+                return False
             return isinstance(h5_item, h5py.Dataset)
         else:
             # a chunk file
             h5_item = _get_h5_item(self._h5f, key_parent)
+            if not h5_item:
+                return False
             if not isinstance(h5_item, h5py.Dataset):
                 return False
             external_array_link = self._get_external_array_link(key_parent, h5_item)
@@ -182,6 +194,9 @@ def _get_zattrs_bytes(self, parent_key: str):
         if self._h5f is None:
             raise Exception("Store is closed")
         h5_item = _get_h5_item(self._h5f, parent_key)
+        assert isinstance(h5_item, h5py.Group) or isinstance(h5_item, h5py.Dataset), (
+            f"Item {parent_key} is not a group or dataset in _get_zattrs_bytes"
+        )
         # We create a dummy zarr group and copy the attributes to it. That way
         # we know that zarr has accepted them and they are serialized in the
         # correct format.
@@ -411,7 +426,9 @@ def _add_ref(key: str, content: Union[bytes, None]):
 
         def _process_group(key, item: h5py.Group):
             if isinstance(item, h5py.Group):
-                _add_ref(_join(key, ".zattrs"), self.get(_join(key, ".zattrs")))
+                zattrs_bytes = self.get(_join(key, ".zattrs"))
+                if zattrs_bytes != b"{}":  # don't include empty zattrs
+                    _add_ref(_join(key, ".zattrs"), self.get(_join(key, ".zattrs")))
                 _add_ref(_join(key, ".zgroup"), self.get(_join(key, ".zgroup")))
                 for k in item.keys():
                     subitem = item[k]
@@ -485,6 +502,7 @@ def _get_chunk_names_for_dataset(chunk_coords_shape: List[int]) -> List[str]:
 
 
 def _reformat_json(x: Union[bytes, None]) -> Union[bytes, None]:
+    """Reformat to not include whitespace and to encode NaN, Inf, and -Inf as strings."""
     if x is None:
         return None
     a = json.loads(x.decode("utf-8"))
diff --git a/lindi/LindiH5Store/_util.py b/lindi/LindiH5Store/_util.py
index ab8d3cf..ece2544 100644
--- a/lindi/LindiH5Store/_util.py
+++ b/lindi/LindiH5Store/_util.py
@@ -5,7 +5,7 @@
 
 def _get_h5_item(h5f: h5py.File, key: str):
     """Get an item from the h5 file, given its key."""
-    return h5f['/' + key]
+    return h5f.get('/' + key, None)
 
 
 def _read_bytes(file: IO, offset: int, count: int):
diff --git a/tests/test_core.py b/tests/test_core.py
index ad83cd1..40bfa5a 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -11,7 +11,8 @@ def test_scalar_datasets():
         with tempfile.TemporaryDirectory() as tmpdir:
             filename = f"{tmpdir}/test.h5"
             with h5py.File(filename, "w") as f:
-                f.create_dataset("X", data=val)
+                ds = f.create_dataset("X", data=val)
+                ds.attrs["foo"] = "bar"
             with LindiH5Store.from_file(
                 filename, url=filename
             ) as store:  # set url so that a reference file system can be created
@@ -25,6 +26,16 @@ def test_scalar_datasets():
                 if not _check_equal(X1[()], X2[()]):
                     print(f"WARNING: {X1} ({type(X1)}) != {X2} ({type(X2)})")
                     raise ValueError("Scalar datasets are not equal")
+                assert '.zgroup' in store
+                assert '.zarray' not in rfs['refs']
+                assert '.zarray' not in store
+                assert '.zattrs' in store  # it's in the store but not in the ref file system -- see notes in LindiH5Store source code
+                assert '.zattrs' not in rfs['refs']
+                assert 'X/.zgroup' not in store
+                assert 'X/.zattrs' in store  # foo is set to bar
+                assert store['X/.zattrs'] == rfs['refs']['X/.zattrs'].encode()
+                assert 'X/.zarray' in rfs['refs']
+                assert store['X/.zarray'] == rfs['refs']['X/.zarray'].encode()
 
 
 def test_numpy_arrays():
diff --git a/tests/test_with_real_data.py b/tests/test_with_real_data.py
index a3d88a5..ff68502 100644
--- a/tests/test_with_real_data.py
+++ b/tests/test_with_real_data.py
@@ -298,5 +298,5 @@ def test_with_real_data():
         for k in top_level_keys:
             assert k in top_level_keys_2
 
-        root = zarr.open(store)
+        root = zarr.open(store, mode="r")
         _hdf5_visit_items(h5f, lambda key, item: _compare_item_2(item, root[key]))

From e977cee050d1ec3fb5dcc98c0957eab831abc1b6 Mon Sep 17 00:00:00 2001
From: Jeremy Magland <magland@users.noreply.github.com>
Date: Sat, 16 Mar 2024 21:24:50 -0400
Subject: [PATCH 2/2] Update lindi/LindiH5Store/LindiH5Store.py

Co-authored-by: Ryan Ly <rly@lbl.gov>
---
 lindi/LindiH5Store/LindiH5Store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lindi/LindiH5Store/LindiH5Store.py b/lindi/LindiH5Store/LindiH5Store.py
index 7a4d7e7..4d519f5 100644
--- a/lindi/LindiH5Store/LindiH5Store.py
+++ b/lindi/LindiH5Store/LindiH5Store.py
@@ -113,8 +113,8 @@ def __getitem__(self, key):
         key_name = parts[-1]
         key_parent = "/".join(parts[:-1])
         if key_name == ".zattrs":
-            # Get the attributes of a group or dataset. We return this even if
-            # it is empty, but we exclude it when writing out the reference file
+            # Get the attributes of a group or dataset. If it is empty, we still
+            # return it, but we exclude it when writing out the reference file
             # system.
             return self._get_zattrs_bytes(key_parent)
         elif key_name == ".zgroup":