Reapply HDF reader fillvalue (#460)
* Reapply "Align HDF reader CF _FillValue with Zarr v3 semantics (#420)" (#457)

This reverts commit dbd3566.

* Reapply HDF fill value changes

Use context managers in tests to ensure resources are released.
chuckwondo authored Mar 3, 2025
1 parent 19caf2f commit 1cdc2f0
Showing 4 changed files with 214 additions and 48 deletions.
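The behavioral change is easiest to see from the caller's side: under Zarr v3 semantics a CF-style _FillValue belongs in a variable's encoding rather than its attrs. A minimal sketch of what that looks like after this commit, assuming a hypothetical file "example.nc" containing a variable named "data" with a _FillValue attribute (names are illustrative, not part of the commit):

import virtualizarr
from virtualizarr.readers.hdf import HDFVirtualBackend

# Hypothetical input file with a CF-style _FillValue attribute on "data".
vds = virtualizarr.open_virtual_dataset("example.nc", backend=HDFVirtualBackend)
var = vds["data"]

# The fill value is no longer exposed as a plain attribute ...
assert "_FillValue" not in var.attrs
# ... it is carried as encoding, matching xarray's Zarr v3 handling.
print(var.encoding.get("_FillValue"))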
60 changes: 49 additions & 11 deletions virtualizarr/readers/hdf/hdf.py
@@ -8,11 +8,13 @@
List,
Mapping,
Optional,
Tuple,
Union,
)

import numpy as np
import xarray as xr
from xarray.backends.zarr import FillValueCoder

from virtualizarr.manifests import (
ChunkEntry,
@@ -40,6 +42,20 @@
H5Dataset: Any = None
H5Group: Any = None

FillValueType = Union[
int,
float,
bool,
complex,
str,
np.integer,
np.floating,
np.bool_,
np.complexfloating,
bytes, # For fixed-length string storage
Tuple[bytes, int], # Structured type
]


class HDFVirtualBackend(VirtualBackend):
@staticmethod
@@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_cf_fill_value(
h5obj: Union[H5Dataset, H5Group],
) -> Optional[FillValueType]:
"""
Convert the _FillValue attribute from an HDF5 group or dataset into
encoding.
Parameters
----------
h5obj : h5py.Group or h5py.Dataset
An h5py group or dataset.
"""
fillvalue = None
for n, v in h5obj.attrs.items():
if n == "_FillValue":
if isinstance(v, np.ndarray) and v.size == 1:
fillvalue = v.item()
else:
fillvalue = v
fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype) # type: ignore[arg-type]
return fillvalue

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
"""
@@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
for n, v in h5obj.attrs.items():
if n in _HIDDEN_ATTRS:
continue
if n == "_FillValue":
continue
# Fix some attribute values to avoid JSON encoding exceptions...
if isinstance(v, bytes):
v = v.decode("utf-8") or " "
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
if v.dtype.kind == "S":
v = v.astype(str)
if n == "_FillValue":
continue
elif v.size == 1:
v = v.flatten()[0]
if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
v = ""
if v == "DIMENSION_SCALE":
continue

attrs[n] = v
return attrs

@@ -290,21 +328,19 @@ def _dataset_to_variable(
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = HDFVirtualBackend._extract_attrs(dataset)
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
attrs.pop("_FillValue", None)

if cfcodec:
codecs.insert(0, cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
else:
dtype = dataset.dtype
fill_value = dataset.fillvalue
if isinstance(fill_value, np.ndarray):
fill_value = fill_value[0]
if np.isnan(fill_value):
fill_value = float("nan")
if isinstance(fill_value, np.generic):
fill_value = fill_value.item()

fill_value = dataset.fillvalue.item()

filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks, # type: ignore
Expand All @@ -323,6 +359,8 @@ def _dataset_to_variable(
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
else:
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
if cf_fill_value is not None:
variable.encoding["_FillValue"] = cf_fill_value
return variable

@staticmethod
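For reference, the new _extract_cf_fill_value helper delegates the conversion itself to xarray's FillValueCoder. A minimal sketch of that step with an illustrative value and dtype (the exact encoded representation depends on the dtype and the xarray version):

import numpy as np
from xarray.backends.zarr import FillValueCoder

# CF _FillValue attributes are often stored as 1-element arrays; reduce to a
# scalar first, as the helper above does.
raw = np.array([-9999.0])
scalar = raw.item() if isinstance(raw, np.ndarray) and raw.size == 1 else raw

# Encode the scalar for the variable's dtype so it is safe to place in
# variable.encoding["_FillValue"].
encoded = FillValueCoder.encode(scalar, np.dtype("float64"))
print(encoded)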
61 changes: 61 additions & 0 deletions virtualizarr/tests/test_readers/conftest.py
@@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
ds = ds.drop_dims("dim3")
ds.to_netcdf(filepath, engine="netcdf4")
return filepath


@pytest.fixture
def scalar_fill_value_hdf5_file(tmpdir):
filepath = f"{tmpdir}/scalar_fill_value.nc"
f = h5py.File(filepath, "w")
data = np.random.randint(0, 10, size=(5))
fill_value = 42
f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
return filepath


compound_dtype = np.dtype(
[
("id", "i4"), # 4-byte integer
("temperature", "f4"), # 4-byte float
]
)

compound_data = np.array(
[
(1, 98.6),
(2, 101.3),
],
dtype=compound_dtype,
)

compound_fill = (-9999, -9999.0)

fill_values = [
{"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
{"fill_value": -9999.0, "data": np.random.random(5)},
{"fill_value": np.nan, "data": np.random.random(5)},
{"fill_value": False, "data": np.array([True, False, False, True, True])},
{"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
{"fill_value": compound_fill, "data": compound_data},
]


@pytest.fixture(params=fill_values)
def cf_fill_value_hdf5_file(tmpdir, request):
filepath = f"{tmpdir}/cf_fill_value.nc"
f = h5py.File(filepath, "w")
dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
dim_scale = f.create_dataset(
name="dim_scale", data=request.param["data"], chunks=True
)
dim_scale.make_scale()
dset.dims[0].attach_scale(dim_scale)
dset.attrs["_FillValue"] = request.param["fill_value"]
return filepath


@pytest.fixture
def cf_array_fill_value_hdf5_file(tmpdir):
filepath = f"{tmpdir}/cf_array_fill_value.nc"
f = h5py.File(filepath, "w")
data = np.random.random(5)
dset = f.create_dataset(name="data", data=data, chunks=True)
dset.attrs["_FillValue"] = np.array([np.nan])
return filepath
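
The cf_fill_value_hdf5_file fixture above also wires a second dataset up as an HDF5 dimension scale before setting the _FillValue attribute. A condensed, self-contained sketch of that pattern, using a hypothetical path and an explicit context manager in line with the commit message:

import h5py
import numpy as np

with h5py.File("cf_fill_value_demo.nc", "w") as f:  # hypothetical path
    dset = f.create_dataset("data", data=np.random.random(5), chunks=True)
    scale = f.create_dataset("dim_scale", data=np.random.random(5), chunks=True)
    scale.make_scale()                  # mark "dim_scale" as a dimension scale
    dset.dims[0].attach_scale(scale)    # attach it to dimension 0 of "data"
    dset.attrs["_FillValue"] = -9999.0  # CF-style fill value attribute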
31 changes: 31 additions & 0 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import h5py # type: ignore
import numpy as np
import pytest

from virtualizarr import open_virtual_dataset
@@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
)
assert var.attrs["attribute_name"] == "attribute_name"

def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
f = h5py.File(scalar_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
scalar_fill_value_hdf5_file, ds, group=""
)
assert var.data.zarray.fill_value == 42

def test_cf_fill_value(self, cf_fill_value_hdf5_file):
f = h5py.File(cf_fill_value_hdf5_file)
ds = f["data"]
if ds.dtype.kind in "S":
pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
if ds.dtype.names:
pytest.xfail(
"To fix, structured dtype fill value encoding for Zarr backend"
)
var = HDFVirtualBackend._dataset_to_variable(
cf_fill_value_hdf5_file, ds, group=""
)
assert "_FillValue" in var.encoding

def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
f = h5py.File(cf_array_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
cf_array_fill_value_hdf5_file, ds, group=""
)
assert not isinstance(var.encoding["_FillValue"], np.ndarray)


@requires_hdf5plugin
@requires_imagecodecs
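Taken together, these tests separate the two fill-value paths: the HDF5 dataset's own fillvalue ends up on the virtual chunk array's zarray, while a CF _FillValue attribute ends up in the xarray encoding. A minimal sketch of that contrast, reusing the private helper exercised above on a hypothetical file that carries both:

import h5py
from virtualizarr.readers.hdf import HDFVirtualBackend

path = "fill_values.nc"  # hypothetical file created like the fixtures above
with h5py.File(path) as f:
    var = HDFVirtualBackend._dataset_to_variable(path, f["data"], group="")
    print(var.data.zarray.fill_value)      # HDF5 dataset fillvalue -> zarray
    print(var.encoding.get("_FillValue"))  # CF _FillValue attribute -> encoding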
110 changes: 73 additions & 37 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -6,9 +6,11 @@
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
requires_icechunk,
requires_imagecodecs,
requires_kerchunk,
)
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk


@requires_kerchunk
@@ -19,47 +21,81 @@ class TestIntegration:
reason="0 time start is being interpreted as fillvalue see issues/280"
)
def test_filters_h5netcdf_roundtrip(
self, tmpdir, filter_encoded_roundtrip_hdf5_file
self, tmp_path, filter_encoded_roundtrip_hdf5_file
):
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
vds = virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
xrt.assert_allclose(ds, roundtrip)
with (
xr.open_dataset(
filter_encoded_roundtrip_hdf5_file, decode_times=True
) as ds,
virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(
kerchunk_file, engine="kerchunk", decode_times=True
)
xrt.assert_allclose(ds, roundtrip)

def test_filters_netcdf4_roundtrip(
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
self, tmp_path, filter_encoded_roundtrip_netcdf4_file
):
filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
ds = xr.open_dataset(filepath)
vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
with (
xr.open_dataset(filepath) as ds,
virtualizarr.open_virtual_dataset(
filepath, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
vds = virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file):
with (
xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds,
virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
assert (
ds["temperature"].encoding["_FillValue"]
== roundtrip["temperature"].encoding["_FillValue"]
)

def test_non_coord_dim(self, tmpdir, non_coord_dim):
ds = xr.open_dataset(non_coord_dim)
vds = virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim):
with (
xr.open_dataset(non_coord_dim) as ds,
virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

@requires_icechunk
def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file):
with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds:
if ds["data"].dtype in [float, object]:
pytest.xfail(
"TODO: fix handling fixed-length and structured type fill value"
" encoding in xarray zarr backend."
)
with virtualizarr.open_virtual_dataset(
cf_fill_value_hdf5_file, backend=HDFVirtualBackend
) as vds:
roundtrip = roundtrip_as_in_memory_icechunk(
vds, tmp_path, decode_times=False
)
xrt.assert_equal(ds, roundtrip)
