From 1cdc2f099dcadf4f12595cd02437256aaa662580 Mon Sep 17 00:00:00 2001
From: Chuck Daniels
Date: Mon, 3 Mar 2025 08:26:15 -0500
Subject: [PATCH] Reapply HDF reader fillvalue (#460)

* Reapply "Align HDF reader CF _FillValue with Zarr v3 semantics (#420)" (#457)

This reverts commit dbd35667eb6b97f686833da281dce9eb635ebdf9.

* Reapply HDF fill value changes

Use context managers in tests to ensure resources are released.
---
 virtualizarr/readers/hdf/hdf.py               |  60 ++++++++--
 virtualizarr/tests/test_readers/conftest.py   |  61 ++++++++++
 .../tests/test_readers/test_hdf/test_hdf.py   |  31 +++++
 .../test_hdf/test_hdf_integration.py          | 110 ++++++++++++------
 4 files changed, 214 insertions(+), 48 deletions(-)

diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index 2b45cd9a..fc55c1bd 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -8,11 +8,13 @@
     List,
     Mapping,
     Optional,
+    Tuple,
     Union,
 )
 
 import numpy as np
 import xarray as xr
+from xarray.backends.zarr import FillValueCoder
 
 from virtualizarr.manifests import (
     ChunkEntry,
@@ -40,6 +42,20 @@
 H5Dataset: Any = None
 H5Group: Any = None
 
+FillValueType = Union[
+    int,
+    float,
+    bool,
+    complex,
+    str,
+    np.integer,
+    np.floating,
+    np.bool_,
+    np.complexfloating,
+    bytes,  # For fixed-length string storage
+    Tuple[bytes, int],  # Structured type
+]
+
 
 class HDFVirtualBackend(VirtualBackend):
     @staticmethod
@@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
 
         return [dim.removeprefix(group) for dim in dims]
 
+    @staticmethod
+    def _extract_cf_fill_value(
+        h5obj: Union[H5Dataset, H5Group],
+    ) -> Optional[FillValueType]:
+        """
+        Encode the CF _FillValue attribute of an HDF5 group or dataset for
+        inclusion in a variable's encoding.
+
+        Parameters
+        ----------
+        h5obj : h5py.Group or h5py.Dataset
+            An h5py group or dataset.
+        """
+        fillvalue = None
+        for n, v in h5obj.attrs.items():
+            if n == "_FillValue":
+                if isinstance(v, np.ndarray) and v.size == 1:
+                    fillvalue = v.item()
+                else:
+                    fillvalue = v
+                fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype)  # type: ignore[arg-type]
+        return fillvalue
+
     @staticmethod
     def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         """
@@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         for n, v in h5obj.attrs.items():
             if n in _HIDDEN_ATTRS:
                 continue
+            if n == "_FillValue":
+                continue
             # Fix some attribute values to avoid JSON encoding exceptions...
             if isinstance(v, bytes):
                 v = v.decode("utf-8") or " "
             elif isinstance(v, (np.ndarray, np.number, np.bool_)):
                 if v.dtype.kind == "S":
                     v = v.astype(str)
-                if n == "_FillValue":
-                    continue
                 elif v.size == 1:
                     v = v.flatten()[0]
             if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
                 v = ""
             if v == "DIMENSION_SCALE":
                 continue
-
             attrs[n] = v
         return attrs
 
@@ -290,21 +328,19 @@ def _dataset_to_variable(
         codecs = codecs_from_dataset(dataset)
         cfcodec = cfcodec_from_dataset(dataset)
         attrs = HDFVirtualBackend._extract_attrs(dataset)
+        cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
+        attrs.pop("_FillValue", None)
+
         if cfcodec:
             codecs.insert(0, cfcodec["codec"])
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
-            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
         else:
             dtype = dataset.dtype
-            fill_value = dataset.fillvalue
-            if isinstance(fill_value, np.ndarray):
-                fill_value = fill_value[0]
-            if np.isnan(fill_value):
-                fill_value = float("nan")
-            if isinstance(fill_value, np.generic):
-                fill_value = fill_value.item()
+
+        fill_value = dataset.fillvalue.item()
+
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,  # type: ignore
@@ -323,6 +359,8 @@ def _dataset_to_variable(
             variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
         else:
             variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
+        if cf_fill_value is not None:
+            variable.encoding["_FillValue"] = cf_fill_value
         return variable
 
     @staticmethod
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 8d0c1997..4884db4a 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
     ds = ds.drop_dims("dim3")
     ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
+
+
+@pytest.fixture
+def scalar_fill_value_hdf5_file(tmpdir):
+    filepath = f"{tmpdir}/scalar_fill_value.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.randint(0, 10, size=(5))
+    fill_value = 42
+    f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
+    return filepath
+
+
+compound_dtype = np.dtype(
+    [
+        ("id", "i4"),  # 4-byte integer
+        ("temperature", "f4"),  # 4-byte float
+    ]
+)
+
+compound_data = np.array(
+    [
+        (1, 98.6),
+        (2, 101.3),
+    ],
+    dtype=compound_dtype,
+)
+
+compound_fill = (-9999, -9999.0)
+
+fill_values = [
+    {"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
+    {"fill_value": -9999.0, "data": np.random.random(5)},
+    {"fill_value": np.nan, "data": np.random.random(5)},
+    {"fill_value": False, "data": np.array([True, False, False, True, True])},
+    {"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
+    {"fill_value": compound_fill, "data": compound_data},
+]
+
+
+@pytest.fixture(params=fill_values)
+def cf_fill_value_hdf5_file(tmpdir, request):
+    filepath = f"{tmpdir}/cf_fill_value.nc"
+    f = h5py.File(filepath, "w")
+    dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
+    dim_scale = f.create_dataset(
+        name="dim_scale", data=request.param["data"], chunks=True
+    )
+    dim_scale.make_scale()
+    dset.dims[0].attach_scale(dim_scale)
+    dset.attrs["_FillValue"] = request.param["fill_value"]
+    return filepath
+
+
+@pytest.fixture
+def cf_array_fill_value_hdf5_file(tmpdir):
+    filepath = f"{tmpdir}/cf_array_fill_value.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random(5)
+    dset = f.create_dataset(name="data", data=data, chunks=True)
+    dset.attrs["_FillValue"] = np.array([np.nan])
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
index 598848a5..d5146e88 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -1,6 +1,7 @@
 from unittest.mock import patch
 
 import h5py  # type: ignore
+import numpy as np
 import pytest
 
 from virtualizarr import open_virtual_dataset
@@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
         )
         assert var.attrs["attribute_name"] == "attribute_name"
 
+    def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
+        f = h5py.File(scalar_fill_value_hdf5_file)
+        ds = f["data"]
+        var = HDFVirtualBackend._dataset_to_variable(
+            scalar_fill_value_hdf5_file, ds, group=""
+        )
+        assert var.data.zarray.fill_value == 42
+
+    def test_cf_fill_value(self, cf_fill_value_hdf5_file):
+        f = h5py.File(cf_fill_value_hdf5_file)
+        ds = f["data"]
+        if ds.dtype.kind in "S":
+            pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
+        if ds.dtype.names:
+            pytest.xfail(
+                "TODO: fix structured dtype fill value encoding for the Zarr backend"
+            )
+        var = HDFVirtualBackend._dataset_to_variable(
+            cf_fill_value_hdf5_file, ds, group=""
+        )
+        assert "_FillValue" in var.encoding
+
+    def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
+        f = h5py.File(cf_array_fill_value_hdf5_file)
+        ds = f["data"]
+        var = HDFVirtualBackend._dataset_to_variable(
+            cf_array_fill_value_hdf5_file, ds, group=""
+        )
+        assert not isinstance(var.encoding["_FillValue"], np.ndarray)
+
 
 @requires_hdf5plugin
 @requires_imagecodecs
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
index 8583f210..e1407611 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -6,9 +6,11 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     requires_hdf5plugin,
+    requires_icechunk,
     requires_imagecodecs,
     requires_kerchunk,
 )
+from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
 
 
 @requires_kerchunk
@@ -19,47 +21,81 @@ class TestIntegration:
         reason="0 time start is being interpreted as fillvalue see issues/280"
     )
     def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_hdf5_file
+        self, tmp_path, filter_encoded_roundtrip_hdf5_file
     ):
-        ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
-        vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_roundtrip_hdf5_file,
-            loadable_variables=["time"],
-            cftime_variables=["time"],
-            backend=HDFVirtualBackend,
-        )
-        kerchunk_file = f"{tmpdir}/kerchunk.json"
-        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
-        xrt.assert_allclose(ds, roundtrip)
+        with (
+            xr.open_dataset(
+                filter_encoded_roundtrip_hdf5_file, decode_times=True
+            ) as ds,
+            virtualizarr.open_virtual_dataset(
+                filter_encoded_roundtrip_hdf5_file,
+                loadable_variables=["time"],
+                cftime_variables=["time"],
+                backend=HDFVirtualBackend,
+            ) as vds,
+        ):
+            kerchunk_file = str(tmp_path / "kerchunk.json")
+            vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+            roundtrip = xr.open_dataset(
+                kerchunk_file, engine="kerchunk", decode_times=True
engine="kerchunk", decode_times=True + ) + xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( - self, tmpdir, filter_encoded_roundtrip_netcdf4_file + self, tmp_path, filter_encoded_roundtrip_netcdf4_file ): filepath = filter_encoded_roundtrip_netcdf4_file["filepath"] - ds = xr.open_dataset(filepath) - vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend) - kerchunk_file = f"{tmpdir}/kerchunk.json" - vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_equal(ds, roundtrip) + with ( + xr.open_dataset(filepath) as ds, + virtualizarr.open_virtual_dataset( + filepath, backend=HDFVirtualBackend + ) as vds, + ): + kerchunk_file = str(tmp_path / "kerchunk.json") + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) - def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): - ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) - vds = virtualizarr.open_virtual_dataset( - filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend - ) - kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" - vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_allclose(ds, roundtrip) + def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file): + with ( + xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds, + virtualizarr.open_virtual_dataset( + filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend + ) as vds, + ): + kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json") + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_allclose(ds, roundtrip) + assert ( + ds["temperature"].encoding["_FillValue"] + == roundtrip["temperature"].encoding["_FillValue"] + ) - def test_non_coord_dim(self, tmpdir, non_coord_dim): - ds = xr.open_dataset(non_coord_dim) - vds = virtualizarr.open_virtual_dataset( - non_coord_dim, backend=HDFVirtualBackend - ) - kerchunk_file = f"{tmpdir}/kerchunk.json" - vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_equal(ds, roundtrip) + def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim): + with ( + xr.open_dataset(non_coord_dim) as ds, + virtualizarr.open_virtual_dataset( + non_coord_dim, backend=HDFVirtualBackend + ) as vds, + ): + kerchunk_file = str(tmp_path / "kerchunk.json") + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) + + @requires_icechunk + def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file): + with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds: + if ds["data"].dtype in [float, object]: + pytest.xfail( + "TODO: fix handling fixed-length and structured type fill value" + " encoding in xarray zarr backend." + ) + with virtualizarr.open_virtual_dataset( + cf_fill_value_hdf5_file, backend=HDFVirtualBackend + ) as vds: + roundtrip = roundtrip_as_in_memory_icechunk( + vds, tmp_path, decode_times=False + ) + xrt.assert_equal(ds, roundtrip)