Reapply HDF reader fillvalue (#460)
* Reapply "Align HDF reader CF _FillValue with Zarr v3 semantics (#420)" (#457)

This reverts commit dbd3566.

* Reapply HDF fill value changes

Use context managers in tests to ensure resources are released.
chuckwondo authored Mar 3, 2025
1 parent 19caf2f commit 1cdc2f0
Showing 4 changed files with 214 additions and 48 deletions.
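The behavioral change is easiest to see from the caller's side: under Zarr v3 semantics a CF-style _FillValue belongs in a variable's encoding rather than its attrs. A minimal sketch of what that looks like after this commit, assuming a hypothetical file "example.nc" containing a variable named "data" with a _FillValue attribute (names are illustrative, not part of the commit):

import virtualizarr
from virtualizarr.readers.hdf import HDFVirtualBackend

# Hypothetical input file with a CF-style _FillValue attribute on "data".
vds = virtualizarr.open_virtual_dataset("example.nc", backend=HDFVirtualBackend)
var = vds["data"]

# The fill value is no longer exposed as a plain attribute ...
assert "_FillValue" not in var.attrs
# ... it is carried as encoding, matching xarray's Zarr v3 handling.
print(var.encoding.get("_FillValue"))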
60 changes: 49 additions & 11 deletions virtualizarr/readers/hdf/hdf.py
@@ -8,11 +8,13 @@
List,
Mapping,
Optional,
Tuple,
Union,
)

import numpy as np
import xarray as xr
from xarray.backends.zarr import FillValueCoder

from virtualizarr.manifests import (
ChunkEntry,
@@ -40,6 +42,20 @@
H5Dataset: Any = None
H5Group: Any = None

FillValueType = Union[
int,
float,
bool,
complex,
str,
np.integer,
np.floating,
np.bool_,
np.complexfloating,
bytes, # For fixed-length string storage
Tuple[bytes, int], # Structured type
]


class HDFVirtualBackend(VirtualBackend):
@staticmethod
@@ -216,6 +232,29 @@ def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_cf_fill_value(
h5obj: Union[H5Dataset, H5Group],
) -> Optional[FillValueType]:
"""
Convert the _FillValue attribute from an HDF5 group or dataset into
encoding.
Parameters
----------
h5obj : h5py.Group or h5py.Dataset
An h5py group or dataset.
"""
fillvalue = None
for n, v in h5obj.attrs.items():
if n == "_FillValue":
if isinstance(v, np.ndarray) and v.size == 1:
fillvalue = v.item()
else:
fillvalue = v
fillvalue = FillValueCoder.encode(fillvalue, h5obj.dtype) # type: ignore[arg-type]
return fillvalue

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
"""
@@ -240,14 +279,14 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
for n, v in h5obj.attrs.items():
if n in _HIDDEN_ATTRS:
continue
if n == "_FillValue":
continue
# Fix some attribute values to avoid JSON encoding exceptions...
if isinstance(v, bytes):
v = v.decode("utf-8") or " "
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
if v.dtype.kind == "S":
v = v.astype(str)
if n == "_FillValue":
continue
elif v.size == 1:
v = v.flatten()[0]
if isinstance(v, (np.ndarray, np.number, np.bool_)):
@@ -258,7 +297,6 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
v = ""
if v == "DIMENSION_SCALE":
continue

attrs[n] = v
return attrs

@@ -290,21 +328,19 @@ def _dataset_to_variable(
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = HDFVirtualBackend._extract_attrs(dataset)
cf_fill_value = HDFVirtualBackend._extract_cf_fill_value(dataset)
attrs.pop("_FillValue", None)

if cfcodec:
codecs.insert(0, cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
else:
dtype = dataset.dtype
fill_value = dataset.fillvalue
if isinstance(fill_value, np.ndarray):
fill_value = fill_value[0]
if np.isnan(fill_value):
fill_value = float("nan")
if isinstance(fill_value, np.generic):
fill_value = fill_value.item()

fill_value = dataset.fillvalue.item()

filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks, # type: ignore
Expand All @@ -323,6 +359,8 @@ def _dataset_to_variable(
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
else:
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
if cf_fill_value is not None:
variable.encoding["_FillValue"] = cf_fill_value
return variable

@staticmethod
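For reference, the new _extract_cf_fill_value helper delegates the conversion itself to xarray's FillValueCoder. A minimal sketch of that step with an illustrative value and dtype (the exact encoded representation depends on the dtype and the xarray version):

import numpy as np
from xarray.backends.zarr import FillValueCoder

# CF _FillValue attributes are often stored as 1-element arrays; reduce to a
# scalar first, as the helper above does.
raw = np.array([-9999.0])
scalar = raw.item() if isinstance(raw, np.ndarray) and raw.size == 1 else raw

# Encode the scalar for the variable's dtype so it is safe to place in
# variable.encoding["_FillValue"].
encoded = FillValueCoder.encode(scalar, np.dtype("float64"))
print(encoded)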
61 changes: 61 additions & 0 deletions virtualizarr/tests/test_readers/conftest.py
@@ -342,3 +342,64 @@ def non_coord_dim(tmpdir):
ds = ds.drop_dims("dim3")
ds.to_netcdf(filepath, engine="netcdf4")
return filepath


@pytest.fixture
def scalar_fill_value_hdf5_file(tmpdir):
filepath = f"{tmpdir}/scalar_fill_value.nc"
f = h5py.File(filepath, "w")
data = np.random.randint(0, 10, size=(5))
fill_value = 42
f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value)
return filepath


compound_dtype = np.dtype(
[
("id", "i4"), # 4-byte integer
("temperature", "f4"), # 4-byte float
]
)

compound_data = np.array(
[
(1, 98.6),
(2, 101.3),
],
dtype=compound_dtype,
)

compound_fill = (-9999, -9999.0)

fill_values = [
{"fill_value": -9999, "data": np.random.randint(0, 10, size=(5))},
{"fill_value": -9999.0, "data": np.random.random(5)},
{"fill_value": np.nan, "data": np.random.random(5)},
{"fill_value": False, "data": np.array([True, False, False, True, True])},
{"fill_value": "NaN", "data": np.array(["three"], dtype="S10")},
{"fill_value": compound_fill, "data": compound_data},
]


@pytest.fixture(params=fill_values)
def cf_fill_value_hdf5_file(tmpdir, request):
filepath = f"{tmpdir}/cf_fill_value.nc"
f = h5py.File(filepath, "w")
dset = f.create_dataset(name="data", data=request.param["data"], chunks=True)
dim_scale = f.create_dataset(
name="dim_scale", data=request.param["data"], chunks=True
)
dim_scale.make_scale()
dset.dims[0].attach_scale(dim_scale)
dset.attrs["_FillValue"] = request.param["fill_value"]
return filepath


@pytest.fixture
def cf_array_fill_value_hdf5_file(tmpdir):
filepath = f"{tmpdir}/cf_array_fill_value.nc"
f = h5py.File(filepath, "w")
data = np.random.random(5)
dset = f.create_dataset(name="data", data=data, chunks=True)
dset.attrs["_FillValue"] = np.array([np.nan])
return filepath
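
The cf_fill_value_hdf5_file fixture above also wires a second dataset up as an HDF5 dimension scale before setting the _FillValue attribute. A condensed, self-contained sketch of that pattern, using a hypothetical path and an explicit context manager in line with the commit message:

import h5py
import numpy as np

with h5py.File("cf_fill_value_demo.nc", "w") as f:  # hypothetical path
    dset = f.create_dataset("data", data=np.random.random(5), chunks=True)
    scale = f.create_dataset("dim_scale", data=np.random.random(5), chunks=True)
    scale.make_scale()                  # mark "dim_scale" as a dimension scale
    dset.dims[0].attach_scale(scale)    # attach it to dimension 0 of "data"
    dset.attrs["_FillValue"] = -9999.0  # CF-style fill value attribute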
31 changes: 31 additions & 0 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import h5py # type: ignore
import numpy as np
import pytest

from virtualizarr import open_virtual_dataset
@@ -111,6 +112,36 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
)
assert var.attrs["attribute_name"] == "attribute_name"

def test_scalar_fill_value(self, scalar_fill_value_hdf5_file):
f = h5py.File(scalar_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
scalar_fill_value_hdf5_file, ds, group=""
)
assert var.data.zarray.fill_value == 42

def test_cf_fill_value(self, cf_fill_value_hdf5_file):
f = h5py.File(cf_fill_value_hdf5_file)
ds = f["data"]
if ds.dtype.kind in "S":
pytest.xfail("Investigate fixed-length binary encoding in Zarr v3")
if ds.dtype.names:
pytest.xfail(
"To fix, structured dtype fill value encoding for Zarr backend"
)
var = HDFVirtualBackend._dataset_to_variable(
cf_fill_value_hdf5_file, ds, group=""
)
assert "_FillValue" in var.encoding

def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file):
f = h5py.File(cf_array_fill_value_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
cf_array_fill_value_hdf5_file, ds, group=""
)
assert not isinstance(var.encoding["_FillValue"], np.ndarray)


@requires_hdf5plugin
@requires_imagecodecs
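Taken together, these tests separate the two fill-value paths: the HDF5 dataset's own fillvalue ends up on the virtual chunk array's zarray, while a CF _FillValue attribute ends up in the xarray encoding. A minimal sketch of that contrast, reusing the private helper exercised above on a hypothetical file that carries both:

import h5py
from virtualizarr.readers.hdf import HDFVirtualBackend

path = "fill_values.nc"  # hypothetical file created like the fixtures above
with h5py.File(path) as f:
    var = HDFVirtualBackend._dataset_to_variable(path, f["data"], group="")
    print(var.data.zarray.fill_value)      # HDF5 dataset fillvalue -> zarray
    print(var.encoding.get("_FillValue"))  # CF _FillValue attribute -> encoding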
110 changes: 73 additions & 37 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -6,9 +6,11 @@
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
requires_icechunk,
requires_imagecodecs,
requires_kerchunk,
)
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk


@requires_kerchunk
@@ -19,47 +21,81 @@ class TestIntegration:
reason="0 time start is being interpreted as fillvalue see issues/280"
)
def test_filters_h5netcdf_roundtrip(
self, tmpdir, filter_encoded_roundtrip_hdf5_file
self, tmp_path, filter_encoded_roundtrip_hdf5_file
):
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
vds = virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
xrt.assert_allclose(ds, roundtrip)
with (
xr.open_dataset(
filter_encoded_roundtrip_hdf5_file, decode_times=True
) as ds,
virtualizarr.open_virtual_dataset(
filter_encoded_roundtrip_hdf5_file,
loadable_variables=["time"],
cftime_variables=["time"],
backend=HDFVirtualBackend,
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(
kerchunk_file, engine="kerchunk", decode_times=True
)
xrt.assert_allclose(ds, roundtrip)

def test_filters_netcdf4_roundtrip(
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
self, tmp_path, filter_encoded_roundtrip_netcdf4_file
):
filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
ds = xr.open_dataset(filepath)
vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
with (
xr.open_dataset(filepath) as ds,
virtualizarr.open_virtual_dataset(
filepath, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
vds = virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file):
with (
xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) as ds,
virtualizarr.open_virtual_dataset(
filter_and_cf_roundtrip_hdf5_file, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)
assert (
ds["temperature"].encoding["_FillValue"]
== roundtrip["temperature"].encoding["_FillValue"]
)

def test_non_coord_dim(self, tmpdir, non_coord_dim):
ds = xr.open_dataset(non_coord_dim)
vds = virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)
def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim):
with (
xr.open_dataset(non_coord_dim) as ds,
virtualizarr.open_virtual_dataset(
non_coord_dim, backend=HDFVirtualBackend
) as vds,
):
kerchunk_file = str(tmp_path / "kerchunk.json")
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_equal(ds, roundtrip)

@requires_icechunk
def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file):
with xr.open_dataset(cf_fill_value_hdf5_file, engine="h5netcdf") as ds:
if ds["data"].dtype in [float, object]:
pytest.xfail(
"TODO: fix handling fixed-length and structured type fill value"
" encoding in xarray zarr backend."
)
with virtualizarr.open_virtual_dataset(
cf_fill_value_hdf5_file, backend=HDFVirtualBackend
) as vds:
roundtrip = roundtrip_as_in_memory_icechunk(
vds, tmp_path, decode_times=False
)
xrt.assert_equal(ds, roundtrip)
