much better validation of fill value

zarr-developers · rabernat · Oct 8, 2024 · Jul 14, 2024 · Sep 29, 2024 · Sep 29, 2024
commit 6cf7dde6214450970661a267f7409217f62e4830
diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+from typing import Any
+
+import numpy as np
+
 from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
@@ -9,6 +13,7 @@
 from zarr.codecs.transpose import TransposeCodec
 from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
 from zarr.codecs.zstd import ZstdCodec
+from zarr.core.metadata.v3 import DataType
 
 __all__ = [
     "BatchedCodecPipeline",
@@ -26,3 +31,15 @@
     "VLenBytesCodec",
     "ZstdCodec",
 ]
+
+
+def get_default_array_bytes_codec(
+    np_dtype: np.dtype[Any],
+) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
+    dtype = DataType.from_numpy(np_dtype)
+    if dtype == DataType.string:
+        return VLenUTF8Codec()
+    elif dtype == DataType.bytes:
+        return VLenBytesCodec()
+    else:
+        return BytesCodec()
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -11,7 +11,7 @@
 
 from zarr._compat import _deprecate_positional_args
 from zarr.abc.store import Store, set_or_delete
-from zarr.codecs import BytesCodec
+from zarr.codecs import get_default_array_bytes_codec
 from zarr.codecs._v2 import V2Compressor, V2Filters
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import (
@@ -318,7 +318,7 @@ async def _create_v3(
             await ensure_no_existing_node(store_path, zarr_format=3)
 
         shape = parse_shapelike(shape)
-        codecs = list(codecs) if codecs is not None else [BytesCodec()]
+        codecs = list(codecs) if codecs is not None else [get_default_array_bytes_codec(dtype)]
 
         if chunk_key_encoding is None:
             chunk_key_encoding = ("default", "/")

diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
@@ -64,6 +64,34 @@ def parse_codecs(data: object) -> tuple[Codec, ...]:
     return out
 
 
+def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None:
+    """Check that the codecs are valid for the given dtype"""
+
+    # ensure that we have exactly one ArrayBytesCodec
+    abcs: list[ArrayBytesCodec] = []
+    for codec in codecs:
+        if isinstance(codec, ArrayBytesCodec):
+            abcs.append(codec)
+    if len(abcs) == 0:
+        raise ValueError("At least one ArrayBytesCodec is required.")
+    elif len(abcs) > 1:
+        raise ValueError("Only one ArrayBytesCodec is allowed.")
+
+    abc = abcs[0]
+
+    # we need to have special codecs if we are decoding vlen strings or bytestrings
+    # TODO: use codec ID instead of class name
+    codec_id = abc.__class__.__name__
+    if dtype == DataType.string and not codec_id == "VLenUTF8Codec":
+        raise ValueError(
+            f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_id}`."
+        )
+    if dtype == DataType.bytes and not codec_id == "VLenBytesCodec":
+        raise ValueError(
+            f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_id}`."
+        )
+
+
 def parse_dimension_names(data: object) -> tuple[str | None, ...] | None:
     if data is None:
         return data
@@ -186,6 +214,8 @@ def __init__(
         chunk_grid_parsed = ChunkGrid.from_dict(chunk_grid)
         chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding)
         dimension_names_parsed = parse_dimension_names(dimension_names)
+        if fill_value is None:
+            fill_value = default_fill_value(data_type_parsed)
         fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed.to_numpy())
         attributes_parsed = parse_attributes(attributes)
         codecs_parsed_partial = parse_codecs(codecs)
@@ -199,6 +229,7 @@ def __init__(
             prototype=default_buffer_prototype(),  # TODO: prototype is not needed here.
         )
         codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial]
+        validate_codecs(codecs_parsed_partial, data_type_parsed)
 
         object.__setattr__(self, "shape", shape_parsed)
         object.__setattr__(self, "data_type", data_type_parsed)
@@ -360,8 +391,17 @@ def parse_fill_value(
     ...
 
 
+def default_fill_value(dtype: DataType) -> str | bytes | np.generic:
+    if dtype == DataType.string:
+        return ""
+    elif dtype == DataType.bytes:
+        return b""
+    else:
+        return dtype.to_numpy().type(0)
+
+
 def parse_fill_value(
-    fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool | None,
+    fill_value: complex | str | bytes | np.generic | Sequence[Any] | bool,
     dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | np.dtype[Any],
 ) -> BOOL | INTEGER | FLOAT | COMPLEX | Any:
     """
@@ -385,7 +425,7 @@ def parse_fill_value(
     A scalar instance of `dtype`
     """
     if fill_value is None:
-        return dtype.type(0)
+        raise ValueError("Fill value cannot be None")
     if dtype.kind == "O":
         return fill_value
     if isinstance(fill_value, Sequence) and not isinstance(fill_value, str):

diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
@@ -4,8 +4,9 @@
 import pytest
 
 from zarr import Array
+from zarr.abc.codec import Codec
 from zarr.abc.store import Store
-from zarr.codecs import VLenBytesCodec, VLenUTF8Codec
+from zarr.codecs import VLenBytesCodec, VLenUTF8Codec, ZstdCodec
 from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
 from zarr.storage.common import StorePath
 from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING
@@ -21,11 +22,13 @@
 
 @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
 @pytest.mark.parametrize("dtype", numpy_str_dtypes)
-async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
+@pytest.mark.parametrize("as_object_array", [False, True])
+@pytest.mark.parametrize("codecs", [None, [VLenUTF8Codec()], [VLenUTF8Codec(), ZstdCodec()]])
+def test_vlen_string(
+    store: Store, dtype: None | np.dtype[Any], as_object_array: bool, codecs: None | list[Codec]
+) -> None:
     strings = ["hello", "world", "this", "is", "a", "test"]
-    data = np.array(strings).reshape((2, 3))
-    if dtype is not None:
-        data = data.astype(dtype)
+    data = np.array(strings, dtype=dtype).reshape((2, 3))
 
     sp = StorePath(store, path="string")
     a = Array.create(
@@ -34,10 +37,15 @@ async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
         chunk_shape=data.shape,
         dtype=data.dtype,
         fill_value="",
-        codecs=[VLenUTF8Codec()],
+        codecs=codecs,
     )
     assert isinstance(a.metadata, ArrayV3Metadata)  # needed for mypy
 
+    # should also work if input array is an object array, provided we explicitly specified
+    # a stringlike dtype when creating the Array
+    if as_object_array:
+        data = data.astype("O")
+
     a[:, :] = data
     assert np.array_equal(data, a[:, :])
     assert a.metadata.data_type == DataType.string
@@ -52,7 +60,9 @@ async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
 
 
 @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
-async def test_vlen_bytes(store: Store) -> None:
+@pytest.mark.parametrize("as_object_array", [False, True])
+@pytest.mark.parametrize("codecs", [None, [VLenBytesCodec()], [VLenBytesCodec(), ZstdCodec()]])
+def test_vlen_bytes(store: Store, as_object_array: bool, codecs: None | list[Codec]) -> None:
     bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"]
     data = np.array(bstrings).reshape((2, 3))
     assert data.dtype == "|S5"
@@ -64,10 +74,14 @@ async def test_vlen_bytes(store: Store) -> None:
         chunk_shape=data.shape,
         dtype=data.dtype,
         fill_value=b"",
-        codecs=[VLenBytesCodec()],
+        codecs=codecs,
     )
     assert isinstance(a.metadata, ArrayV3Metadata)  # needed for mypy
 
+    # should also work if input array is an object array, provided we explicitly specified
+    # a bytestring-like dtype when creating the Array
+    if as_object_array:
+        data = data.astype("O")
     a[:, :] = data
     assert np.array_equal(data, a[:, :])
     assert a.metadata.data_type == DataType.bytes
@@ -79,3 +93,21 @@ async def test_vlen_bytes(store: Store) -> None:
     assert np.array_equal(data, b[:, :])
     assert b.metadata.data_type == DataType.bytes
     assert a.dtype == "O"
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+def test_vlen_errors(store: Store) -> None:
+    sp = StorePath(store, path="string")
+
+    # fill value must be a compatible type
+    with pytest.raises(ValueError, match="fill value 0 is not valid"):
+        Array.create(sp, shape=5, chunk_shape=5, dtype="<U4", fill_value=0)
+
+    # FIXME: this should raise but doesn't; need to fix parse_fill_value
+    # Problem is that parse_fill_value compares with numpy dtype('O') instead
+    # of DataType.bytes, and anything can be cast to Object
+    # with pytest.raises(ValueError, match="fill value X is not valid"):
+    #    Array.create(sp, shape=5, chunk_shape=5, dtype='|S4', fill_value='')
+
+    a = Array.create(sp, shape=5, chunk_shape=5, dtype="<U4")
+    assert a.fill_value == ""
diff --git a/tests/v3/test_metadata/test_v3.py b/tests/v3/test_metadata/test_v3.py
@@ -21,6 +21,7 @@
 import pytest
 
 from zarr.core.metadata.v3 import (
+    default_fill_value,
     parse_dimension_names,
     parse_fill_value,
     parse_zarr_format,
@@ -46,8 +47,9 @@
 )
 
 complex_dtypes = ("complex64", "complex128")
+vlen_dtypes = ("string", "bytes")
 
-dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes)
+dtypes = (*bool_dtypes, *int_dtypes, *float_dtypes, *complex_dtypes, *vlen_dtypes)
 
 
 @pytest.mark.parametrize("data", [None, 1, 2, 4, 5, "3"])
@@ -72,13 +74,18 @@ def parse_dimension_names_valid(data: Sequence[str] | None) -> None:
 
 
 @pytest.mark.parametrize("dtype_str", dtypes)
-def test_parse_auto_fill_value(dtype_str: str) -> None:
+def test_default_fill_value(dtype_str: str) -> None:
     """
-    Test that parse_fill_value(None, dtype) results in the 0 value for the given dtype.
+    Test that default_fill_value(dtype) returns the empty/zero value for the given dtype.
     """
-    dtype = np.dtype(dtype_str)
-    fill_value = None
-    assert parse_fill_value(fill_value, dtype) == dtype.type(0)
+    dtype = DataType(dtype_str)
+    fill_value = default_fill_value(dtype)
+    if dtype == DataType.string:
+        assert fill_value == ""
+    elif dtype == DataType.bytes:
+        assert fill_value == b""
+    else:
+        assert fill_value == dtype.to_numpy().type(0)
 
 
 @pytest.mark.parametrize(
@@ -337,7 +344,7 @@ async def test_special_float_fill_values(fill_value: str) -> None:
         "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}},
         "data_type": "float64",
         "chunk_key_encoding": {"name": "default", "separator": "."},
-        "codecs": (),
+        "codecs": [{"name": "bytes"}],
         "fill_value": fill_value,  # this is not a valid fill value for uint8
     }
     m = ArrayV3Metadata.from_dict(metadata_dict)