Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string and bytes dtypes plus vlen-utf8 and vlen-bytes codecs #2036

Merged
merged 35 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c05b9d1
add legacy vlen-utf8 codec
rabernat Jul 14, 2024
c86ddc6
Merge branch 'v3' into ryan/legacy-vlen
rabernat Sep 29, 2024
a322124
got it working again
rabernat Sep 29, 2024
2a1e2e3
got strings working; broke everything else
rabernat Oct 1, 2024
1d3d7a5
change v3.metadata.data_type type
rabernat Oct 1, 2024
cd40b08
merged
rabernat Oct 1, 2024
988f9df
fixed tests
rabernat Oct 1, 2024
507161a
satisfy mypy for tests
rabernat Oct 1, 2024
1ae5e63
make strings work
rabernat Oct 3, 2024
94ecdb5
add missing module
rabernat Oct 3, 2024
2c7d638
Merge branch 'v3' into ryan/legacy-vlen
d-v-b Oct 3, 2024
b1717d8
Merge remote-tracking branch 'upstream/v3' into ryan/legacy-vlen
rabernat Oct 4, 2024
79b7d43
store -> storage
rabernat Oct 4, 2024
a5c2a37
rename module
rabernat Oct 4, 2024
717f0c7
Merge remote-tracking branch 'origin/ryan/legacy-vlen' into ryan/lega…
rabernat Oct 4, 2024
b90d8f3
merged
rabernat Oct 4, 2024
0406ea1
add vlen bytes
rabernat Oct 7, 2024
8e61a18
fix type assertions in test
rabernat Oct 7, 2024
6cf7dde
much better validation of fill value
rabernat Oct 7, 2024
28d58fa
retype parse_fill_value
rabernat Oct 7, 2024
c6de878
tests pass but not mypy
rabernat Oct 7, 2024
4f026db
attempted to change parse_fill_value typing
rabernat Oct 8, 2024
e427c7a
restore DEFAULT_DTYPE
rabernat Oct 8, 2024
7d9d897
fixup
TomAugspurger Oct 8, 2024
0c21994
docstring
TomAugspurger Oct 8, 2024
c12ac41
update test
TomAugspurger Oct 8, 2024
3aeea1e
add better DataType tests
rabernat Oct 8, 2024
cae7055
more progress on typing; still not passing mypy
rabernat Oct 8, 2024
1aeb49a
fix typing yay!
rabernat Oct 8, 2024
6714bad
make types work with numpy < 2
rabernat Oct 8, 2024
2edf3b8
Apply suggestions from code review
rabernat Oct 8, 2024
12a0d65
Apply suggestions from code review
rabernat Oct 8, 2024
7ba7077
apply Joe's suggestions
rabernat Oct 8, 2024
1e828b4
add missing module
rabernat Oct 8, 2024
ba0f093
make _STRING_DTYPE private to try to make sphinx happy
rabernat Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
apply Joe's suggestions
  • Loading branch information
rabernat committed Oct 8, 2024
commit 7ba70771bd05ec61800459fe16696381acb1957d
10 changes: 5 additions & 5 deletions src/zarr/codecs/vlen_utf8.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.strings import cast_to_string_dtype
from zarr.registry import register_codec
from zarr.strings import cast_to_string_dtype

if TYPE_CHECKING:
from typing import Self
Expand Down Expand Up @@ -47,7 +47,7 @@ async def _decode_single(
assert isinstance(chunk_bytes, Buffer)

raw_bytes = chunk_bytes.as_array_like()
decoded = vlen_utf8_codec.decode(raw_bytes)
decoded = _vlen_utf8_codec.decode(raw_bytes)
assert decoded.dtype == np.object_
decoded.shape = chunk_spec.shape
# coming out of the code, we know this is safe, so don't issue a warning
Expand All @@ -61,7 +61,7 @@ async def _encode_single(
) -> Buffer | None:
assert isinstance(chunk_array, NDBuffer)
return chunk_spec.prototype.buffer.from_bytes(
vlen_utf8_codec.encode(chunk_array.as_numpy_array())
_vlen_utf8_codec.encode(chunk_array.as_numpy_array())
)

def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
Expand Down Expand Up @@ -93,7 +93,7 @@ async def _decode_single(
assert isinstance(chunk_bytes, Buffer)

raw_bytes = chunk_bytes.as_array_like()
decoded = vlen_bytes_codec.decode(raw_bytes)
decoded = _vlen_bytes_codec.decode(raw_bytes)
assert decoded.dtype == np.object_
decoded.shape = chunk_spec.shape
return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
Expand All @@ -105,7 +105,7 @@ async def _encode_single(
) -> Buffer | None:
assert isinstance(chunk_array, NDBuffer)
return chunk_spec.prototype.buffer.from_bytes(
vlen_bytes_codec.encode(chunk_array.as_numpy_array())
_vlen_bytes_codec.encode(chunk_array.as_numpy_array())
)

def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
Expand Down
6 changes: 4 additions & 2 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from zarr._compat import _deprecate_positional_args
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import get_default_array_bytes_codec
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
Expand Down Expand Up @@ -319,7 +319,9 @@ async def _create_v3(

shape = parse_shapelike(shape)
codecs = (
list(codecs) if codecs is not None else [get_default_array_bytes_codec(np.dtype(dtype))]
list(codecs)
if codecs is not None
else [_get_default_array_bytes_codec(np.dtype(dtype))]
)

if chunk_key_encoding is None:
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/core/metadata/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from zarr.core.common import ZARR_JSON, parse_named_configuration, parse_shapelike
from zarr.core.config import config
from zarr.core.metadata.common import ArrayMetadata, parse_attributes
from zarr.core.strings import STRING_DTYPE as STRING_NP_DTYPE
from zarr.registry import get_codec_class
from zarr.strings import STRING_DTYPE as STRING_NP_DTYPE

DEFAULT_DTYPE = "float64"

Expand Down
87 changes: 0 additions & 87 deletions src/zarr/strings.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pytest

from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING, STRING_DTYPE, cast_to_string_dtype
from zarr.core.strings import NUMPY_SUPPORTS_VLEN_STRING, STRING_DTYPE, cast_to_string_dtype


def test_string_defaults() -> None:
Expand Down
38 changes: 37 additions & 1 deletion tests/v3/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import zarr.api.asynchronous
import zarr.storage
from zarr import Array, AsyncArray, Group
from zarr.codecs.bytes import BytesCodec
from zarr.codecs import BytesCodec, VLenBytesCodec
from zarr.core.array import chunks_initialized
from zarr.core.buffer.cpu import NDBuffer
from zarr.core.common import JSON, ZarrFormat
Expand Down Expand Up @@ -370,3 +370,39 @@ def test_chunks_initialized(test_cls: type[Array] | type[AsyncArray]) -> None:

expected = sorted(keys)
assert observed == expected


def test_default_fill_values() -> None:
    """Check the fill value reported by arrays created without an explicit one."""
    # Each entry pairs a dtype passed to ``Array.create`` with the fill value
    # the resulting array is asserted to report.
    cases = (
        ("<U4", ""),
        ("<S4", b""),
        ("i", 0),
        ("f", 0.0),
    )
    for dtype, expected in cases:
        # A fresh in-memory store per array keeps the cases independent.
        arr = Array.create(MemoryStore({}, mode="w"), shape=5, chunk_shape=5, dtype=dtype)
        assert arr.fill_value == expected


def test_vlen_errors() -> None:
    """Check that unsuitable codec lists for a ``<U4`` array raise ``ValueError``."""
    # Each entry pairs a ``codecs`` argument with the error-message pattern
    # that ``Array.create`` is asserted to raise for it.
    cases: list[tuple[list[BytesCodec | VLenBytesCodec], str]] = [
        ([], "At least one ArrayBytesCodec is required."),
        (
            [BytesCodec()],
            "For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.",
        ),
        ([BytesCodec(), VLenBytesCodec()], "Only one ArrayBytesCodec is allowed."),
    ]
    for codecs, message in cases:
        with pytest.raises(ValueError, match=message):
            Array.create(
                MemoryStore({}, mode="w"),
                shape=5,
                chunk_shape=5,
                dtype="<U4",
                codecs=codecs,
            )
37 changes: 2 additions & 35 deletions tests/v3/test_codecs/test_vlen.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from zarr import Array
from zarr.abc.codec import Codec
from zarr.abc.store import Store
from zarr.codecs import BytesCodec, VLenBytesCodec, VLenUTF8Codec, ZstdCodec
from zarr.codecs import VLenBytesCodec, VLenUTF8Codec, ZstdCodec
from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
from zarr.core.strings import NUMPY_SUPPORTS_VLEN_STRING
from zarr.storage.common import StorePath
from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING

numpy_str_dtypes: list[type | None] = [None, str, np.dtypes.StrDType]
expected_zarr_string_dtype: np.dtype[Any]
Expand Down Expand Up @@ -93,36 +93,3 @@ def test_vlen_bytes(store: Store, as_object_array: bool, codecs: None | list[Cod
assert np.array_equal(data, b[:, :])
assert b.metadata.data_type == DataType.bytes
assert a.dtype == "O"


# TODO: move these tests out of codecs and into a more appropriate location
@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
def test_default_fill_values(store: Store) -> None:
a = Array.create(StorePath(store, path="string"), shape=5, chunk_shape=5, dtype="<U4")
assert a.fill_value == ""

b = Array.create(StorePath(store, path="bytes"), shape=5, chunk_shape=5, dtype="<S4")
assert b.fill_value == b""


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
def test_vlen_errors(store: Store) -> None:
with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."):
Array.create(StorePath(store, path="a"), shape=5, chunk_shape=5, dtype="<U4", codecs=[])

with pytest.raises(
ValueError,
match="For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.",
):
Array.create(
StorePath(store, path="b"), shape=5, chunk_shape=5, dtype="<U4", codecs=[BytesCodec()]
)

with pytest.raises(ValueError, match="Only one ArrayBytesCodec is allowed."):
Array.create(
StorePath(store, path="b"),
shape=5,
chunk_shape=5,
dtype="<U4",
codecs=[BytesCodec(), VLenBytesCodec()],
)
Loading