From f62320854a21aa018020cb600690bd3bf52e558b Mon Sep 17 00:00:00 2001 From: snibbor Date: Sun, 25 Aug 2024 21:01:41 -0400 Subject: [PATCH 1/2] patch writing image with dask array to ZipStore --- ome_zarr/writer.py | 10 ++++++++-- tests/test_writer.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/ome_zarr/writer.py b/ome_zarr/writer.py index 137c5e3c..5f0be71d 100644 --- a/ome_zarr/writer.py +++ b/ome_zarr/writer.py @@ -235,6 +235,7 @@ def write_multiscale( Please use the 'storage_options' argument instead.""" warnings.warn(msg, DeprecationWarning) datasets: List[dict] = [] + is_zipstore = isinstance(group.store, zarr.storage.ZipStore) for path, data in enumerate(pyramid): options = _resolve_storage_options(storage_options, path) @@ -251,11 +252,13 @@ def write_multiscale( if chunks_opt is not None: data = da.array(data).rechunk(chunks=chunks_opt) options["chunks"] = chunks_opt + # storage options with chunks results in duplicate key error for ZipStore + # TODO: do other stores also cause this error with da.to_zarr? da_delayed = da.to_zarr( arr=data, url=group.store, component=str(Path(group.path, str(path))), - storage_options=options, + storage_options=None if is_zipstore else options, compressor=options.get("compressor", zarr.storage.default_compressor), dimension_separator=group._store._dimension_separator, compute=compute, @@ -585,6 +588,7 @@ def _write_dask_image( # for path, data in enumerate(pyramid): max_layer: int = scaler.max_layer if scaler is not None else 0 shapes = [] + is_zipstore = isinstance(group.store, zarr.storage.ZipStore) for path in range(0, max_layer + 1): # LOGGER.debug(f"write_image path: {path}") options = _resolve_storage_options(storage_options, path) @@ -609,12 +613,14 @@ def _write_dask_image( LOGGER.debug( "write dask.array to_zarr shape: %s, dtype: %s", image.shape, image.dtype ) + # storage options with chunks results in duplicate key error for ZipStore + # TODO: do other stores also cause this error with da.to_zarr? delayed.append( da.to_zarr( arr=image, url=group.store, component=str(Path(group.path, str(path))), - storage_options=options, + storage_options=None if is_zipstore else options, compute=False, compressor=options.get("compressor", zarr.storage.default_compressor), dimension_separator=group._store._dimension_separator, diff --git a/tests/test_writer.py b/tests/test_writer.py index 14a8ed50..03beba65 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -39,6 +39,8 @@ class TestWriter: @pytest.fixture(autouse=True) def initdir(self, tmpdir): self.path = pathlib.Path(tmpdir.mkdir("data")) + self.zip_path = pathlib.Path(tmpdir.mkdir("data_zip"), "data.zip") + self.zip_store = zarr.storage.ZipStore(self.zip_path, mode="w") self.store = parse_url(self.path, mode="w").store self.root = zarr.group(store=self.store) self.group = self.root.create_group("test") @@ -146,6 +148,35 @@ def test_write_image_current(self, array_constructor): for value in transfs[0]["scale"]: assert value >= 1 + # write_image/write_multiscale ZipStore when the storage_options is not None + @pytest.mark.parametrize("array_constructor", [np.array, da.from_array]) + @pytest.mark.parametrize("storage_options", [None, {"chunks": (1, 100, 100)}]) + def test_write_image_zipstore(self, array_constructor, storage_options): + # Initialize the Zarr group from ZipStore + group = zarr.group(store=self.zip_store, overwrite=True) + + shape = (3, 300, 300) + data = self.create_data(shape) + data = array_constructor(data) + + write_image(data, group, axes="cyx", storage_options=storage_options) + + @pytest.mark.parametrize("array_constructor", [np.array, da.from_array]) + @pytest.mark.parametrize("storage_options", [None, {"chunks": (1, 100, 100)}]) + def test_write_multiscale_zipstore(self, array_constructor, storage_options): + # Initialize the Zarr group from ZipStore + group = zarr.group(store=self.zip_store, overwrite=True) + + shape = (3, 300, 300) + data1 = self.create_data(shape) + data1 = array_constructor(data1) + data2 = self.create_data(shape) + data2 = array_constructor(data2) + + write_multiscale( + [data1, data2], group, axes="cyx", storage_options=storage_options + ) + @pytest.mark.parametrize("read_from_zarr", [True, False]) @pytest.mark.parametrize("compute", [True, False]) def test_write_image_dask(self, read_from_zarr, compute): From b58d55035483e1d6c9cd552ab293d1ce7d26a26b Mon Sep 17 00:00:00 2001 From: snibbor Date: Sun, 25 Aug 2024 21:39:21 -0400 Subject: [PATCH 2/2] test for correct chunk size after writing to ZipStore --- tests/test_writer.py | 54 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 03beba65..b1419350 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -157,9 +157,27 @@ def test_write_image_zipstore(self, array_constructor, storage_options): shape = (3, 300, 300) data = self.create_data(shape) - data = array_constructor(data) + if array_constructor == da.from_array: + data = array_constructor(data, chunks=(1, 50, 50)) + else: + data = array_constructor(data) write_image(data, group, axes="cyx", storage_options=storage_options) + self.zip_store.close() + # load the data from the ZipStore + store = zarr.storage.ZipStore(self.zip_path, mode="r") + group = zarr.open_group(store=store) + # check the chunksize of the array + if array_constructor == da.from_array: + if storage_options is None: + assert group[0].chunks == (1, 50, 50) + else: + assert group[0].chunks == storage_options["chunks"] + else: + if storage_options is None: + pass + else: + assert group[0].chunks == storage_options["chunks"] @pytest.mark.parametrize("array_constructor", [np.array, da.from_array]) @pytest.mark.parametrize("storage_options", [None, {"chunks": (1, 100, 100)}]) @@ -168,14 +186,32 @@ def test_write_multiscale_zipstore(self, array_constructor, storage_options): group = zarr.group(store=self.zip_store, overwrite=True) shape = (3, 300, 300) - data1 = self.create_data(shape) - data1 = array_constructor(data1) - data2 = self.create_data(shape) - data2 = array_constructor(data2) - - write_multiscale( - [data1, data2], group, axes="cyx", storage_options=storage_options - ) + data_arrs = [] + for i in range(2): + data = self.create_data(shape) + if array_constructor == da.from_array: + data = array_constructor(data, chunks=(1, 50, 50)) + else: + data = array_constructor(data) + data_arrs.append(data.copy()) + + write_multiscale(data_arrs, group, axes="cyx", storage_options=storage_options) + self.zip_store.close() + # load the data from the ZipStore + store = zarr.storage.ZipStore(self.zip_path, mode="r") + group = zarr.open_group(store=store) + # check the chunksize of the array + for i in range(2): + if array_constructor == da.from_array: + if storage_options is None: + assert group[i].chunks == (1, 50, 50) + else: + assert group[i].chunks == storage_options["chunks"] + else: + if storage_options is None: + pass + else: + assert group[i].chunks == storage_options["chunks"] @pytest.mark.parametrize("read_from_zarr", [True, False]) @pytest.mark.parametrize("compute", [True, False])