Patch/segmentation mask #16

Merged 4 commits on Aug 15, 2024
9 changes: 9 additions & 0 deletions README.md
@@ -1,2 +1,11 @@
# cryoet-data-portal-neuroglancer

CryoET Data Portal Neuroglancer configuration helper

## Installation

```bash
git clone https://github.com/chanzuckerberg/cryoet-data-portal-neuroglancer.git
cd cryoet-data-portal-neuroglancer
poetry install
```
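
As a quick post-install sanity check, a minimal sketch (assuming the import path follows the repository layout shown in the diff below) could be:

```python
# Minimal post-install sanity check; the import path is assumed from the
# module layout shown in this PR.
from cryoet_data_portal_neuroglancer.precompute.segmentation_mask import encode_segmentation

print(encode_segmentation.__doc__)
```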
38 changes: 33 additions & 5 deletions cryoet_data_portal_neuroglancer/precompute/segmentation_mask.py
@@ -213,9 +213,9 @@ def create_segmentation_chunk(
# data = np.moveaxis(data, (0, 1, 2), (2, 1, 0))
for z, y, x in np.ndindex((gz, gy, gx)):
block = data[z * bz : (z + 1) * bz, y * by : (y + 1) * by, x * bx : (x + 1) * bx]
unique_values, encoded_values = np.unique(block, return_inverse=True)
if block.shape != block_size:
block = pad_block(block, block_size)
unique_values, encoded_values = np.unique(block, return_inverse=True)

lookup_table_offset, encoded_bits = _create_lookup_table(buffer, stored_lookup_tables, unique_values)
encoded_values_offset = _create_encoded_values(buffer, encoded_values, encoded_bits)
@@ -246,7 +246,7 @@ def _create_metadata(
"num_channels": 1,
"scales": [
{
"chunk_sizes": [chunk_size],
"chunk_sizes": [chunk_size[::-1]], # reverse the chunk size to pass from Z-Y-X to X-Y-Z
"encoding": "compressed_segmentation",
"compressed_segmentation_block_size": block_size,
"resolution": resolution,
@@ -318,17 +318,45 @@ def write_metadata(metadata: dict[str, Any], output_directory: Path) -> None:

def encode_segmentation(
filename: str,
output_path: Path,
output_path: Path | str,
resolution: tuple[float, float, float],
block_size: tuple[int, int, int] = (64, 64, 64),
data_directory: str = "data",
delete_existing: bool = False,
convert_non_zero_to: Optional[int] = 0,
convert_non_zero_to: int | None = 0,
include_mesh: bool = False,
mesh_directory: str = "mesh",
) -> None:
"""Convert the given OME-Zarr file to neuroglancer segmentation format with the given block size"""
"""Convert the given OME-Zarr file to neuroglancer segmentation format with the given block size.

Parameters
----------
filename : str
The path to the OME-Zarr file
output_path : Path | str
The path to the output directory
resolution : tuple[float, float, float]
The resolution of the data in nm
block_size : tuple[int, int, int], optional
The size of the blocks to use, by default (64, 64, 64).
This determines the size of the chunks in the precomputed format output.
Order is Z, Y, X.
data_directory : str, optional
The name of the data directory, by default "data"
This is the directory that will contain the segmentation data
delete_existing : bool, optional
Whether to delete the existing output directory, by default False
If False and the output directory exists, the function will
return without doing anything
convert_non_zero_to : int | None, optional
The value to convert non-zero values to. The default of 0 leaves
non-zero values as they are, as does None. Leaving the values
unchanged is useful for representing multiple objects in the same
segmentation.
"""
print(f"Converting {filename} to neuroglancer compressed segmentation format")
output_path = Path(output_path)
dask_data = load_omezarr_data(filename)
if delete_existing and output_path.exists():
contents = list(output_path.iterdir())
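
Given the expanded docstring above, a minimal usage sketch of `encode_segmentation` might look like the following; the paths and resolution are placeholders, not values from the portal:

```python
from cryoet_data_portal_neuroglancer.precompute.segmentation_mask import encode_segmentation

# Hypothetical input/output paths and resolution, purely for illustration.
encode_segmentation(
    filename="segmentation.zarr",            # path to the OME-Zarr file
    output_path="precomputed_segmentation",  # output directory
    resolution=(7.84, 7.84, 7.84),           # resolution in nm
    block_size=(64, 64, 64),                 # chunk size, Z-Y-X order
    delete_existing=True,                    # remove any previous output first
    convert_non_zero_to=0,                   # 0 (or None) leaves labels unchanged
)
```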
33 changes: 31 additions & 2 deletions tests/test_segmentation_encoding.py
@@ -199,7 +199,36 @@ def test__create_segmentation_chunk():
[1, 1, 1, 1],
],
]
chunk: Chunk = create_segmentation_chunk(np.array(array), dimensions=((0, 0, 0), (8, 8, 4)), block_size=(8, 8, 4))
real_data = np.array(array)
chunk: Chunk = create_segmentation_chunk(real_data, dimensions=((0, 0, 0), (8, 8, 4)), block_size=(8, 8, 4))

assert chunk.dimensions == ((0, 0, 0), (8, 8, 4))
# TODO expand me!
byte_array = chunk.buffer
data_start_offset = 20 # header of 8 bytes + 12 bytes of info
data = np.frombuffer(byte_array, dtype=np.uint32, offset=data_start_offset)
# The data is symmetric, so each 32-bit integer should be the same
assert len(data) == 8 # 8 * 8 * 4 / 32
assert np.all(np.diff(data) == 0)

# If we chunk in larger blocks, it should still work with padding
chunk: Chunk = create_segmentation_chunk(real_data, dimensions=((0, 0, 0), (8, 8, 4)), block_size=(8, 8, 8))

assert chunk.dimensions == ((0, 0, 0), (8, 8, 4))
byte_array = chunk.buffer
data_start_offset = 20
data = np.frombuffer(byte_array, dtype=np.uint32, offset=data_start_offset)
# The data is symmetric, so each 32-bit integer should be the same
assert len(data) == 16 # 8 * 8 * 8 / 32
assert np.all(np.diff(data) == 0)

# With smaller blocks, there should be more of them
chunk: Chunk = create_segmentation_chunk(real_data, dimensions=((0, 0, 0), (8, 8, 4)), block_size=(4, 4, 4))

assert chunk.dimensions == ((0, 0, 0), (8, 8, 4))
byte_array = chunk.buffer
data_start_offset = 20
data = np.frombuffer(byte_array, dtype=np.uint32)

# In this case, there should be four block headers (32 bits), followed by the 3 32-bit info values,
# and then the data, which is all the same, in one block of 8 32-bit integers
assert len(data) == 8 + 3 + 8
assert np.all(np.diff(data[11:]) == 0)
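
The expected lengths in these assertions follow from the bit arithmetic hinted at in the comments; a sketch of that arithmetic, assuming only two unique labels so each voxel is encoded with a single bit:

```python
# Rough check of the expected encoded-data lengths (assumes 1 bit per voxel,
# i.e. two unique labels in the test data).
def encoded_uint32s(shape: tuple[int, int, int]) -> int:
    z, y, x = shape
    return (z * y * x) // 32  # 32 one-bit voxels fit in each 32-bit word

assert encoded_uint32s((8, 8, 4)) == 8   # block_size (8, 8, 4) case
assert encoded_uint32s((8, 8, 8)) == 16  # padded block_size (8, 8, 8) case
```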