galaxyproject · mvdbeek · Mar 26, 2025 · Mar 17, 2025 · Mar 17, 2025 · Mar 17, 2025
diff --git a/lib/galaxy/datatypes/images.py b/lib/galaxy/datatypes/images.py
@@ -9,13 +9,16 @@
 from typing import (
     Any,
     Dict,
+    Iterator,
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 import mrcfile
 import numpy as np
+import png
 import tifffile
 
 try:
@@ -154,32 +157,32 @@ def set_meta(
         self, dataset: DatasetProtocol, overwrite: bool = True, metadata_tmp_files_dir: Optional[str] = None, **kwd
     ) -> None:
         """
-        Try to populate the metadata of the image using a generic image loading library (pillow), if available.
+        Try to populate the metadata of the image using a generic image loading library (Pillow), if available.
 
         If an image has two axes, they are assumed to be ``YX``. If an image has three axes, they are assumed to be ``YXC``.
+
+        The metadata element `num_unique_values` remains unset.
         """
         if PIL is not None:
             try:
                 with PIL.Image.open(dataset.get_file_name()) as im:
+                    im_shape, im_typestr = PIL.Image._conv_type_shape(im)
+                    im_ndim = len(im_shape)
 
                     # Determine the metadata values that are available without loading the image data
                     dataset.metadata.width = im.size[1]
                     dataset.metadata.height = im.size[0]
                     dataset.metadata.depth = 0
                     dataset.metadata.frames = getattr(im, "n_frames", 0)
-                    dataset.metadata.num_unique_values = sum(val > 0 for val in im.histogram())
-
-                    # Peek into a small 2x2 section of the image data
-                    im_peek_arr = np.array(im.crop((0, 0, min((2, im.size[1])), min((2, im.size[0])))))
+                    dataset.metadata.dtype = str(np.array((0,), im_typestr).dtype)
 
-                    # Determine the remaining metadata values
-                    dataset.metadata.dtype = str(im_peek_arr.dtype)
-                    if im_peek_arr.ndim == 2:
+                    # Determine the remaining values, by assuming the order of axes
+                    if im_ndim == 2:
                         dataset.metadata.axes = "YX"
                         dataset.metadata.channels = 0
-                    elif im_peek_arr.ndim == 3:
+                    elif im_ndim == 3:
                         dataset.metadata.axes = "YXC"
-                        dataset.metadata.channels = im_peek_arr.shape[2]
+                        dataset.metadata.channels = im_shape[2]
 
             except PIL.UnidentifiedImageError:
                 pass
@@ -198,6 +201,29 @@ class Png(Image):
     edam_format = "format_3603"
     file_ext = "png"
 
+    def set_meta(
+        self, dataset: DatasetProtocol, overwrite: bool = True, metadata_tmp_files_dir: Optional[str] = None, **kwd
+    ) -> None:
+        """
+        Try to populate the metadata of the image, using PyPNG to determine `num_unique_values`.
+
+        The base implementation is used to determine metadata elements that cannot be determined with PyPNG, but with
+        Pillow. Afterwards, the image data is read with PyPNG to count the unique values.
+
+        Only 8bit PNG without animations is supported. For any other file, `num_unique_values` is not touched here.
+        """
+        super().set_meta(dataset, overwrite, metadata_tmp_files_dir, **kwd)
+
+        if dataset.metadata.dtype != "uint8" or dataset.metadata.frames not in (0, 1):
+            return
+
+        # Open the file explicitly, so that the file handle is closed deterministically once the rows are consumed
+        with open(dataset.get_file_name(), "rb") as png_file:
+
+            # `asDirect` returns `(width, height, rows, info)`, but only the iterator over the rows is required
+            _, _, rows, _ = png.Reader(file=png_file).asDirect()
+
+            # Read the image data row by row, to avoid allocating memory for the entire image
+            unique_values = np.empty(0, dtype="uint8")
+            for row in rows:
+                unique_values = np.union1d(unique_values, np.asarray(row, dtype="uint8"))
+
+        dataset.metadata.num_unique_values = len(unique_values)
+
 
 class Tiff(Image):
     edam_format = "format_3591"
@@ -245,25 +271,23 @@ def set_meta(
                         "num_unique_values",
                     ]
                 }
-                for page in tif.series:
+
+                # TIFF files can contain multiple images, each represented by a series of pages
+                for series in tif.series:
 
                     # Determine the metadata values that should be generally available
-                    metadata["axes"].append(page.axes.upper())
-                    metadata["dtype"].append(str(page.dtype))
+                    metadata["axes"].append(series.axes.upper())
+                    metadata["dtype"].append(str(series.dtype))
 
                     axes = metadata["axes"][-1].replace("S", "C")
-                    metadata["width"].append(Tiff._get_axis_size(page.shape, axes, "X"))
-                    metadata["height"].append(Tiff._get_axis_size(page.shape, axes, "Y"))
-                    metadata["channels"].append(Tiff._get_axis_size(page.shape, axes, "C"))
-                    metadata["depth"].append(Tiff._get_axis_size(page.shape, axes, "Z"))
-                    metadata["frames"].append(Tiff._get_axis_size(page.shape, axes, "T"))
+                    metadata["width"].append(Tiff._get_axis_size(series.shape, axes, "X"))
+                    metadata["height"].append(Tiff._get_axis_size(series.shape, axes, "Y"))
+                    metadata["channels"].append(Tiff._get_axis_size(series.shape, axes, "C"))
+                    metadata["depth"].append(Tiff._get_axis_size(series.shape, axes, "Z"))
+                    metadata["frames"].append(Tiff._get_axis_size(series.shape, axes, "T"))
 
                     # Determine the metadata values that require reading the image data
-                    try:
-                        im_arr = page.asarray()
-                        metadata["num_unique_values"].append(len(np.unique(im_arr)))
-                    except ValueError:  # Occurs if the compression of the TIFF file is unsupported
-                        pass
+                    metadata["num_unique_values"].append(Tiff._get_num_unique_values(series))
 
                 # Populate the metadata fields based on the values determined above
                 for key, values in metadata.items():
@@ -301,6 +325,64 @@ def _get_axis_size(shape: Tuple[int, ...], axes: str, axis: str) -> int:
         idx = axes.find(axis)
         return shape[idx] if idx >= 0 else 0
 
+    @staticmethod
+    def _get_num_unique_values(series: tifffile.TiffPageSeries) -> Optional[int]:
+        """
+        Determines the number of unique values in a TIFF series of pages.
+
+        The pages of the series are read chunk by chunk (see `_read_chunks`), and only the unique values found so far
+        are kept between chunks.
+
+        Returns the number of unique values, or `None` if a `ValueError` is raised while reading the image data.
+        """
+        unique_values: Optional["np.typing.NDArray"] = None
+        try:
+            for page in series.pages:
+
+                if page is None:
+                    continue  # No idea how this might occur, but mypy demands that we check it, just to be sure
+
+                for chunk in Tiff._read_chunks(page):
+
+                    # Reduce the chunk on its own first, then merge the (small) results. This keeps all values in
+                    # NumPy arrays instead of converting every single pixel value into a Python object.
+                    chunk_values = np.unique(chunk)
+                    if unique_values is None:
+                        unique_values = chunk_values
+                    else:
+                        unique_values = np.union1d(unique_values, chunk_values)
+
+        except ValueError:
+            return None  # Occurs if the compression of the TIFF file is unsupported
+
+        # No chunks at all means that no values were found
+        return 0 if unique_values is None else len(unique_values)
+
+    @staticmethod
+    def _read_chunks(
+        page: Union[tifffile.TiffPage, tifffile.TiffFrame], mmap_chunk_size: int = 2**14
+    ) -> Iterator["np.typing.NDArray"]:
+        """
+        Generator that reads all chunks of values from a TIFF page.
+
+        If the page consists of multiple segments, each segment is yielded as one flattened chunk. Otherwise, the page
+        is memory-mapped and yielded as consecutive flat chunks of at most `mmap_chunk_size` values each.
+
+        `mmap_chunk_size` is the maximum number of values per chunk and must be positive.
+        """
+        if len(page.dataoffsets) > 1:
+
+            # There are multiple segments that can be processed consecutively
+            for segment in Tiff._read_segments(page):
+                yield segment.reshape(-1)
+
+        else:
+
+            # The page can be memory-mapped and processed chunk-wise
+            arr = page.asarray(out="memmap")  # No considerable amounts of memory should be allocated here
+            arr_flat = arr.reshape(-1)  # This should only produce a view without any new allocations
+
+            # Slice into chunks of fixed size. Note that `np.array_split(arr_flat, n)` would produce `n` sections
+            # (whose size grows with the size of the image), not sections of `n` values.
+            for start in range(0, len(arr_flat), mmap_chunk_size):
+                yield arr_flat[start : start + mmap_chunk_size]
+
+    @staticmethod
+    def _read_segments(page: Union[tifffile.TiffPage, tifffile.TiffFrame]) -> Iterator["np.typing.NDArray"]:
+        """
+        Generator that yields the decoded segments of a TIFF page, one at a time.
+
+        The raw bytes of each segment are read from the file handle of the parent of the page, using the offsets and
+        byte counts reported by the page. Segments that are decoded to `None` are skipped.
+        """
+        filehandle = page.parent.filehandle
+        offsets_and_sizes = zip(page.dataoffsets, page.databytecounts)
+        for index, (offset, size) in enumerate(offsets_and_sizes):
+
+            # Fetch the raw bytes of the segment, then decode them (the decoded data is the first item of the result)
+            filehandle.seek(offset)
+            raw_bytes = filehandle.read(size)
+            decoded = page.decode(raw_bytes, index)[0]
+
+            # No idea how `None` might occur here, but mypy demands that we check it, just to be sure
+            if decoded is not None:
+                yield decoded
+
     def sniff(self, filename: str) -> bool:
         with tifffile.TiffFile(filename):
             return True

diff --git a/lib/galaxy/datatypes/test/im9_multipage.tif b/lib/galaxy/datatypes/test/im9_multipage.tif
diff --git a/lib/galaxy/datatypes/test/im9_multiseries.tif b/lib/galaxy/datatypes/test/im9_multiseries.tif
@@ -0,0 +1 @@
+../../../../test-data/im9_multiseries.tif
diff --git a/lib/galaxy/dependencies/pinned-requirements.txt b/lib/galaxy/dependencies/pinned-requirements.txt
@@ -141,6 +141,7 @@ pykwalify==1.8.0
 pylibmagic==0.5.0
 pynacl==1.5.0
 pyparsing==3.2.1
+pypng==0.20220715.0
 pyreadline3==3.5.4 ; sys_platform == 'win32'
 pysam==0.23.0
 python-dateutil==2.9.0.post0

diff --git a/packages/data/setup.cfg b/packages/data/setup.cfg
@@ -54,6 +54,7 @@ install_requires =
     pycryptodome
     pydantic[email]>=2.7.4
     pylibmagic
+    pypng
     python-magic
     pysam>=0.21
     rocrate

diff --git a/pyproject.toml b/pyproject.toml
@@ -75,6 +75,7 @@ dependencies = [
     "pykwalify",
     "pylibmagic",
     "pyparsing",
+    "pypng",
     "pysam>=0.21",  # for Python 3.11 support on macOS
     "python-dateutil",
     "python-magic",

diff --git a/test-data/im8_uint16.tif b/test-data/im8_uint16.tif
diff --git a/test-data/im9_multipage.tif → test-data/im9_multiseries.tif b/test-data/im9_multipage.tif → test-data/im9_multiseries.tif
diff --git a/test/unit/data/datatypes/test_images.py b/test/unit/data/datatypes/test_images.py
@@ -6,6 +6,7 @@
 from galaxy.datatypes.images import (
     Image,
     Pdf,
+    Png,
     Tiff,
 )
 from .util import (
@@ -61,7 +62,7 @@ def __assert_empty_metadata(metadata):
         assert getattr(metadata, key, None) is None
 
 
-# Tests with TIFF files
+# Tests for `Tiff` class
 
 test_tiff_axes_yx = __create_test(Tiff, "im1_uint8.tif", "axes", "YX")
 test_tiff_axes_zcyx = __create_test(Tiff, "im6_uint8.tif", "axes", "ZCYX")
@@ -103,8 +104,8 @@ def test_tiff_unsupported_compression(metadata):
     assert getattr(metadata, "num_unique_values", None) is None
 
 
-@__test(Tiff, "im9_multipage.tif")
-def test_tiff_multipage(metadata):
+@__test(Tiff, "im9_multiseries.tif")
+def test_tiff_multiseries(metadata):
     assert metadata.axes == ["YXS", "YX"]
     assert metadata.dtype == ["uint8", "uint16"]
     assert metadata.num_unique_values == [2, 255]
@@ -115,13 +116,13 @@ def test_tiff_multipage(metadata):
     assert metadata.frames == [0, 0]
 
 
-# Tests with PNG files
+# Tests for `Image` class
 
 test_png_axes_yx = __create_test(Image, "im1_uint8.png", "axes", "YX")
 test_png_axes_yxc = __create_test(Image, "im3_a.png", "axes", "YXC")
 test_png_dtype_uint8 = __create_test(Image, "im1_uint8.png", "dtype", "uint8")
-test_png_num_unique_values_1 = __create_test(Image, "im2_a.png", "num_unique_values", 1)
-test_png_num_unique_values_2 = __create_test(Image, "im2_b.png", "num_unique_values", 2)
+test_png_num_unique_values_1 = __create_test(Image, "im2_a.png", "num_unique_values", None)
+test_png_num_unique_values_2 = __create_test(Image, "im2_b.png", "num_unique_values", None)
 test_png_width_32 = __create_test(Image, "im2_b.png", "width", 32)
 test_png_height_32 = __create_test(Image, "im2_b.png", "height", 32)
 test_png_channels_0 = __create_test(Image, "im1_uint8.png", "channels", 0)
@@ -130,6 +131,21 @@ def test_tiff_multipage(metadata):
 test_png_frames_1 = __create_test(Image, "im1_uint8.png", "frames", 1)
 
 
+# Tests for `Png` class (the names must differ from the `Image` tests, which same-named assignments would shadow)
+
+test_png_class_axes_yx = __create_test(Png, "im1_uint8.png", "axes", "YX")
+test_png_class_axes_yxc = __create_test(Png, "im3_a.png", "axes", "YXC")
+test_png_class_dtype_uint8 = __create_test(Png, "im1_uint8.png", "dtype", "uint8")
+test_png_class_num_unique_values_1 = __create_test(Png, "im2_a.png", "num_unique_values", 1)
+test_png_class_num_unique_values_2 = __create_test(Png, "im2_b.png", "num_unique_values", 2)
+test_png_class_width_32 = __create_test(Png, "im2_b.png", "width", 32)
+test_png_class_height_32 = __create_test(Png, "im2_b.png", "height", 32)
+test_png_class_channels_0 = __create_test(Png, "im1_uint8.png", "channels", 0)
+test_png_class_channels_3 = __create_test(Png, "im3_a.png", "channels", 3)
+test_png_class_depth_0 = __create_test(Png, "im1_uint8.png", "depth", 0)
+test_png_class_frames_1 = __create_test(Png, "im1_uint8.png", "frames", 1)
+
+
 # Test with files that neither Pillow nor tifffile can open