Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Populate image metadata without allocating memory for the entire image content #19830

Merged
merged 18 commits into from
Mar 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 105 additions & 23 deletions lib/galaxy/datatypes/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
from typing import (
Any,
Dict,
Iterator,
List,
Optional,
Tuple,
Union,
)

import mrcfile
import numpy as np
import png
import tifffile

try:
Expand Down Expand Up @@ -154,32 +157,32 @@ def set_meta(
self, dataset: DatasetProtocol, overwrite: bool = True, metadata_tmp_files_dir: Optional[str] = None, **kwd
) -> None:
"""
Try to populate the metadata of the image using a generic image loading library (pillow), if available.
Try to populate the metadata of the image using a generic image loading library (Pillow), if available.

If an image has two axes, they are assumed to be ``YX``. If an image has three axes, they are assumed to be ``YXC``.

The metadata element `num_unique_values` remains unset.
"""
if PIL is not None:
try:
with PIL.Image.open(dataset.get_file_name()) as im:
im_shape, im_typestr = PIL.Image._conv_type_shape(im)
im_ndim = len(im_shape)

# Determine the metadata values that are available without loading the image data
dataset.metadata.width = im.size[1]
dataset.metadata.height = im.size[0]
dataset.metadata.depth = 0
dataset.metadata.frames = getattr(im, "n_frames", 0)
dataset.metadata.num_unique_values = sum(val > 0 for val in im.histogram())

# Peek into a small 2x2 section of the image data
im_peek_arr = np.array(im.crop((0, 0, min((2, im.size[1])), min((2, im.size[0])))))
dataset.metadata.dtype = str(np.array((0,), im_typestr).dtype)

# Determine the remaining metadata values
dataset.metadata.dtype = str(im_peek_arr.dtype)
if im_peek_arr.ndim == 2:
# Determine the remaining values, by assuming the order of axes
if im_ndim == 2:
dataset.metadata.axes = "YX"
dataset.metadata.channels = 0
elif im_peek_arr.ndim == 3:
elif im_ndim == 3:
dataset.metadata.axes = "YXC"
dataset.metadata.channels = im_peek_arr.shape[2]
dataset.metadata.channels = im_shape[2]

except PIL.UnidentifiedImageError:
pass
Expand All @@ -198,6 +201,29 @@ class Png(Image):
edam_format = "format_3603"
file_ext = "png"

def set_meta(
    self, dataset: DatasetProtocol, overwrite: bool = True, metadata_tmp_files_dir: Optional[str] = None, **kwd
) -> None:
    """
    Try to populate the metadata of the image using PyPNG.

    The base implementation is used to determine metadata elements that cannot be determined with PyPNG, but with
    Pillow. Afterwards, `num_unique_values` is determined by streaming the image row by row.

    Only 8-bit PNG without animations is supported. For any other image, `num_unique_values` is left exactly as the
    base implementation set it.
    """
    super().set_meta(dataset, overwrite, metadata_tmp_files_dir, **kwd)

    # Read the image data row by row, to avoid allocating memory for the entire image
    if dataset.metadata.dtype == "uint8" and dataset.metadata.frames in (0, 1):

        # Open the file ourselves, so that the handle is reliably closed once all rows have been consumed
        with open(dataset.get_file_name(), "rb") as png_file:
            reader = png.Reader(file=png_file)
            _width, _height, rows, _info = reader.asDirect()

            # A uint8 image has at most 256 distinct values, so a fixed-size presence table is sufficient and the
            # memory footprint does not depend on the image size
            seen = np.zeros(256, dtype=bool)
            for row in rows:
                seen[np.array(row, dtype="uint8")] = True

        dataset.metadata.num_unique_values = int(np.count_nonzero(seen))


class Tiff(Image):
edam_format = "format_3591"
Expand Down Expand Up @@ -245,25 +271,23 @@ def set_meta(
"num_unique_values",
]
}
for page in tif.series:

# TIFF files can contain multiple images, each represented by a series of pages
for series in tif.series:

# Determine the metadata values that should be generally available
metadata["axes"].append(page.axes.upper())
metadata["dtype"].append(str(page.dtype))
metadata["axes"].append(series.axes.upper())
metadata["dtype"].append(str(series.dtype))

axes = metadata["axes"][-1].replace("S", "C")
metadata["width"].append(Tiff._get_axis_size(page.shape, axes, "X"))
metadata["height"].append(Tiff._get_axis_size(page.shape, axes, "Y"))
metadata["channels"].append(Tiff._get_axis_size(page.shape, axes, "C"))
metadata["depth"].append(Tiff._get_axis_size(page.shape, axes, "Z"))
metadata["frames"].append(Tiff._get_axis_size(page.shape, axes, "T"))
metadata["width"].append(Tiff._get_axis_size(series.shape, axes, "X"))
metadata["height"].append(Tiff._get_axis_size(series.shape, axes, "Y"))
metadata["channels"].append(Tiff._get_axis_size(series.shape, axes, "C"))
metadata["depth"].append(Tiff._get_axis_size(series.shape, axes, "Z"))
metadata["frames"].append(Tiff._get_axis_size(series.shape, axes, "T"))

# Determine the metadata values that require reading the image data
try:
im_arr = page.asarray()
metadata["num_unique_values"].append(len(np.unique(im_arr)))
except ValueError: # Occurs if the compression of the TIFF file is unsupported
pass
metadata["num_unique_values"].append(Tiff._get_num_unique_values(series))

# Populate the metadata fields based on the values determined above
for key, values in metadata.items():
Expand Down Expand Up @@ -301,6 +325,64 @@ def _get_axis_size(shape: Tuple[int, ...], axes: str, axis: str) -> int:
idx = axes.find(axis)
return shape[idx] if idx >= 0 else 0

@staticmethod
def _get_num_unique_values(series: tifffile.TiffPageSeries) -> Optional[int]:
    """
    Determines the number of unique values in a TIFF series of pages.

    The pages are processed chunk by chunk (see `_read_chunks`), and only the set of distinct values found so far is
    kept between chunks. The values stay in NumPy arrays throughout, instead of being converted to Python lists,
    which would allocate one Python object per pixel of a chunk.

    Returns the number of distinct values (0 if the series yields no data), or `None` if a `ValueError` is raised
    while reading the data.
    """
    unique_values: Optional["np.typing.NDArray"] = None
    try:
        for page in series.pages:

            if page is None:
                continue  # No idea how this might occur, but mypy demands that we check it, just to be sure

            for chunk in Tiff._read_chunks(page):
                chunk_values = np.unique(chunk)
                if unique_values is None:
                    unique_values = chunk_values
                else:
                    unique_values = np.union1d(unique_values, chunk_values)

        return 0 if unique_values is None else len(unique_values)
    except ValueError:
        return None  # Occurs if the compression of the TIFF file is unsupported

@staticmethod
def _read_chunks(
    page: Union[tifffile.TiffPage, tifffile.TiffFrame], mmap_chunk_size: int = 2**14
) -> Iterator["np.typing.NDArray"]:
    """
    Generator that reads all chunks of values from a TIFF page.

    If the page consists of multiple segments, each decoded segment is yielded as a flat array. Otherwise, the page
    is memory-mapped and yielded as consecutive flat slices of at most `mmap_chunk_size` elements each.

    Raises `ValueError` if `mmap_chunk_size` is not positive and the memory-mapped path has to split the data.
    """
    if len(page.dataoffsets) > 1:

        # There are multiple segments that can be processed consecutively
        for segment in Tiff._read_segments(page):
            yield segment.reshape(-1)

    else:

        # The page can be memory-mapped and processed chunk-wise
        arr = page.asarray(out="memmap")  # No considerable amounts of memory should be allocated here
        arr_flat = arr.reshape(-1)  # This should only produce a view without any new allocations
        if len(arr_flat) <= mmap_chunk_size:
            yield arr_flat
        else:
            if mmap_chunk_size <= 0:
                raise ValueError("mmap_chunk_size must be positive")

            # Slice by element count. `np.array_split(arr, n)` would instead split into `n` sections, so that the
            # size of each chunk would grow with the size of the image.
            for start in range(0, len(arr_flat), mmap_chunk_size):
                yield arr_flat[start : start + mmap_chunk_size]

@staticmethod
def _read_segments(page: Union[tifffile.TiffPage, tifffile.TiffFrame]) -> Iterator["np.typing.NDArray"]:
    """
    Generator that yields the decoded segments of a TIFF page, one after another.

    Each segment is located via the page's data offsets and byte counts, read from the file handle of the parent
    TIFF file, and decoded by the page itself. Segments that decode to `None` are skipped.
    """
    filehandle = page.parent.filehandle
    locations = zip(page.dataoffsets, page.databytecounts)
    for index, (offset, bytecount) in enumerate(locations):
        filehandle.seek(offset)
        raw = filehandle.read(bytecount)
        decoded = page.decode(raw, index)[0]

        # No idea how `None` might occur, but mypy demands that we check it, just to be sure
        if decoded is not None:
            yield decoded

def sniff(self, filename: str) -> bool:
    """
    Report whether `filename` can be opened by tifffile.

    Returns `True` once `tifffile.TiffFile` has opened the file. No exception is handled here, so any error raised
    while opening the file propagates to the caller.
    NOTE(review): presumably the caller treats such an exception as "not a TIFF file" -- confirm.
    """
    # The file is only opened and immediately closed again; no image data is read here
    with tifffile.TiffFile(filename):
        return True
Expand Down
1 change: 0 additions & 1 deletion lib/galaxy/datatypes/test/im9_multipage.tif

This file was deleted.

1 change: 1 addition & 0 deletions lib/galaxy/datatypes/test/im9_multiseries.tif
1 change: 1 addition & 0 deletions lib/galaxy/dependencies/pinned-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ pykwalify==1.8.0
pylibmagic==0.5.0
pynacl==1.5.0
pyparsing==3.2.1
pypng==0.20220715.0
pyreadline3==3.5.4 ; sys_platform == 'win32'
pysam==0.23.0
python-dateutil==2.9.0.post0
Expand Down
1 change: 1 addition & 0 deletions packages/data/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ install_requires =
pycryptodome
pydantic[email]>=2.7.4
pylibmagic
pypng
python-magic
pysam>=0.21
rocrate
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ dependencies = [
"pykwalify",
"pylibmagic",
"pyparsing",
"pypng",
"pysam>=0.21", # for Python 3.11 support on macOS
"python-dateutil",
"python-magic",
Expand Down
Binary file modified test-data/im8_uint16.tif
Binary file not shown.
File renamed without changes.
28 changes: 22 additions & 6 deletions test/unit/data/datatypes/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from galaxy.datatypes.images import (
Image,
Pdf,
Png,
Tiff,
)
from .util import (
Expand Down Expand Up @@ -61,7 +62,7 @@ def __assert_empty_metadata(metadata):
assert getattr(metadata, key, None) is None


# Tests with TIFF files
# Tests for `Tiff` class

test_tiff_axes_yx = __create_test(Tiff, "im1_uint8.tif", "axes", "YX")
test_tiff_axes_zcyx = __create_test(Tiff, "im6_uint8.tif", "axes", "ZCYX")
Expand Down Expand Up @@ -103,8 +104,8 @@ def test_tiff_unsupported_compression(metadata):
assert getattr(metadata, "num_unique_values", None) is None


@__test(Tiff, "im9_multipage.tif")
def test_tiff_multipage(metadata):
@__test(Tiff, "im9_multiseries.tif")
def test_tiff_multiseries(metadata):
assert metadata.axes == ["YXS", "YX"]
assert metadata.dtype == ["uint8", "uint16"]
assert metadata.num_unique_values == [2, 255]
Expand All @@ -115,13 +116,13 @@ def test_tiff_multipage(metadata):
assert metadata.frames == [0, 0]


# Tests with PNG files
# Tests for `Image` class

test_png_axes_yx = __create_test(Image, "im1_uint8.png", "axes", "YX")
test_png_axes_yxc = __create_test(Image, "im3_a.png", "axes", "YXC")
test_png_dtype_uint8 = __create_test(Image, "im1_uint8.png", "dtype", "uint8")
test_png_num_unique_values_1 = __create_test(Image, "im2_a.png", "num_unique_values", 1)
test_png_num_unique_values_2 = __create_test(Image, "im2_b.png", "num_unique_values", 2)
test_png_num_unique_values_1 = __create_test(Image, "im2_a.png", "num_unique_values", None)
test_png_num_unique_values_2 = __create_test(Image, "im2_b.png", "num_unique_values", None)
test_png_width_32 = __create_test(Image, "im2_b.png", "width", 32)
test_png_height_32 = __create_test(Image, "im2_b.png", "height", 32)
test_png_channels_0 = __create_test(Image, "im1_uint8.png", "channels", 0)
Expand All @@ -130,6 +131,21 @@ def test_tiff_multipage(metadata):
test_png_frames_1 = __create_test(Image, "im1_uint8.png", "frames", 1)


# Tests for `Png` class

# The names are prefixed with `test_png_class_` so that they do not rebind the identically named `test_png_*` tests of
# the `Image` class in this module (pytest only collects the last object bound to a module-level name).
test_png_class_axes_yx = __create_test(Png, "im1_uint8.png", "axes", "YX")
test_png_class_axes_yxc = __create_test(Png, "im3_a.png", "axes", "YXC")
test_png_class_dtype_uint8 = __create_test(Png, "im1_uint8.png", "dtype", "uint8")
test_png_class_num_unique_values_1 = __create_test(Png, "im2_a.png", "num_unique_values", 1)
test_png_class_num_unique_values_2 = __create_test(Png, "im2_b.png", "num_unique_values", 2)
test_png_class_width_32 = __create_test(Png, "im2_b.png", "width", 32)
test_png_class_height_32 = __create_test(Png, "im2_b.png", "height", 32)
test_png_class_channels_0 = __create_test(Png, "im1_uint8.png", "channels", 0)
test_png_class_channels_3 = __create_test(Png, "im3_a.png", "channels", 3)
test_png_class_depth_0 = __create_test(Png, "im1_uint8.png", "depth", 0)
test_png_class_frames_1 = __create_test(Png, "im1_uint8.png", "frames", 1)


# Test with files that neither Pillow nor tifffile can open


Expand Down
Loading