From 4bb4280144ac79617be421170eec966a430aabc4 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Fri, 8 Nov 2024 10:21:15 -0500 Subject: [PATCH 1/7] Add new optional style argument in FrameCache writer --- hexrd/imageseries/save.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hexrd/imageseries/save.py b/hexrd/imageseries/save.py index 092d30aa6..d16248d12 100644 --- a/hexrd/imageseries/save.py +++ b/hexrd/imageseries/save.py @@ -211,19 +211,26 @@ class WriteFrameCache(Writer): cache_file: str or Path, optional name of the npz file to save the image data, if not given in the `fname` argument; for YAML format (deprecated), this is required + style: str, type of file to use for saving. options are: + - 'npz' for saving in a numpy compressed file + - 'fch5' for saving in the HDF5-based frame-cache format max_workers: int, optional The max number of worker threads for multithreading. Defaults to the number of CPUs. """ fmt = 'frame-cache' - def __init__(self, ims, fname, **kwargs): + def __init__(self, ims, fname, style='npz', **kwargs): Writer.__init__(self, ims, fname, **kwargs) self._thresh = self._opts['threshold'] self._cache, self.cachename = self._set_cache() ncpus = multiprocessing.cpu_count() self.max_workers = kwargs.get('max_workers', ncpus) + supported_formats = ['npz','cfh5'] + if style not in supported_formats: + raise TypeError(f"Unknown file style for writing framecache: {style}. 
Supported formats are {supported_formats}") + self.style = style def _set_cache(self): @@ -274,6 +281,10 @@ def _write_yml(self): yaml.safe_dump(info, f) def _write_frames(self): + if self.style == 'npz': + self._write_frames_npz() + + def _write_frames_npz(self): """also save shape array as originally done (before yaml)""" buff_size = self._ims.shape[0]*self._ims.shape[1] arrd = {} From 43f23add46c2f556402698c02344be4c5d17f420 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Fri, 8 Nov 2024 12:19:09 -0500 Subject: [PATCH 2/7] Add hdf5plugin to requirements and conda meta.yaml Gives access to a plethora of compression algorithms for hdf5 files --- conda.recipe/meta.yaml | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index e6ae499e6..b8c64eedc 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -42,6 +42,7 @@ requirements: - lxml >=4.9.2 - fast-histogram - h5py + - hdf5plugin - lmfit - matplotlib-base - numba diff --git a/setup.py b/setup.py index db7c45a6e..7e0e0ef59 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ 'fast-histogram', 'h5py<3.12', # Currently, h5py 3.12 on Windows fails to import. # We can remove this version pin when that is fixed. + 'hdf5plugin', 'lmfit', 'matplotlib', 'numba', From 39cc26a3558760914717f1aaf203905fcd988ef3 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Fri, 8 Nov 2024 12:25:30 -0500 Subject: [PATCH 3/7] Introduce fch5 format for saving framecaches .fch5 is an hdf5-based format for saving framecaches, composed mainly of 3 datasets: - 'data': (m,1) array holding the data values of all frames. `m` is determined at runtime - 'indices': (m,2) array holding the row & col information for the values in 'data'. 'data' together with 'indices' represent the data as (row, col, value) triplets (the coordinate/COO format for sparse matrices). - 'frame_ids': (2*nframes,) array that holds the range that the i-th frame occupies in the above arrays. i.e. 
the information of the i-th frame can be accessed using: data_i = data[frame_ids[2*i]:frame_ids[2*i+1]] and indices_i = indices[frame_ids[2*i]:frame_ids[2*i+1]] --- hexrd/imageseries/save.py | 185 ++++++++++++++++++++++++++++++++------ 1 file changed, 158 insertions(+), 27 deletions(-) diff --git a/hexrd/imageseries/save.py b/hexrd/imageseries/save.py index d16248d12..bac59afc1 100644 --- a/hexrd/imageseries/save.py +++ b/hexrd/imageseries/save.py @@ -9,11 +9,13 @@ import numpy as np import h5py +import hdf5plugin import yaml from hexrd.matrixutil import extract_ijv +from hexrd.utils.hdf5 import unwrap_dict_to_h5 -MAX_NZ_FRACTION = 0.1 # 10% sparsity trigger for frame-cache write +MAX_NZ_FRACTION = 0.1 # 10% sparsity trigger for frame-cache write # ============================================================================= @@ -42,7 +44,6 @@ def write(ims, fname, fmt, **kwargs): # Registry class _RegisterWriter(abc.ABCMeta): - def __init__(cls, name, bases, attrs): abc.ABCMeta.__init__(cls, name, bases, attrs) _Registry.register(cls) @@ -50,6 +51,7 @@ def __init__(cls, name, bases, attrs): class _Registry(object): """Registry for imageseries writers""" + writer_registry = dict() @classmethod @@ -76,6 +78,7 @@ class Writer(object, metaclass=_RegisterWriter): kwargs: dict options specific to format """ + fmt = None def __init__(self, ims, fname, **kwargs): @@ -111,6 +114,7 @@ def fname_dir(self): def opts(self): return self._opts + class WriteH5(Writer): """Write imageseries in HDF5 file @@ -129,6 +133,7 @@ class WriteH5(Writer): shuffle: bool shuffle HDF5 data """ + fmt = 'hdf5' dflt_gzip = 1 dflt_chrows = 0 @@ -151,8 +156,9 @@ def write(self): g = f.create_group(self._path) s0, s1 = self._shape - ds = g.create_dataset('images', (self._nframes, s0, s1), self._dtype, - **self.h5opts) + ds = g.create_dataset( + 'images', (self._nframes, s0, s1), self._dtype, **self.h5opts + ) for i in range(self._nframes): ds[i, :, :] = self._ims[i] @@ -218,6 +224,7 @@ class 
WriteFrameCache(Writer): The max number of worker threads for multithreading. Defaults to the number of CPUs. """ + fmt = 'frame-cache' def __init__(self, ims, fname, style='npz', **kwargs): @@ -227,13 +234,17 @@ def __init__(self, ims, fname, style='npz', **kwargs): ncpus = multiprocessing.cpu_count() self.max_workers = kwargs.get('max_workers', ncpus) - supported_formats = ['npz','cfh5'] + supported_formats = ['npz', 'fch5'] if style not in supported_formats: - raise TypeError(f"Unknown file style for writing framecache: {style}. Supported formats are {supported_formats}") + raise TypeError( + f"Unknown file style for writing framecache: {style}. " + f"Supported formats are {supported_formats}" + ) self.style = style - def _set_cache(self): + self.hdf5_compression = hdf5plugin.Blosc(cname="zstd", clevel=5) + def _set_cache(self): cf = self.opts.get('cache_file') if cf is None: @@ -274,19 +285,40 @@ def _process_meta(self, save_omegas=False): return d def _write_yml(self): - datad = {'file': self._cachename, 'dtype': str(self._ims.dtype), - 'nframes': len(self._ims), 'shape': list(self._ims.shape)} + datad = { + 'file': self._cachename, + 'dtype': str(self._ims.dtype), + 'nframes': len(self._ims), + 'shape': list(self._ims.shape), + } info = {'data': datad, 'meta': self._process_meta(save_omegas=True)} with open(self._fname, "w") as f: yaml.safe_dump(info, f) def _write_frames(self): if self.style == 'npz': - self._write_frames_npz() + self._write_frames_npz() + elif self.style == 'fch5': + self._write_frames_fch5() + + def _check_sparsity(self, frame_id, count, buff_size): + # check the sparsity + # + # FIXME: formalize this a little better + # ???: maybe set a hard limit of total nonzeros for the imageseries + # ???: could pass as a kwarg on open + fullness = count / float(buff_size) + if fullness > MAX_NZ_FRACTION: + sparseness = 100.0 * (1 - fullness) + msg = "frame %d is %4.2f%% sparse (cutoff is 95%%)" % ( + frame_id, + sparseness, + ) + warnings.warn(msg) 
def _write_frames_npz(self): """also save shape array as originally done (before yaml)""" - buff_size = self._ims.shape[0]*self._ims.shape[1] + buff_size = self._ims.shape[0] * self._ims.shape[1] arrd = {} num_workers = min(self.max_workers, len(self._ims)) @@ -308,20 +340,9 @@ def extract_data(i): vals = val_buffers[buffer_id] # wrapper to find (sparse) pixels above threshold - count = extract_ijv(self._ims[i], self._thresh, - rows, cols, vals) - - # check the sparsity - # - # FIXME: formalize this a little better - # ???: maybe set a hard limit of total nonzeros for the imageseries - # ???: could pass as a kwarg on open - fullness = count / float(buff_size) - if fullness > MAX_NZ_FRACTION: - sparseness = 100.*(1 - fullness) - msg = "frame %d is %4.2f%% sparse (cutoff is 95%%)" \ - % (i, sparseness) - warnings.warn(msg) + count = extract_ijv(self._ims[i], self._thresh, rows, cols, vals) + + self._check_sparsity(i, count, buff_size) arrd[f'{i}_row'] = rows[:count].copy() arrd[f'{i}_col'] = cols[:count].copy() @@ -342,6 +363,117 @@ def extract_data(i): arrd.update(self._process_meta()) np.savez_compressed(self.cache, **arrd) + def _write_frames_fch5(self): + """Write framecache into an hdf5 file. The file will use three + datasets for the framecache: + - 'data': (m,1) array holding the datavalues of all frames. `m` is + evaluated upon runtime + - 'indices': (m,2) array holding the row& col information for the + values in data. 'data' together within 'indices' represent tha data + using the CSR format for sparse matrices. + - 'frame_ids`: (2*nframes) holds the range that the i-th frame + occupies in the above arrays. i.e. 
the information of the i-th frame + can be accessed using: + + data_i = data[frame_ids[2*i]:frame_ids[2*i+1]] and + indices_i = indices[frame_ids[2*i]:frame_ids[2*i+1]] + """ + max_frame_size = self._ims.shape[0] * self._ims.shape[1] + nframes = len(self._ims) + shape = self._ims.shape + data_dtype = self._ims.dtype + + frame_indices = np.empty((2 * nframes,), dtype=np.uint64) + data_dataset = None + indices_dataset = None + file_position = 0 + total_size = 0 + + common_lock = threading.Lock() + thread_local = threading.local() + + # creating an array in memory will fail if data is too big or threshold + # too low, so we write to the file while iterating the frames + with h5py.File(self.cache, "w") as h5f: + h5f.attrs['HEXRD_FRAMECACHE_VERSION'] = 1 + h5f["shape"] = shape + h5f["nframes"] = nframes + h5f["dtype"] = str(self._ims.dtype).encode() + metadata = h5f.create_group("metadata") + unwrap_dict_to_h5(metadata, self._meta.copy()) + + def initialize_buffers(): + thread_local.data = np.empty( + (max_frame_size, 1), dtype=self._ims.dtype + ) + thread_local.indices = np.empty( + (max_frame_size, 2), dtype=np.uint16 + ) + + def single_array_write_thread(i): + nonlocal file_position, total_size + im = self._ims[i] + row_slice = thread_local.indices[:, 0] + col_slice = thread_local.indices[:, 1] + data_slice = thread_local.data[:, 0] + count = extract_ijv( + im, self._thresh, row_slice, col_slice, data_slice + ) + + self._check_sparsity(i, count, max_frame_size) + + # get the range this thread is doing to write into the file + start_file = 0 + end_file = 0 + with common_lock: + start_file = file_position + file_position += count + end_file = file_position + total_size += end_file - start_file + # write within the appropriate ranges + data_dataset[start_file:end_file, :] = thread_local.data[ + :count, : + ] + indices_dataset[start_file:end_file, :] = thread_local.indices[ + :count, : + ] + frame_indices[2 * i] = start_file + frame_indices[2 * i + 1] = end_file + + 
kwargs = { + "max_workers": self.max_workers, + "initializer": initialize_buffers, + } + + data_dataset = h5f.create_dataset( + "data", + shape=(nframes * max_frame_size, 1), + dtype=data_dtype, + compression=self.hdf5_compression, + ) + indices_dataset = h5f.create_dataset( + "indices", + shape=(nframes * max_frame_size, 2), + dtype=np.uint16, + compression=self.hdf5_compression, + ) + with ThreadPoolExecutor(**kwargs) as executor: + # Evaluate the results via `list()`, so that if an exception is + # raised in a thread, it will be re-raised and visible to + # the user. + list(executor.map(single_array_write_thread, range(nframes))) + + # update the sizes of the dataset to match the amount of data + # that have been actually written + data_dataset.resize(total_size, axis=0) + indices_dataset.resize(total_size, axis=0) + + h5f.create_dataset( + "frame_ids", + data=frame_indices, + compression=self.hdf5_compression, + ) + def write(self, output_yaml=False): """writes frame cache for imageseries @@ -350,7 +482,6 @@ def write(self, output_yaml=False): self._write_frames() if output_yaml: warnings.warn( - "YAML output for frame-cache is deprecated", - DeprecationWarning + "YAML output for frame-cache is deprecated", DeprecationWarning ) self._write_yml() From ab11ca4f621e5650f7f3da600baeb0ac1795582a Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Fri, 8 Nov 2024 17:41:05 -0500 Subject: [PATCH 4/7] fch5: add parallel reader --- hexrd/imageseries/load/framecache.py | 82 +++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/hexrd/imageseries/load/framecache.py b/hexrd/imageseries/load/framecache.py index 93b3a8ad5..5c139818a 100644 --- a/hexrd/imageseries/load/framecache.py +++ b/hexrd/imageseries/load/framecache.py @@ -6,10 +6,16 @@ import numpy as np from scipy.sparse import csr_matrix import yaml +import h5py from . 
import ImageSeriesAdapter from ..imageseriesiter import ImageSeriesIterator from .metadata import yamlmeta +from hexrd.utils.hdf5 import unwrap_h5_to_dict + +import multiprocessing +from concurrent.futures import ThreadPoolExecutor + class FrameCacheImageSeriesAdapter(ImageSeriesAdapter): """collection of images in HDF5 format""" @@ -26,13 +32,25 @@ def __init__(self, fname, style='npz', **kwargs): self._framelist = [] self._framelist_was_loaded = False self._load_framelist_lock = Lock() + # TODO extract style from filename ? + self._style = style.lower() + + ncpus = multiprocessing.cpu_count() + self._max_workers = kwargs.get('max_workers', ncpus) - if style.lower() in ('yml', 'yaml', 'test'): + if self._style in ('yml', 'yaml', 'test'): self._from_yml = True self._load_yml() - else: + elif self._style == "npz": self._from_yml = False self._load_cache() + elif self._style == "fch5": + self._from_yml = False + self._load_cache() + else: + raise TypeError(f"Unknown style format for loading data: {style}." + "Known style formats: 'npz', 'fch5' 'yml', ", + "'yaml', 'test'") def _load_yml(self): with open(self._fname, "r") as f: @@ -45,6 +63,29 @@ def _load_yml(self): self._meta = yamlmeta(d['meta'], path=self._cache) def _load_cache(self): + if self._style == 'fch5': + self._load_cache_fch5() + else: + self._load_cache_npz() + + def _load_cache_fch5(self): + with h5py.File(self._fname, "r") as file: + if 'HEXRD_FRAMECACHE_VERSION' not in file.attrs.keys(): + raise NotImplementedError("Unsupported file. 
" + "HEXRD_FRAMECACHE_VERSION " + "is missing!") + version = file.attrs.get('HEXRD_FRAMECACHE_VERSION', 0) + if version != 1: + raise NotImplementedError("Framecache version is not " + f"supported: {version}") + + self._shape = file["shape"][()] + self._nframes = file["nframes"][()] + self._dtype = np.dtype(file["dtype"][()]) + self._meta = {} + unwrap_h5_to_dict(file["metadata"], self._meta) + + def _load_cache_npz(self): arrs = np.load(self._fname) # HACK: while the loaded npz file has a getitem method # that mimicks a dict, it doesn't have a "pop" method. @@ -79,6 +120,41 @@ def _load_cache(self): def _load_framelist(self): """load into list of csr sparse matrices""" + if self._style == 'fch5': + self._load_framelist_fch5() + else: + self._load_framelist_npz() + + def _load_framelist_fch5(self): + self._framelist = [None] * self._nframes + with h5py.File(self._fname, "r") as file: + frame_id = file["frame_ids"] + data = file["data"] + indices = file["indices"] + + def read_list_arrays_method_thread(i): + frame_data = data[frame_id[2*i]: frame_id[2*i+1]] + frame_indices = indices[frame_id[2*i]: frame_id[2*i+1]] + row = frame_indices[:, 0] + col = frame_indices[:, 1] + mat_data = frame_data[:, 0] + frame = csr_matrix((mat_data, (row, col)), + shape=self._shape, + dtype=self._dtype) + self._framelist[i] = frame + return + + kwargs = { + "max_workers": self._max_workers, + } + with ThreadPoolExecutor(**kwargs) as executor: + # Evaluate the results via `list()`, so that if an exception is + # raised in a thread, it will be re-raised and visible to the + # user. 
+ list(executor.map(read_list_arrays_method_thread, + range(self._nframes))) + + def _load_framelist_npz(self): self._framelist = [] if self._from_yml: bpath = os.path.dirname(self._fname) @@ -149,6 +225,6 @@ def __getitem__(self, key): def __iter__(self): return ImageSeriesIterator(self) - #@memoize + # @memoize def __len__(self): return self._nframes From a4968ce6a4e485b09558b89063c939e72483c345 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Fri, 8 Nov 2024 18:47:22 -0500 Subject: [PATCH 5/7] fch5: Add tests --- tests/imageseries/test_formats.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/imageseries/test_formats.py b/tests/imageseries/test_formats.py index ae33bb8f8..977b62112 100644 --- a/tests/imageseries/test_formats.py +++ b/tests/imageseries/test_formats.py @@ -85,6 +85,7 @@ def setUp(self): self.fcfile = os.path.join(self.tmpdir, 'frame-cache.npz') self.fmt = 'frame-cache' self.thresh = 0.5 + self.style = 'npz' self.cache_file='frame-cache.npz' _, self.is_a = make_array_ims() @@ -93,9 +94,9 @@ def tearDown(self): def test_fmtfc(self): """save/load frame-cache format""" - imageseries.write(self.is_a, self.fcfile, self.fmt, + imageseries.write(self.is_a, self.fcfile, self.fmt, style=self.style, threshold=self.thresh, cache_file=self.cache_file) - is_fc = imageseries.open(self.fcfile, self.fmt) + is_fc = imageseries.open(self.fcfile, self.fmt, style=self.style) diff = compare(self.is_a, is_fc) self.assertAlmostEqual(diff, 0., "frame-cache reconstruction failed") self.assertTrue(compare_meta(self.is_a, is_fc)) @@ -104,9 +105,9 @@ def test_fmtfc_nocache_file(self): """save/load frame-cache format with no cache_file arg""" imageseries.write( self.is_a, self.fcfile, self.fmt, - threshold=self.thresh + threshold=self.thresh, style=self.style ) - is_fc = imageseries.open(self.fcfile, self.fmt) + is_fc = imageseries.open(self.fcfile, self.fmt, style=self.style) diff = compare(self.is_a, is_fc) 
self.assertAlmostEqual(diff, 0., "frame-cache reconstruction failed") self.assertTrue(compare_meta(self.is_a, is_fc)) @@ -117,11 +118,22 @@ def test_fmtfc_nparray(self): npa = np.array([0,2.0,1.3]) self.is_a.metadata[key] = npa - imageseries.write(self.is_a, self.fcfile, self.fmt, + imageseries.write(self.is_a, self.fcfile, self.fmt, style=self.style, threshold=self.thresh, cache_file=self.cache_file ) - is_fc = imageseries.open(self.fcfile, self.fmt) + is_fc = imageseries.open(self.fcfile, self.fmt, style=self.style) meta = is_fc.metadata diff = np.linalg.norm(meta[key] - npa) self.assertAlmostEqual(diff, 0., "frame-cache numpy array metadata failed") + + +class TestFormatFrameCache_FCH5(TestFormatFrameCache): + + def setUp(self): + self.fcfile = os.path.join(self.tmpdir, 'frame-cache.fch5') + self.fmt = 'frame-cache' + self.style = 'fch5' + self.thresh = 0.5 + self.cache_file = 'frame-cache.fch5' + _, self.is_a = make_array_ims() From 01e6bbeddcfaed4cc48cd3046adc7a7d53ee5225 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Tue, 12 Nov 2024 09:38:55 -0500 Subject: [PATCH 6/7] fch5: Add test with nested metadata --- tests/imageseries/test_formats.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/imageseries/test_formats.py b/tests/imageseries/test_formats.py index 977b62112..6d03c6453 100644 --- a/tests/imageseries/test_formats.py +++ b/tests/imageseries/test_formats.py @@ -137,3 +137,13 @@ def setUp(self): self.thresh = 0.5 self.cache_file = 'frame-cache.fch5' _, self.is_a = make_array_ims() + + def test_fmtfc_nested_metadata(self): + """frame-cache format with nested metadata""" + metadata = {'int': 1, 'array': np.array([1, 2, 3])} + self.is_a.metadata["key"] = metadata + + imageseries.write(self.is_a, self.fcfile, self.fmt, style=self.style, + threshold=self.thresh, cache_file=self.cache_file) + is_fc = imageseries.open(self.fcfile, self.fmt, style=self.style) + self.assertTrue(compare_meta(self.is_a, is_fc)) From 
301a673b52235f8b1a32a4e130c096ce865d49c4 Mon Sep 17 00:00:00 2001 From: Christos Tsolakis Date: Wed, 13 Nov 2024 18:26:51 -0500 Subject: [PATCH 7/7] fch5: save dtype as encoded string --- hexrd/imageseries/load/framecache.py | 3 ++- hexrd/imageseries/save.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hexrd/imageseries/load/framecache.py b/hexrd/imageseries/load/framecache.py index 5c139818a..a42fc4d11 100644 --- a/hexrd/imageseries/load/framecache.py +++ b/hexrd/imageseries/load/framecache.py @@ -12,6 +12,7 @@ from ..imageseriesiter import ImageSeriesIterator from .metadata import yamlmeta from hexrd.utils.hdf5 import unwrap_h5_to_dict +from hexrd.utils.compatibility import h5py_read_string import multiprocessing from concurrent.futures import ThreadPoolExecutor @@ -81,7 +82,7 @@ def _load_cache_fch5(self): self._shape = file["shape"][()] self._nframes = file["nframes"][()] - self._dtype = np.dtype(file["dtype"][()]) + self._dtype = np.dtype(h5py_read_string(file["dtype"])) self._meta = {} unwrap_h5_to_dict(file["metadata"], self._meta) diff --git a/hexrd/imageseries/save.py b/hexrd/imageseries/save.py index bac59afc1..ea669c9a0 100644 --- a/hexrd/imageseries/save.py +++ b/hexrd/imageseries/save.py @@ -398,7 +398,7 @@ def _write_frames_fch5(self): h5f.attrs['HEXRD_FRAMECACHE_VERSION'] = 1 h5f["shape"] = shape h5f["nframes"] = nframes - h5f["dtype"] = str(self._ims.dtype).encode() + h5f["dtype"] = str(np.dtype(self._ims.dtype)).encode("utf-8") metadata = h5f.create_group("metadata") unwrap_dict_to_h5(metadata, self._meta.copy())