From a6073d410d170eeea0d7bd58b8016a0c9fc5b338 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 9 Nov 2020 15:07:34 +0000 Subject: [PATCH] Allow retrieving big detector data with cell IDs in the index instead of pulse IDs --- extra_data/components.py | 43 +++++++++++++++++++------- extra_data/tests/mockdata/detectors.py | 11 +++++-- extra_data/tests/test_components.py | 12 +++++++ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/extra_data/components.py b/extra_data/components.py index 329116a1..f4a6f0b5 100644 --- a/extra_data/components.py +++ b/extra_data/components.py @@ -38,7 +38,7 @@ def _guess_axes(data, train_pulse_ids, unstack_pulses): if unstack_pulses: # Separate train & pulse dimensions, and arrange dimensions # so that the data is contiguous in memory. - dim_order = ['train', 'pulse'] + dims[1:] + dim_order = train_pulse_ids.names + dims[1:] return arr.unstack('train_pulse').transpose(*dim_order) else: return arr @@ -239,7 +239,17 @@ def _select_pulse_indices(pulses, firsts, counts): return np.concatenate(positions) - def _get_module_pulse_data(self, source, key, pulses, unstack_pulses): + def _get_module_pulse_data(self, source, key, pulses, unstack_pulses, + inner_index='pulseId'): + def get_inner_ids(f, ix_name='pulseId'): + ids = f.file[f'/INSTRUMENT/{source}/{group}/{ix_name}'][ + data_slice + ] + # Raw files have a spurious extra dimension + if ids.ndim >= 2 and ids.shape[1] == 1: + ids = ids[:, 0] + return ids + seq_arrays = [] data_path = "/INSTRUMENT/{}/{}".format(source, key.replace('.', '/')) for f in self.data._source_index[source]: @@ -263,14 +273,13 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses): np.arange(first_tid, last_tid + 1, dtype=np.uint64), chunk_counts.astype(np.intp), ) - pulse_id = f.file['/INSTRUMENT/{}/{}/pulseId'.format(source, group)][ - data_slice - ] - # Raw files have a spurious extra dimension - if pulse_id.ndim >= 2 and pulse_id.shape[1] == 1: - pulse_id = pulse_id[:, 0] + inner_ids = get_inner_ids(f, inner_index) if isinstance(pulses, by_id): + if inner_index == 'pulseId': + pulse_id = inner_ids + else: + pulse_id = get_inner_ids(f, 'pulseId') positions = self._select_pulse_ids(pulses, pulse_id) else: # by_index positions = self._select_pulse_indices( @@ -278,9 +287,9 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses): ) trainids = trainids[positions] - pulse_id = pulse_id[positions] + inner_ids = inner_ids[positions] index = pd.MultiIndex.from_arrays( - [trainids, pulse_id], names=['train', 'pulse'] + [trainids, inner_ids], names=['train', inner_index[:-2]] ) if isinstance(positions, slice): @@ -320,7 +329,8 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses): dim=('train' if unstack_pulses else 'train_pulse'), ) - def get_array(self, key, pulses=np.s_[:], unstack_pulses=True): + def get_array(self, key, pulses=np.s_[:], unstack_pulses=True, *, + subtrain_index='pulseId'): """Get a labelled array of detector data Parameters @@ -334,7 +344,15 @@ def get_array(self, key, pulses=np.s_[:], unstack_pulses=True): all pulses. Only used for per-train data. unstack_pulses: bool Whether to separate train and pulse dimensions. + subtrain_index: str + Specify 'pulseId' (default) or 'cellId' to label the frames recorded + within each train. Pulse ID should allow this data to be matched with + other devices, but depends on how the detector was manually configured + when the data was taken. Cell ID refers to the memory cell used for + that frame in the detector hardware. """ + if subtrain_index not in {'pulseId', 'cellId'}: + raise ValueError("subtrain_index must be 'pulseId' or 'cellId'") pulses = _check_pulse_selection(pulses) arrays = [] @@ -344,7 +362,8 @@ def get_array(self, key, pulses=np.s_[:], unstack_pulses=True): # If that changes, this check will need to change as well. if key.startswith('image.'): arrays.append(self._get_module_pulse_data( - source, key, pulses, unstack_pulses)) + source, key, pulses, unstack_pulses, subtrain_index, + )) else: arrays.append(self.data.get_array(source, key)) modnos.append(modno) diff --git a/extra_data/tests/mockdata/detectors.py b/extra_data/tests/mockdata/detectors.py index 2e2bbb4f..bec6124c 100644 --- a/extra_data/tests/mockdata/detectors.py +++ b/extra_data/tests/mockdata/detectors.py @@ -34,7 +34,6 @@ def write_control(self, f): def image_keys(self): if self.raw: return [ - ('cellId', 'u2', (1,)), ('data', 'u2', self.image_dims), ('length', 'u4', (1,)), ('status', 'u2', (1,)), @@ -42,7 +41,6 @@ def image_keys(self): else: return [ - ('cellId', 'u2', ()), ('data', 'f4', self.image_dims), ('mask', 'u4', self.image_dims), ('gain', 'u1', self.image_dims), @@ -114,6 +112,10 @@ def write_instrument(self, f): pid = f.create_dataset('INSTRUMENT/%s:xtdf/image/pulseId' % self.device_id, (nframes, 1), 'u8', maxshape=(None, 1)) pid[:, 0] = pid_index + + cid = f.create_dataset('INSTRUMENT/%s:xtdf/image/cellId' % self.device_id, + (nframes, 1), 'u2', maxshape=(None, 1)) + cid[:, 0] = pid_index # Cell IDs mirror pulse IDs for now else: # Corrected data drops the extra dimension, and maxshape==shape. f.create_dataset( @@ -126,6 +128,11 @@ def write_instrument(self, f): (nframes,), 'u8', chunks=True, data=pid_index ) + f.create_dataset( # Cell IDs mirror pulse IDs for now + 'INSTRUMENT/%s:xtdf/image/cellId' % self.device_id, + (nframes,), 'u2', chunks=True, data=pid_index + ) + max_len = None if self.raw else nframes for (key, datatype, dims) in self.image_keys: f.create_dataset('INSTRUMENT/%s:xtdf/image/%s' % (self.device_id, key), diff --git a/extra_data/tests/test_components.py b/extra_data/tests/test_components.py index 44243d4e..35b28eee 100644 --- a/extra_data/tests/test_components.py +++ b/extra_data/tests/test_components.py @@ -44,6 +44,18 @@ def test_get_array_pulse_id(mock_fxe_raw_run): assert list(arr.coords['pulse']) == [1, 7, 22, 23] +def test_get_array_with_cell_ids(mock_fxe_raw_run): + run = RunDirectory(mock_fxe_raw_run) + det = LPD1M(run.select_trains(by_index[:3])) + arr = det.get_array('image.data', subtrain_index='cellId') + assert arr.shape == (16, 3, 128, 256, 256) + assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') + + arr = det.get_array('image.data', pulses=by_id[0], subtrain_index='cellId') + assert arr.shape == (16, 3, 1, 256, 256) + assert (arr.coords['cell'] == 0).all() + + def test_get_array_pulse_indexes(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3]))