From a6073d410d170eeea0d7bd58b8016a0c9fc5b338 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 9 Nov 2020 15:07:34 +0000
Subject: [PATCH] Allow retrieving big detector data with cell IDs in the index
 instead of pulse IDs

---
 extra_data/components.py               | 43 +++++++++++++++++++-------
 extra_data/tests/mockdata/detectors.py | 11 +++++--
 extra_data/tests/test_components.py    | 12 +++++++
 3 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/extra_data/components.py b/extra_data/components.py
index 329116a1..f4a6f0b5 100644
--- a/extra_data/components.py
+++ b/extra_data/components.py
@@ -38,7 +38,7 @@ def _guess_axes(data, train_pulse_ids, unstack_pulses):
     if unstack_pulses:
         # Separate train & pulse dimensions, and arrange dimensions
         # so that the data is contiguous in memory.
-        dim_order = ['train', 'pulse'] + dims[1:]
+        dim_order = train_pulse_ids.names + dims[1:]
         return arr.unstack('train_pulse').transpose(*dim_order)
     else:
         return arr
@@ -239,7 +239,17 @@ def _select_pulse_indices(pulses, firsts, counts):
 
         return np.concatenate(positions)
 
-    def _get_module_pulse_data(self, source, key, pulses, unstack_pulses):
+    def _get_module_pulse_data(self, source, key, pulses, unstack_pulses,
+                               inner_index='pulseId'):
+        def get_inner_ids(f, ix_name='pulseId'):  # per-frame IDs from file f
+            ids = f.file[f'/INSTRUMENT/{source}/{group}/{ix_name}'][
+                data_slice  # source/group/data_slice: enclosing scope
+            ]
+            # Raw files have a spurious extra dimension: drop a length-1 axis 1
+            if ids.ndim >= 2 and ids.shape[1] == 1:
+                ids = ids[:, 0]
+            return ids
+
         seq_arrays = []
         data_path = "/INSTRUMENT/{}/{}".format(source, key.replace('.', '/'))
         for f in self.data._source_index[source]:
@@ -263,14 +273,13 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses):
                     np.arange(first_tid, last_tid + 1, dtype=np.uint64),
                     chunk_counts.astype(np.intp),
                 )
-                pulse_id = f.file['/INSTRUMENT/{}/{}/pulseId'.format(source, group)][
-                    data_slice
-                ]
-                # Raw files have a spurious extra dimension
-                if pulse_id.ndim >= 2 and pulse_id.shape[1] == 1:
-                    pulse_id = pulse_id[:, 0]
+                inner_ids = get_inner_ids(f, inner_index)
 
                 if isinstance(pulses, by_id):
+                    if inner_index == 'pulseId':
+                        pulse_id = inner_ids
+                    else:
+                        pulse_id = get_inner_ids(f, 'pulseId')
                     positions = self._select_pulse_ids(pulses, pulse_id)
                 else:  # by_index
                     positions = self._select_pulse_indices(
@@ -278,9 +287,9 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses):
                     )
 
                 trainids = trainids[positions]
-                pulse_id = pulse_id[positions]
+                inner_ids = inner_ids[positions]
                 index = pd.MultiIndex.from_arrays(
-                    [trainids, pulse_id], names=['train', 'pulse']
+                    [trainids, inner_ids], names=['train', inner_index[:-2]]
                 )
 
                 if isinstance(positions, slice):
@@ -320,7 +329,8 @@ def _get_module_pulse_data(self, source, key, pulses, unstack_pulses):
             dim=('train' if unstack_pulses else 'train_pulse'),
         )
 
-    def get_array(self, key, pulses=np.s_[:], unstack_pulses=True):
+    def get_array(self, key, pulses=np.s_[:], unstack_pulses=True, *,
+                  subtrain_index='pulseId'):
         """Get a labelled array of detector data
 
         Parameters
@@ -334,7 +344,15 @@ def get_array(self, key, pulses=np.s_[:], unstack_pulses=True):
           all pulses. Only used for per-train data.
         unstack_pulses: bool
           Whether to separate train and pulse dimensions.
+        subtrain_index: str
+          Specify 'pulseId' (default) or 'cellId' to label the frames recorded
+          within each train. Pulse ID should allow this data to be matched with
+          other devices, but depends on how the detector was manually configured
+          when the data was taken. Cell ID refers to the memory cell used for
+          that frame in the detector hardware.
         """
+        if subtrain_index not in {'pulseId', 'cellId'}:
+            raise ValueError("subtrain_index must be 'pulseId' or 'cellId'")
         pulses = _check_pulse_selection(pulses)
 
         arrays = []
@@ -344,7 +362,8 @@ def get_array(self, key, pulses=np.s_[:], unstack_pulses=True):
             # If that changes, this check will need to change as well.
             if key.startswith('image.'):
                 arrays.append(self._get_module_pulse_data(
-                    source, key, pulses, unstack_pulses))
+                    source, key, pulses, unstack_pulses, subtrain_index,
+                ))
             else:
                 arrays.append(self.data.get_array(source, key))
             modnos.append(modno)
diff --git a/extra_data/tests/mockdata/detectors.py b/extra_data/tests/mockdata/detectors.py
index 2e2bbb4f..bec6124c 100644
--- a/extra_data/tests/mockdata/detectors.py
+++ b/extra_data/tests/mockdata/detectors.py
@@ -34,7 +34,6 @@ def write_control(self, f):
     def image_keys(self):
         if self.raw:
             return [
-                ('cellId', 'u2', (1,)),
                 ('data', 'u2', self.image_dims),
                 ('length', 'u4', (1,)),
                 ('status', 'u2', (1,)),
@@ -42,7 +41,6 @@ def image_keys(self):
 
         else:
             return [
-                ('cellId', 'u2', ()),
                 ('data', 'f4', self.image_dims),
                 ('mask', 'u4', self.image_dims),
                 ('gain', 'u1', self.image_dims),
@@ -114,6 +112,10 @@ def write_instrument(self, f):
             pid = f.create_dataset('INSTRUMENT/%s:xtdf/image/pulseId' % self.device_id,
                                    (nframes, 1), 'u8', maxshape=(None, 1))
             pid[:, 0] = pid_index
+
+            cid = f.create_dataset('INSTRUMENT/%s:xtdf/image/cellId' % self.device_id,
+                                   (nframes, 1), 'u2', maxshape=(None, 1))
+            cid[:, 0] = pid_index  # Cell IDs mirror pulse IDs for now
         else:
             # Corrected data drops the extra dimension, and maxshape==shape.
             f.create_dataset(
@@ -126,6 +128,11 @@ def write_instrument(self, f):
                 (nframes,), 'u8', chunks=True, data=pid_index
             )
 
+            f.create_dataset(  # Cell IDs mirror pulse IDs for now
+                'INSTRUMENT/%s:xtdf/image/cellId' % self.device_id,
+                (nframes,), 'u2', chunks=True, data=pid_index
+            )
+
         max_len = None if self.raw else nframes
         for (key, datatype, dims) in self.image_keys:
             f.create_dataset('INSTRUMENT/%s:xtdf/image/%s' % (self.device_id, key),
diff --git a/extra_data/tests/test_components.py b/extra_data/tests/test_components.py
index 44243d4e..35b28eee 100644
--- a/extra_data/tests/test_components.py
+++ b/extra_data/tests/test_components.py
@@ -44,6 +44,18 @@ def test_get_array_pulse_id(mock_fxe_raw_run):
     assert list(arr.coords['pulse']) == [1, 7, 22, 23]
 
 
+def test_get_array_with_cell_ids(mock_fxe_raw_run):
+    run = RunDirectory(mock_fxe_raw_run)
+    det = LPD1M(run.select_trains(by_index[:3]))  # first 3 trains only
+    arr = det.get_array('image.data', subtrain_index='cellId')
+    assert arr.shape == (16, 3, 128, 256, 256)  # inner axis named 'cell'
+    assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan')
+
+    arr = det.get_array('image.data', pulses=by_id[0], subtrain_index='cellId')
+    assert arr.shape == (16, 3, 1, 256, 256)  # one frame kept per train
+    assert (arr.coords['cell'] == 0).all()  # mock cellId mirrors pulseId
+
+
 def test_get_array_pulse_indexes(mock_fxe_raw_run):
     run = RunDirectory(mock_fxe_raw_run)
     det = LPD1M(run.select_trains(by_index[:3]))