Lazily load the data in the frame-cache format

Previously, just instantiating the imageseries would cause the data to be loaded. For some of the larger datasets from eiger, this could use a lot of RAM and time. However, in some cases, we want to instantiate an imageseries just as a convenient (and somewhat standardized) way to look at things such as metadata (such as omegas) and other information without actually needing to load the data. This is actually something we do in the GUI to check the omegas (we do it for all image series types, in fact, not just frame-cache). This commit changes the class to wait to load the data until it is actually requested. This makes loading it in the Simple Image Series loader in the GUI (which is done to check metadata) *significantly* faster. It still loads in all the data the first time any data is requested. Another potentially useful change would be to just load in whatever frame was requested and cache that frame in a dict (to be returned next time). However, maybe that could be done in the future if we encounter a more relevant use case for it. Signed-off-by: Patrick Avery <[email protected]>
HEXRD · Aug 18, 2024 · e9017ee · e9017ee
1 parent 433a0d5
commit e9017ee
Showing 1 changed file with 54 additions and 46 deletions.
diff --git a/hexrd/imageseries/load/framecache.py b/hexrd/imageseries/load/framecache.py
@@ -22,10 +22,12 @@ def __init__(self, fname, style='npz', **kwargs):
         *kwargs* - keyword arguments (none required)
         """
         self._fname = fname
+        self._framelist = []
         if style.lower() in ('yml', 'yaml', 'test'):
+            self._from_yml = True
             self._load_yml()
-            self._load_cache(from_yml=True)
         else:
+            self._from_yml = False
             self._load_cache()
 
     def _load_yml(self):
@@ -38,62 +40,65 @@ def _load_yml(self):
         self._dtype = np.dtype(datad['dtype'])
         self._meta = yamlmeta(d['meta'], path=self._cache)
 
-    def _load_cache(self, from_yml=False):
+    def _load_cache(self):
+        arrs = np.load(self._fname)
+        # HACK: while the loaded npz file has a getitem method
+        # that mimicks a dict, it doesn't have a "pop" method.
+        # must make an empty dict to pop after assignment of
+        # class attributes so we can get to the metadata
+        keysd = dict.fromkeys(list(arrs.keys()))
+        self._nframes = int(arrs['nframes'])
+        self._shape = tuple(arrs['shape'])
+        # Check the type so we can read files written
+        # using Python 2.7
+        array_dtype = arrs['dtype'].dtype
+        # Python 3
+        if array_dtype.type == np.str_:
+            dtype_str = str(arrs['dtype'])
+        # Python 2.7
+        else:
+            dtype_str = arrs['dtype'].tobytes().decode()
+        self._dtype = np.dtype(dtype_str)
+
+        keysd.pop('nframes')
+        keysd.pop('shape')
+        keysd.pop('dtype')
+        for i in range(self._nframes):
+            keysd.pop(f"{i}_row")
+            keysd.pop(f"{i}_col")
+            keysd.pop(f"{i}_data")
+
+        # all rmaining keys should be metadata
+        for key in keysd:
+            keysd[key] = arrs[key]
+        self._meta = keysd
+
+    def _load_framelist(self):
         """load into list of csr sparse matrices"""
         self._framelist = []
-        if from_yml:
+        if self._from_yml:
             bpath = os.path.dirname(self._fname)
             if os.path.isabs(self._cache):
                 cachepath = self._cache
             else:
                 cachepath = os.path.join(bpath, self._cache)
             arrs = np.load(cachepath)
-
-            for i in range(self._nframes):
-                row = arrs["%d_row" % i]
-                col = arrs["%d_col" % i]
-                data = arrs["%d_data" % i]
-                frame = csr_matrix((data, (row, col)),
-                                   shape=self._shape, dtype=self._dtype)
-                self._framelist.append(frame)
         else:
             arrs = np.load(self._fname)
-            # HACK: while the loaded npz file has a getitem method
-            # that mimicks a dict, it doesn't have a "pop" method.
-            # must make an empty dict to pop after assignment of
-            # class attributes so we can get to the metadata
-            keysd = dict.fromkeys(list(arrs.keys()))
-            self._nframes = int(arrs['nframes'])
-            self._shape = tuple(arrs['shape'])
-            # Check the type so we can read files written
-            # using Python 2.7
-            array_dtype = arrs['dtype'].dtype
-            # Python 3
-            if array_dtype.type == np.str_:
-                dtype_str = str(arrs['dtype'])
-            # Python 2.7
-            else:
-                dtype_str = arrs['dtype'].tobytes().decode()
-            self._dtype = np.dtype(dtype_str)
-            keysd.pop('nframes')
-            keysd.pop('shape')
-            keysd.pop('dtype')
-            for i in range(self._nframes):
-                row = arrs["%d_row" % i]
-                col = arrs["%d_col" % i]
-                data = arrs["%d_data" % i]
-                keysd.pop("%d_row" % i)
-                keysd.pop("%d_col" % i)
-                keysd.pop("%d_data" % i)
-                frame = csr_matrix((data, (row, col)),
-                                   shape=self._shape,
-                                   dtype=self._dtype)
-                self._framelist.append(frame)
-            # all rmaining keys should be metadata
-            for key in keysd:
-                keysd[key] = arrs[key]
-            self._meta = keysd
 
+        for i in range(self._nframes):
+            row = arrs[f"{i}_row"]
+            col = arrs[f"{i}_col"]
+            data = arrs[f"{i}_data"]
+            frame = csr_matrix((data, (row, col)),
+                               shape=self._shape,
+                               dtype=self._dtype)
+            self._framelist.append(frame)
+
+    @property
+    def _framelist_was_loaded(self):
+        # Just assume that if the framelist is empty, it wasn't loaded...
+        return len(self._framelist) > 0
 
     @property
     def metadata(self):
@@ -127,6 +132,9 @@ def shape(self):
         return self._shape
 
     def __getitem__(self, key):
+        if not self._framelist_was_loaded:
+            # Load the framelist now
+            self._load_framelist()
         return self._framelist[key].toarray()
 
     def __iter__(self):