FlattenedStorage: Allow export to pandas dataframes #1207

Merged: 4 commits, Oct 9, 2023
Changes from 2 commits
49 changes: 47 additions & 2 deletions pyiron_base/storage/flattenedstorage.py
@@ -24,6 +24,7 @@

import numpy as np
import h5py
import pandas as pd
from pyiron_base.interfaces.has_hdf import HasHDF


@@ -269,6 +270,22 @@ def _init_arrays(self):
def __len__(self):
return self.current_chunk_index

def _internal_arrays(self) -> Tuple[str, ...]:
"""
Names of "internal" arrays, i.e. arrays needed for the correct inner
working of the flattened storage and that are not added by the
user via :meth:`.add_array`.

Subclasses can override this tuple by calling `super()` and appending
to it.

This exists mostly to support :meth:`.to_pandas()`.
"""
return (
'start_index',
'length',
)

def copy(self):
"""
Return a deep copy of the storage.
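
As a hedged illustration of the subclass hook described above (the `MyStorage` class and its `_extra_index` array are hypothetical, not part of this PR):

```python
from pyiron_base.storage.flattenedstorage import FlattenedStorage


class MyStorage(FlattenedStorage):
    """Hypothetical subclass that keeps an extra bookkeeping array."""

    def _internal_arrays(self):
        # Extend the parent tuple so the extra array is hidden from
        # list_arrays(only_user=True) and to_pandas().
        return super()._internal_arrays() + ("_extra_index",)
```
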
@@ -608,14 +625,21 @@ def has_array(self, name):
return None
return {"shape": a.shape[1:], "dtype": a.dtype, "per": per}

def list_arrays(self) -> List[str]:
def list_arrays(self, only_user=False) -> List[str]:
"""
Return a list of names of arrays inside the storage.

Args:
only_user (bool): If `True`, include only array names added by the
user via :meth:`.add_array` and the `identifier` array.

Returns:
list of str: array names
"""
return list(self._per_chunk_arrays) + list(self._per_element_arrays)
arrays = list(self._per_chunk_arrays) + list(self._per_element_arrays)
if only_user:
arrays = [a for a in arrays if a not in self._internal_arrays()]
return arrays

def sample(
self, selector: Callable[["FlattenedStorage", int], bool]
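
A short usage sketch of the new `only_user` flag, mirroring the updated `test_list_arrays` test further down:

```python
from pyiron_base.storage.flattenedstorage import FlattenedStorage

store = FlattenedStorage()
store.add_array("energy", per="chunk")                # user-defined, one value per chunk
store.add_array("forces", shape=(3,), per="element")  # user-defined, one value per element

# All arrays, including the internal bookkeeping ones:
print(sorted(store.list_arrays()))
# ['energy', 'forces', 'identifier', 'length', 'start_index']

# Only user-facing arrays (plus 'identifier'):
print(sorted(store.list_arrays(only_user=True)))
# ['energy', 'forces', 'identifier']
```
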
@@ -995,6 +1019,27 @@ def read_array(name, hdf):
if version >= "0.3.0":
self._fill_values = hdf["_fill_values"]

def to_pandas(self, explode=False, include_index=False) -> pd.DataFrame:
"""
Convert arrays to pandas dataframe.

Args:
explode (bool): If `False`, values of per element arrays are stored
in the dataframe as arrays, so that each row corresponds to a chunk;
otherwise each row in the dataframe corresponds to a single element
in the original storage.
include_index (bool): Only used when `explode` is `True`; if `True`,
add the original chunk index as a column to the exploded dataframe.

Returns:
:class:`pandas.DataFrame`: table of array values
"""
arrays = self.list_arrays(only_user=True)
df = pd.DataFrame(
{a: self.get_array_ragged(a) for a in arrays}
)
if explode:
elem_arrays = [a for a in arrays if self.has_array(a)["per"] == "element"]
df = df.explode(elem_arrays).infer_objects(copy=False).reset_index(drop=not include_index)
return df


def get_dtype_and_fill(storage: FlattenedStorage, name: str) -> Tuple[np.generic, Any]:
fill = None
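
A rough usage sketch of the new export, patterned on the `test_to_pandas` test below; the array names and values are illustrative, assuming the constructor infers per-chunk vs. per-element layout from the passed values as in the existing tests:

```python
import numpy as np

from pyiron_base.storage.flattenedstorage import FlattenedStorage

# Two chunks of different length: a per-element array and a per-chunk scalar.
even = [np.array([0, 2]), np.array([4, 6, 8])]
even_sum = [2, 18]
store = FlattenedStorage(even=even, even_sum=even_sum)

df = store.to_pandas()
# One row per chunk; per-element values stay as arrays inside the cells.

df_long = store.to_pandas(explode=True)
# One row per element; per-chunk values are repeated for every element of their chunk.
```
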
36 changes: 35 additions & 1 deletion tests/generic/test_flattenedstorage.py
@@ -290,10 +290,14 @@ def test_list_arrays(self):
store = FlattenedStorage()
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index"]),
"Array names of empty storage don't match default arrays!")
self.assertEqual(store.list_arrays(only_user=True), ["identifier"],
"User array names of empty storage contains more than `identifier`!")
store.add_array("energy", per="chunk")
store.add_array("forces", shape=(3,), per="element")
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index", "energy", "forces"]),
"Array names don't match added ones!")
self.assertEqual(sorted(store.list_arrays(only_user=True)), sorted(["identifier", "energy", "forces"]),
"Array names don't match added ones!")

def test_hdf_empty(self):
"""Writing an empty storage should result in an empty storage when reading."""
@@ -543,7 +547,7 @@ def test_extend(self):
foo.append(foo_val)
bar.append(bar_val)
store.add_chunk(i, identifier=f"ID{i}", foo=foo_val, bar=bar_val)

for i in range(3, 5):
# default length for identifiers is 20 chars, so we need to push it a bit more
foo_val = i
@@ -596,3 +600,33 @@ def test_del_array(self):
store.del_array("elem2")
self.assertTrue("elem2" not in store.list_arrays(),
"Per element array still present after del_array")

def test_to_pandas(self):
"""to_pandas should return a dataframe with user defined arrays."""

store = FlattenedStorage(
even=self.even,
odd=self.odd,
even_sum=self.even_sum,
odd_sum=self.odd_sum,
)

arrays = store.list_arrays(only_user=True)
dfc = store.to_pandas()
self.assertEqual(sorted(arrays), sorted(dfc.columns),
"Not all columns present in dataframe!")
for a in arrays:
with self.subTest(array=a):
for i, (elem_df, elem_st) in enumerate(zip(dfc[a], store.get_array_ragged(a))):
self.assertEqual(elem_df, elem_st,
f"Element {i} in dataframe not equal to original: {elem_df}!={elem_st}!")

dfe = store.to_pandas(explode=True)
for a in arrays:
with self.subTest(array=a):
if a == "identifier":
self.assertEqual(dfe[a].to_numpy().dtype, np.dtype("O"),
"dtype not conserved with explode=True!")
else:
self.assertEqual(dfe[a].to_numpy().dtype, store[a].dtype,
"dtype not conserved with explode=True!")