diff --git a/pyiron_base/storage/flattenedstorage.py b/pyiron_base/storage/flattenedstorage.py index 7a6bdd47c..7d547593c 100644 --- a/pyiron_base/storage/flattenedstorage.py +++ b/pyiron_base/storage/flattenedstorage.py @@ -24,6 +24,7 @@ import numpy as np import h5py +import pandas as pd from pyiron_base.interfaces.has_hdf import HasHDF @@ -120,7 +121,7 @@ class FlattenedStorage(HasHDF): >>> store.get_array("sum", 2) 57 - Finally you may add multiple arrays in one call to :method:`.add_chunk` by using keyword arguments + Finally you may add multiple arrays in one call to :meth:`.add_chunk` by using keyword arguments >>> store.add_chunk(4, even=[14, 16, 18, 20], odd=[13, 15, 17, 19], sum=119) >>> store.get_array("sum", 3) @@ -128,7 +129,7 @@ class FlattenedStorage(HasHDF): >>> store.get_array("even", 3) array([14, 16, 18, 20]) - It is usually not necessary to call :method:`.add_array` before :method:`.add_chunk`, the type of the array will be + It is usually not necessary to call :meth:`.add_array` before :meth:`.add_chunk`, the type of the array will be inferred in this case. If you skip the `frame` argument to :meth:`.get_array` it will return a flat array of all the values for that array @@ -139,9 +140,9 @@ class FlattenedStorage(HasHDF): >>> store.get_array("even") array([ 0, 4, 6, 8, 10, 12, 14, 16, 18, 20]) - Arrays may be of more complicated shape, too, see :method:`.add_array` for details. + Arrays may be of more complicated shape, too, see :meth:`.add_array` for details. - Use :method:`.copy` to obtain a deep copy of the storage, for shallow copies using the builting `copy.copy` is + Use :meth:`.copy` to obtain a deep copy of the storage, for shallow copies using the builting `copy.copy` is sufficient. 
>>> copy = store.copy() @@ -152,7 +153,7 @@ class FlattenedStorage(HasHDF): >>> copy["even"] array([0, 4, 6, 8, 10, 12]) - Storages can be :method:`.split` and :method:`.join` again as long as their internal chunk structure is consistent, + Storages can be :meth:`.split` and :meth:`.join` again as long as their internal chunk structure is consistent, i.e. same number of chunks and same chunk lengths. If this is not the case a `ValueError` is raised. >>> even = store.split(["even"]) @@ -162,11 +163,11 @@ False >>> odd = store.split(["odd"]) - :method:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy + :meth:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy before join. >>> both = even.copy().join(odd) - Chunks may be given string names, either by passing `identifier` to :method:`.add_chunk` or by setting to the + Chunks may be given string names, either by passing `identifier` to :meth:`.add_chunk` or by setting to the special per chunk array "identifier" >>> store.set_array("identifier", 1, "second") @@ -269,6 +270,22 @@ def _init_arrays(self): def __len__(self): return self.current_chunk_index + def _internal_arrays(self) -> Tuple[str, ...]: + """ + Names of "internal" arrays, i.e. arrays needed for the correct inner + working of the flattened storage and that are not added by the + user via :meth:`.add_array`. + + Subclasses can override this tuple, by calling `super()` and appending + to it. + + This exists mostly to support :meth:`.to_pandas()`. + """ + return ( + "start_index", + "length", + ) + def copy(self): """ Return a deep copy of the storage. @@ -283,7 +300,7 @@ def find_chunk(self, identifier): Return integer index for given identifier. 
Args: - identifier (str): name of chunk previously passed to :method:`.add_chunk` + identifier (str): name of chunk previously passed to :meth:`.add_chunk` Returns: int: integer index for chunk @@ -426,7 +443,7 @@ def get_array(self, name, frame=None): Args: name (str): name of the array to fetch - frame (int, str, optional): selects structure to fetch, as in :method:`.get_structure()`, if not given + frame (int, str, optional): selects structure to fetch, as in :meth:`.get_structure()`, if not given return a flat array of all values for either all chunks or elements Returns: @@ -458,7 +475,7 @@ def get_array_ragged(self, name: str) -> np.ndarray: Return elements of array `name` in all chunks. Values are returned in a ragged array of dtype=object. If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to - :method:`.get_array`. + :meth:`.get_array`. Args: name (str): name of array to fetch @@ -480,10 +497,10 @@ def get_array_filled(self, name: str) -> np.ndarray: Return elements of array `name` in all chunks. Arrays are padded to be all of the same length. The padding value depends on the datatpye of the array or can be configured via the `fill` parameter of - :method:`.add_array`. + :meth:`.add_array`. If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to - :method:`.get_array`. + :meth:`.get_array`. Args: name (str): name of array to fetch @@ -518,7 +535,7 @@ def set_array(self, name, frame, value): Args: name (str): name of array to set - frame (int, str): selects structure to set, as in :method:`.get_strucure()` + frame (int, str): selects structure to set, as in :meth:`.get_structure()` value: value (for per chunk) or array of values (for per element); type and shape as per :meth:`.hasarray()`. 
Raises: @@ -584,7 +601,7 @@ def __delitem__(self, index): def has_array(self, name): """ - Checks whether an array of the given name exists and returns meta data given to :method:`.add_array()`. + Checks whether an array of the given name exists and returns meta data given to :meth:`.add_array()`. >>> container.has_array("energy") {'shape': (), 'dtype': np.float64, 'per': 'chunk'} @@ -596,7 +613,7 @@ def has_array(self, name): Returns: None: if array does not exist - dict: if array exists, keys corresponds to the shape, dtype and per arguments of :method:`.add_array` + dict: if array exists, keys corresponds to the shape, dtype and per arguments of :meth:`.add_array` """ if name in self._per_element_arrays: a = self._per_element_arrays[name] @@ -608,14 +625,21 @@ def has_array(self, name): return None return {"shape": a.shape[1:], "dtype": a.dtype, "per": per} - def list_arrays(self) -> List[str]: + def list_arrays(self, only_user=False) -> List[str]: """ Return a list of names of arrays inside the storage. + Args: + only_user (bool): If `True` include only array names added by the + user via :meth:`.add_array` and the `identifier` array. 
+ Returns: list of str: array names """ - return list(self._per_chunk_arrays) + list(self._per_element_arrays) + arrays = list(self._per_chunk_arrays) + list(self._per_element_arrays) + if only_user: + arrays = [a for a in arrays if a not in self._internal_arrays()] + return arrays def sample( self, selector: Callable[["FlattenedStorage", int], bool] @@ -632,7 +656,7 @@ def sample( Returns: :class:`.FlattenedStorage` or subclass: storage with the selected chunks """ - new = self.__class__() + new = type(self)() for k, a in self._per_chunk_arrays.items(): if k not in ("start_index", "length", "identifier"): new.add_array(k, shape=a.shape[1:], dtype=a.dtype, per="chunk") @@ -694,9 +718,19 @@ def join( Args: store (:class:`.FlattenedStorage`): storage to join + lsuffix, rsuffix (str, optional): if either are given rename *all* arrays by appending the suffixes to the + array name; `lsuffix` for arrays in this storage, `rsuffix` for arrays in + the added storage; in this case arrays are no longer available under the + old name Returns: :class:`.FlattenedStorage`: self + + Raises: + ValueError: if the two stores do not have the same number of chunks + ValueError: if the two stores do not have equal chunk lengths + ValueError: if lsuffix and rsuffix are equal and different from "" + ValueError: if the stores share array names but `lsuffix` and `rsuffix` are not given """ if len(self) != len(store): raise ValueError( @@ -727,6 +761,8 @@ for k, a in store._per_element_arrays.items(): if k in self._per_element_arrays and rename: self._per_element_arrays[k + lsuffix] = self._per_element_arrays[k] + if lsuffix != "": + del self._per_element_arrays[k] k += rsuffix self._per_element_arrays[k] = a @@ -734,6 +770,8 @@ if k not in ("start_index", "length", "identifier"): if k in self._per_chunk_arrays and rename: self._per_chunk_arrays[k + lsuffix] = self._per_chunk_arrays[k] + if lsuffix != "": + del self._per_chunk_arrays[k] k += rsuffix 
self._per_chunk_arrays[k] = a @@ -767,10 +805,10 @@ def add_chunk(self, chunk_length, identifier=None, **arrays): >>> container.get_array("pressure", 2).shape (3, 3) - .. attention: Edge-case! + .. attention:: Edge-case! This will not work when the chunk length is also 1 and the array does not exist yet! In this case the array - will be assumed to be per element and there is no way around explicitly calling :method:`.add_array()`. + will be assumed to be per element and there is no way around explicitly calling :meth:`.add_array()`. Args: @@ -833,6 +871,22 @@ def add_chunk(self, chunk_length, identifier=None, **arrays): # return last_chunk_index, last_element_index def extend(self, other: "FlattenedStorage"): + """ + Add chunks from `other` to this storage. + + Afterwards the number of chunks and elements are the sum of the respective previous values. + + If `other` defines new arrays or doesn't define some of the arrays they are padded by the fill values. + + Args: + other (:class:`.FlattenedStorage`): other storage to add + + Raises: + ValueError: if fill values between both storages are not compatible + + Returns: + FlattenedStorage: return this storage + """ self._check_compatible_fill_values(other=other) combined_num_chunks = self.num_chunks + other.num_chunks @@ -875,6 +929,8 @@ def extend(self, other: "FlattenedStorage"): self.current_chunk_index = self.num_chunks self.current_element_index = self.num_elements + return self + def _check_compatible_fill_values(self, other: "FlattenedStorage"): """ Check if fill values of 2 FlattenedStorages match to prevent errors due to wrong fill values, @@ -995,6 +1051,29 @@ def read_array(name, hdf): if version >= "0.3.0": self._fill_values = hdf["_fill_values"] + def to_pandas(self, explode=False, include_index=False) -> pd.DataFrame: + """ + Convert arrays to pandas dataframe. 
+ + Args: + explode (bool): If `False` values of per element arrays are stored + in the dataframe as arrays, otherwise each row in the dataframe + corresponds to an element in the original storage. + + Returns: + :class:`pandas.DataFrame`: table of array values + """ + arrays = self.list_arrays(only_user=True) + df = pd.DataFrame({a: self.get_array_ragged(a) for a in arrays}) + if explode: + elem_arrays = [a for a in arrays if self.has_array(a)["per"] == "element"] + df = ( + df.explode(elem_arrays) + .infer_objects(copy=False) + .reset_index(drop=not include_index) + ) + return df + def get_dtype_and_fill(storage: FlattenedStorage, name: str) -> Tuple[np.generic, Any]: fill = None diff --git a/tests/generic/test_flattenedstorage.py b/tests/generic/test_flattenedstorage.py index a3245bee8..74bf5482c 100644 --- a/tests/generic/test_flattenedstorage.py +++ b/tests/generic/test_flattenedstorage.py @@ -290,10 +290,14 @@ def test_list_arrays(self): store = FlattenedStorage() self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index"]), "Array names of empty storage don't match default arrays!") + self.assertEqual(store.list_arrays(only_user=True), ["identifier"], + "User array names of empty storage contains more than `identifier`!") store.add_array("energy", per="chunk") store.add_array("forces", shape=(3,), per="element") self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index", "energy", "forces"]), "Array names don't match added ones!") + self.assertEqual(sorted(store.list_arrays(only_user=True)), sorted(["identifier", "energy", "forces"]), + "Array names don't match added ones!") def test_hdf_empty(self): """Writing an empty storage should result in an empty storage when reading.""" @@ -543,7 +547,7 @@ def test_extend(self): foo.append(foo_val) bar.append(bar_val) store.add_chunk(i, identifier=f"ID{i}", foo=foo_val, bar=bar_val) - + for i in range(3, 5): # default length for identifiers is 20 
chars, so we need to push it a bit more foo_val = i @@ -596,3 +600,33 @@ def test_del_array(self): store.del_array("elem2") self.assertTrue("elem2" not in store.list_arrays(), "Per element array still present after del_array") + + def test_to_pandas(self): + """to_pandas should return a dataframe with user defined arrays.""" + + store = FlattenedStorage( + even=self.even, + odd=self.odd, + even_sum=self.even_sum, + odd_sum=self.odd_sum, + ) + + arrays = store.list_arrays(only_user=True) + dfc = store.to_pandas() + self.assertEqual(sorted(arrays), sorted(dfc.columns), + "Not all columns present in dataframe!") + for a in arrays: + with self.subTest(array=a): + for i, (elem_df, elem_st) in enumerate(zip(dfc[a], store.get_array_ragged(a))): + self.assertEqual(elem_df, elem_st, + f"Element {i} in dataframe not equal to original: {elem_df}!={elem_st}!") + + dfe = store.to_pandas(explode=True) + for a in arrays: + with self.subTest(array=a): + if a == "identifier": + self.assertEqual(dfe[a].to_numpy().dtype, np.dtype("O"), + "dtype not conserved with explode=True!") + else: + self.assertEqual(dfe[a].to_numpy().dtype, store[a].dtype, + "dtype not conserved with explode=True!")