Skip to content

Commit

Permalink
Merge pull request #1207 from pyiron/flatdf
Browse files Browse the repository at this point in the history
FlattenedStorage: Allow export to pandas dataframes
  • Loading branch information
pmrv authored Oct 9, 2023
2 parents c3bdb53 + c814c39 commit be280c4
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 21 deletions.
119 changes: 99 additions & 20 deletions pyiron_base/storage/flattenedstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import numpy as np
import h5py
import pandas as pd
from pyiron_base.interfaces.has_hdf import HasHDF


Expand Down Expand Up @@ -120,15 +121,15 @@ class FlattenedStorage(HasHDF):
>>> store.get_array("sum", 2)
57
Finally you may add multiple arrays in one call to :method:`.add_chunk` by using keyword arguments
Finally you may add multiple arrays in one call to :meth:`.add_chunk` by using keyword arguments
>>> store.add_chunk(4, even=[14, 16, 18, 20], odd=[13, 15, 17, 19], sum=119)
>>> store.get_array("sum", 3)
119
>>> store.get_array("even", 3)
array([14, 16, 18, 20])
It is usually not necessary to call :method:`.add_array` before :method:`.add_chunk`, the type of the array will be
It is usually not necessary to call :meth:`.add_array` before :meth:`.add_chunk`, the type of the array will be
inferred in this case.
If you skip the `frame` argument to :meth:`.get_array` it will return a flat array of all the values for that array
Expand All @@ -139,9 +140,9 @@ class FlattenedStorage(HasHDF):
>>> store.get_array("even")
array([ 0, 4, 6, 8, 10, 12, 14, 16, 18, 20])
Arrays may be of more complicated shape, too, see :method:`.add_array` for details.
Arrays may be of more complicated shape, too, see :meth:`.add_array` for details.
Use :method:`.copy` to obtain a deep copy of the storage, for shallow copies using the builting `copy.copy` is
Use :meth:`.copy` to obtain a deep copy of the storage, for shallow copies using the builtin `copy.copy` is
sufficient.
>>> copy = store.copy()
Expand All @@ -152,7 +153,7 @@ class FlattenedStorage(HasHDF):
>>> copy["even"]
array([0, 4, 6, 8, 10, 12])
Storages can be :method:`.split` and :method:`.join` again as long as their internal chunk structure is consistent,
Storages can be :meth:`.split` and :meth:`.join` again as long as their internal chunk structure is consistent,
i.e. same number of chunks and same chunk lengths. If this is not the case a `ValueError` is raised.
>>> even = store.split(["even"])
Expand All @@ -162,11 +163,11 @@ class FlattenedStorage(HasHDF):
False
>>> odd = store.split(["odd"])
:method:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy
:meth:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy
before join.
>>> both = even.copy().join(odd)
Chunks may be given string names, either by passing `identifier` to :method:`.add_chunk` or by setting to the
Chunks may be given string names, either by passing `identifier` to :meth:`.add_chunk` or by setting to the
special per chunk array "identifier"
>>> store.set_array("identifier", 1, "second")
Expand Down Expand Up @@ -269,6 +270,22 @@ def _init_arrays(self):
def __len__(self):
return self.current_chunk_index

def _internal_arrays(self) -> Tuple[str, ...]:
"""
Names of "internal" arrays, i.e. arrays needed for the correct inner
working of the flattened storage and that not are not added by the
user via :meth:`.add_array`.
Subclasses can override this tuple, by calling `super()` and appending
to it.
This exists mostly to support :meth:`.to_pandas()`.
"""
return (
"start_index",
"length",
)

def copy(self):
"""
Return a deep copy of the storage.
Expand All @@ -283,7 +300,7 @@ def find_chunk(self, identifier):
Return integer index for given identifier.
Args:
identifier (str): name of chunk previously passed to :method:`.add_chunk`
identifier (str): name of chunk previously passed to :meth:`.add_chunk`
Returns:
int: integer index for chunk
Expand Down Expand Up @@ -426,7 +443,7 @@ def get_array(self, name, frame=None):
Args:
name (str): name of the array to fetch
frame (int, str, optional): selects structure to fetch, as in :method:`.get_structure()`, if not given
frame (int, str, optional): selects structure to fetch, as in :meth:`.get_structure()`, if not given
return a flat array of all values for either all chunks or elements
Returns:
Expand Down Expand Up @@ -458,7 +475,7 @@ def get_array_ragged(self, name: str) -> np.ndarray:
Return elements of array `name` in all chunks. Values are returned in a ragged array of dtype=object.
If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to
:method:`.get_array`.
:meth:`.get_array`.
Args:
name (str): name of array to fetch
Expand All @@ -480,10 +497,10 @@ def get_array_filled(self, name: str) -> np.ndarray:
Return elements of array `name` in all chunks. Arrays are padded to be all of the same length.
The padding value depends on the datatype of the array or can be configured via the `fill` parameter of
:method:`.add_array`.
:meth:`.add_array`.
If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to
:method:`.get_array`.
:meth:`.get_array`.
Args:
name (str): name of array to fetch
Expand Down Expand Up @@ -518,7 +535,7 @@ def set_array(self, name, frame, value):
Args:
name (str): name of array to set
frame (int, str): selects structure to set, as in :method:`.get_strucure()`
frame (int, str): selects structure to set, as in :meth:`.get_structure()`
value: value (for per chunk) or array of values (for per element); type and shape as per :meth:`.hasarray()`.
Raises:
Expand Down Expand Up @@ -584,7 +601,7 @@ def __delitem__(self, index):

def has_array(self, name):
"""
Checks whether an array of the given name exists and returns meta data given to :method:`.add_array()`.
Checks whether an array of the given name exists and returns meta data given to :meth:`.add_array()`.
>>> container.has_array("energy")
{'shape': (), 'dtype': np.float64, 'per': 'chunk'}
Expand All @@ -596,7 +613,7 @@ def has_array(self, name):
Returns:
None: if array does not exist
dict: if array exists, keys corresponds to the shape, dtype and per arguments of :method:`.add_array`
dict: if array exists, keys corresponds to the shape, dtype and per arguments of :meth:`.add_array`
"""
if name in self._per_element_arrays:
a = self._per_element_arrays[name]
Expand All @@ -608,14 +625,21 @@ def has_array(self, name):
return None
return {"shape": a.shape[1:], "dtype": a.dtype, "per": per}

def list_arrays(self) -> List[str]:
def list_arrays(self, only_user=False) -> List[str]:
    """
    Return a list of names of arrays inside the storage.

    Per chunk arrays are listed before per element arrays.

    Args:
        only_user (bool): If `True` include only array names added by the
            user via :meth:`.add_array` and the `identifier` array.

    Returns:
        list of str: array names
    """
    names = [*self._per_chunk_arrays, *self._per_element_arrays]
    if not only_user:
        return names
    internal = self._internal_arrays()
    return [name for name in names if name not in internal]

def sample(
self, selector: Callable[["FlattenedStorage", int], bool]
Expand All @@ -632,7 +656,7 @@ def sample(
Returns:
:class:`.FlattenedStorage` or subclass: storage with the selected chunks
"""
new = self.__class__()
new = type(self)()
for k, a in self._per_chunk_arrays.items():
if k not in ("start_index", "length", "identifier"):
new.add_array(k, shape=a.shape[1:], dtype=a.dtype, per="chunk")
Expand Down Expand Up @@ -694,9 +718,19 @@ def join(
Args:
store (:class:`.FlattenedStorage`): storage to join
lsuffix, rsuffix (str, optional): if either are given rename *all* arrays by appending the suffices to the
array name; `lsuffix` for arrays in this storage, `rsuffix` for arrays in
the added storage; in this case arrays are no longer available under the
old name
Returns:
:class:`.FlattenedStorage`: self
Raise:
ValueError: if the two stores do not have the same number of chunks
ValueError: if the two stores do not have equal chunk lengths
ValueError: if lsuffix and rsuffix are equal and different from ""
ValueError: if the stores share array names but `lsuffix` and `rsuffix` are not given
"""
if len(self) != len(store):
raise ValueError(
Expand Down Expand Up @@ -727,13 +761,17 @@ def join(
for k, a in store._per_element_arrays.items():
if k in self._per_element_arrays and rename:
self._per_element_arrays[k + lsuffix] = self._per_element_arrays[k]
if lsuffix != "":
del self._per_element_arrays[k]
k += rsuffix
self._per_element_arrays[k] = a

for k, a in store._per_chunk_arrays.items():
if k not in ("start_index", "length", "identifier"):
if k in self._per_chunk_arrays and rename:
self._per_chunk_arrays[k + lsuffix] = self._per_chunk_arrays[k]
if lsuffix != "":
del self._per_chunk_arrays[k]
k += rsuffix
self._per_chunk_arrays[k] = a

Expand Down Expand Up @@ -767,10 +805,10 @@ def add_chunk(self, chunk_length, identifier=None, **arrays):
>>> container.get_array("pressure", 2).shape
(3, 3)
.. attention: Edge-case!
.. attention:: Edge-case!
This will not work when the chunk length is also 1 and the array does not exist yet! In this case the array
will be assumed to be per element and there is no way around explicitly calling :method:`.add_array()`.
will be assumed to be per element and there is no way around explicitly calling :meth:`.add_array()`.
Args:
Expand Down Expand Up @@ -833,6 +871,22 @@ def add_chunk(self, chunk_length, identifier=None, **arrays):
# return last_chunk_index, last_element_index

def extend(self, other: "FlattenedStorage"):
"""
Add chunks from `other` to this storage.
Afterwards the number of chunks and elements are the sum of the respective previous values.
If `other` defines new arrays or doesn't define some of the arrays they are padded by the fill values.
Args:
other (:class:`.FlattenedStorage`): other storage to add
Raises:
ValueError: if fill values between both storages are not compatible
Returns:
FlattenedStorage: return this storage
"""
self._check_compatible_fill_values(other=other)

combined_num_chunks = self.num_chunks + other.num_chunks
Expand Down Expand Up @@ -875,6 +929,8 @@ def extend(self, other: "FlattenedStorage"):
self.current_chunk_index = self.num_chunks
self.current_element_index = self.num_elements

return self

def _check_compatible_fill_values(self, other: "FlattenedStorage"):
"""
Check if fill values of 2 FlattenedStorages match to prevent errors due to wrong fill values,
Expand Down Expand Up @@ -995,6 +1051,29 @@ def read_array(name, hdf):
if version >= "0.3.0":
self._fill_values = hdf["_fill_values"]

def to_pandas(self, explode=False, include_index=False) -> pd.DataFrame:
    """
    Convert arrays to pandas dataframe.

    Only user-defined arrays (plus `identifier`) are exported; internal
    bookkeeping arrays are skipped.

    Args:
        explode (bool): If `False` values of per element arrays are stored
            in the dataframe as arrays, otherwise each row in the dataframe
            corresponds to an element in the original storage.
        include_index (bool): Only used with `explode`; if `True` keep the
            original chunk index as an `index` column instead of dropping it
            when the exploded frame is re-indexed.

    Returns:
        :class:`pandas.DataFrame`: table of array values
    """
    arrays = self.list_arrays(only_user=True)
    df = pd.DataFrame({name: self.get_array_ragged(name) for name in arrays})
    if explode:
        elem_arrays = [
            name for name in arrays if self.has_array(name)["per"] == "element"
        ]
        # DataFrame.explode raises ValueError on an empty column list, so
        # only explode when there actually are per element arrays.
        if elem_arrays:
            df = (
                df.explode(elem_arrays)
                .infer_objects(copy=False)
                .reset_index(drop=not include_index)
            )
    return df


def get_dtype_and_fill(storage: FlattenedStorage, name: str) -> Tuple[np.generic, Any]:
fill = None
Expand Down
36 changes: 35 additions & 1 deletion tests/generic/test_flattenedstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,14 @@ def test_list_arrays(self):
store = FlattenedStorage()
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index"]),
"Array names of empty storage don't match default arrays!")
self.assertEqual(store.list_arrays(only_user=True), ["identifier"],
"User array names of empty storage contains more than `identifier`!")
store.add_array("energy", per="chunk")
store.add_array("forces", shape=(3,), per="element")
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index", "energy", "forces"]),
"Array names don't match added ones!")
self.assertEqual(sorted(store.list_arrays(only_user=True)), sorted(["identifier", "energy", "forces"]),
"Array names don't match added ones!")

def test_hdf_empty(self):
"""Writing an empty storage should result in an empty storage when reading."""
Expand Down Expand Up @@ -543,7 +547,7 @@ def test_extend(self):
foo.append(foo_val)
bar.append(bar_val)
store.add_chunk(i, identifier=f"ID{i}", foo=foo_val, bar=bar_val)

for i in range(3, 5):
# default length for identifiers is 20 chars, so we need to push it a bit more
foo_val = i
Expand Down Expand Up @@ -596,3 +600,33 @@ def test_del_array(self):
store.del_array("elem2")
self.assertTrue("elem2" not in store.list_arrays(),
"Per element array still present after del_array")

def test_to_pandas(self):
    """to_pandas should return a dataframe with user defined arrays."""

    # NOTE(review): self.even / self.odd / self.even_sum / self.odd_sum are
    # fixtures from setUp (not visible here) -- presumably per element
    # (even/odd) and per chunk (even_sum/odd_sum) arrays; verify in setUp.
    store = FlattenedStorage(
        even=self.even,
        odd=self.odd,
        even_sum=self.even_sum,
        odd_sum=self.odd_sum,
    )

    # Column set must match the user-visible arrays; column order is not
    # part of the contract, hence the sorted() comparison.
    arrays = store.list_arrays(only_user=True)
    dfc = store.to_pandas()
    self.assertEqual(sorted(arrays), sorted(dfc.columns),
                     "Not all columns present in dataframe!")
    # Without explode, each dataframe cell must equal the corresponding
    # ragged chunk value returned by get_array_ragged.
    for a in arrays:
        with self.subTest(array=a):
            for i, (elem_df, elem_st) in enumerate(zip(dfc[a], store.get_array_ragged(a))):
                self.assertEqual(elem_df, elem_st,
                                 f"Element {i} in dataframe not equal to original: {elem_df}!={elem_st}!")

    # With explode, per element values become scalar rows; dtypes must
    # survive the round-trip ("identifier" stays a string/object column).
    dfe = store.to_pandas(explode=True)
    for a in arrays:
        with self.subTest(array=a):
            if a == "identifier":
                self.assertEqual(dfe[a].to_numpy().dtype, np.dtype("O"),
                                 "dtype not conserved with explode=True!")
            else:
                self.assertEqual(dfe[a].to_numpy().dtype, store[a].dtype,
                                 "dtype not conserved with explode=True!")

0 comments on commit be280c4

Please sign in to comment.