FlattenedStorage: Allow export to pandas dataframes #1207

Merged · 4 commits · Oct 9, 2023

119 changes: 99 additions & 20 deletions pyiron_base/storage/flattenedstorage.py
@@ -24,6 +24,7 @@

import numpy as np
import h5py
import pandas as pd
from pyiron_base.interfaces.has_hdf import HasHDF


@@ -120,15 +121,15 @@ class FlattenedStorage(HasHDF):
>>> store.get_array("sum", 2)
57

Finally you may add multiple arrays in one call to :method:`.add_chunk` by using keyword arguments
Finally you may add multiple arrays in one call to :meth:`.add_chunk` by using keyword arguments

>>> store.add_chunk(4, even=[14, 16, 18, 20], odd=[13, 15, 17, 19], sum=119)
>>> store.get_array("sum", 3)
119
>>> store.get_array("even", 3)
array([14, 16, 18, 20])

It is usually not necessary to call :method:`.add_array` before :method:`.add_chunk`, the type of the array will be
It is usually not necessary to call :meth:`.add_array` before :meth:`.add_chunk`, the type of the array will be
inferred in this case.

If you skip the `frame` argument to :meth:`.get_array` it will return a flat array of all the values for that array
@@ -139,9 +140,9 @@ class FlattenedStorage(HasHDF):
>>> store.get_array("even")
array([ 0, 4, 6, 8, 10, 12, 14, 16, 18, 20])

Arrays may be of more complicated shape, too, see :method:`.add_array` for details.
Arrays may be of more complicated shape, too, see :meth:`.add_array` for details.

Use :method:`.copy` to obtain a deep copy of the storage, for shallow copies using the builting `copy.copy` is
Use :meth:`.copy` to obtain a deep copy of the storage; for shallow copies the builtin `copy.copy` is
sufficient.

>>> copy = store.copy()
@@ -152,7 +153,7 @@ class FlattenedStorage(HasHDF):
>>> copy["even"]
array([0, 4, 6, 8, 10, 12])

Storages can be :method:`.split` and :method:`.join` again as long as their internal chunk structure is consistent,
Storages can be :meth:`.split` and :meth:`.join` again as long as their internal chunk structure is consistent,
i.e. same number of chunks and same chunk lengths. If this is not the case a `ValueError` is raised.

>>> even = store.split(["even"])
@@ -162,11 +163,11 @@ class FlattenedStorage(HasHDF):
False
>>> odd = store.split(["odd"])

:method:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy
:meth:`.join` adds new arrays to the storage it is called on in-place. To leave it unchanged, simply call copy
before join.
>>> both = even.copy().join(odd)

Chunks may be given string names, either by passing `identifier` to :method:`.add_chunk` or by setting to the
Chunks may be given string names, either by passing `identifier` to :meth:`.add_chunk` or by setting the
special per chunk array "identifier"

>>> store.set_array("identifier", 1, "second")
@@ -269,6 +270,22 @@ def _init_arrays(self):
def __len__(self):
return self.current_chunk_index

def _internal_arrays(self) -> Tuple[str, ...]:
"""
Names of "internal" arrays, i.e. arrays needed for the correct inner
working of the flattened storage and that are not added by the
user via :meth:`.add_array`.

Subclasses can override this tuple by calling `super()` and appending
to it.

This exists mostly to support :meth:`.to_pandas()`.
"""
return (
"start_index",
"length",
)
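
As a usage sketch (not part of this diff; `MyStorage` and the `_bookkeeping` array name are hypothetical), a subclass could extend the internal names exactly as the docstring above suggests:

from typing import Tuple

from pyiron_base.storage.flattenedstorage import FlattenedStorage

class MyStorage(FlattenedStorage):
    def _internal_arrays(self) -> Tuple[str, ...]:
        # keep the base class' internal names and append our own bookkeeping array
        return super()._internal_arrays() + ("_bookkeeping",)

Arrays listed here are then skipped by `list_arrays(only_user=True)` and therefore also left out of `to_pandas()`.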

def copy(self):
"""
Return a deep copy of the storage.
@@ -283,7 +300,7 @@ def find_chunk(self, identifier):
Return integer index for given identifier.

Args:
identifier (str): name of chunk previously passed to :method:`.add_chunk`
identifier (str): name of chunk previously passed to :meth:`.add_chunk`

Returns:
int: integer index for chunk
@@ -426,7 +443,7 @@ def get_array(self, name, frame=None):

Args:
name (str): name of the array to fetch
frame (int, str, optional): selects structure to fetch, as in :method:`.get_structure()`, if not given
frame (int, str, optional): selects structure to fetch, as in :meth:`.get_structure()`, if not given
return a flat array of all values for either all chunks or elements

Returns:
@@ -458,7 +475,7 @@ def get_array_ragged(self, name: str) -> np.ndarray:
Return elements of array `name` in all chunks. Values are returned in a ragged array of dtype=object.

If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to
:method:`.get_array`.
:meth:`.get_array`.

Args:
name (str): name of array to fetch
@@ -480,10 +497,10 @@ def get_array_filled(self, name: str) -> np.ndarray:
Return elements of array `name` in all chunks. Arrays are padded to be all of the same length.

The padding value depends on the datatype of the array or can be configured via the `fill` parameter of
:method:`.add_array`.
:meth:`.add_array`.

If `name` specifies a per chunk array, there's nothing to pad and this method is equivalent to
:method:`.get_array`.
:meth:`.get_array`.

Args:
name (str): name of array to fetch
@@ -518,7 +535,7 @@ def set_array(self, name, frame, value):

Args:
name (str): name of array to set
frame (int, str): selects structure to set, as in :method:`.get_strucure()`
frame (int, str): selects structure to set, as in :meth:`.get_structure()`
value: value (for per chunk) or array of values (for per element); type and shape as per :meth:`.hasarray()`.

Raises:
@@ -584,7 +601,7 @@ def __delitem__(self, index):

def has_array(self, name):
"""
Checks whether an array of the given name exists and returns meta data given to :method:`.add_array()`.
Checks whether an array of the given name exists and returns meta data given to :meth:`.add_array()`.

>>> container.has_array("energy")
{'shape': (), 'dtype': np.float64, 'per': 'chunk'}
@@ -596,7 +613,7 @@ def has_array(self, name):

Returns:
None: if array does not exist
dict: if array exists, keys corresponds to the shape, dtype and per arguments of :method:`.add_array`
dict: if array exists, keys correspond to the shape, dtype and per arguments of :meth:`.add_array`
"""
if name in self._per_element_arrays:
a = self._per_element_arrays[name]
@@ -608,14 +625,21 @@ def has_array(self, name):
return None
return {"shape": a.shape[1:], "dtype": a.dtype, "per": per}

def list_arrays(self) -> List[str]:
def list_arrays(self, only_user=False) -> List[str]:
"""
Return a list of names of arrays inside the storage.

Args:
only_user (bool): If `True` include only array names added by the
user via :meth:`.add_array` and the `identifier` array.

Returns:
list of str: array names
"""
return list(self._per_chunk_arrays) + list(self._per_element_arrays)
arrays = list(self._per_chunk_arrays) + list(self._per_element_arrays)
if only_user:
arrays = [a for a in arrays if a not in self._internal_arrays()]
return arrays
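
A doctest-style sketch of the new flag (following the examples in the class docstring; output assumed from the implementation above):

>>> store = FlattenedStorage()
>>> store.add_array("energy", per="chunk")
>>> sorted(store.list_arrays())
['energy', 'identifier', 'length', 'start_index']
>>> sorted(store.list_arrays(only_user=True))
['energy', 'identifier']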

def sample(
self, selector: Callable[["FlattenedStorage", int], bool]
@@ -632,7 +656,7 @@ def sample(
Returns:
:class:`.FlattenedStorage` or subclass: storage with the selected chunks
"""
new = self.__class__()
new = type(self)()
for k, a in self._per_chunk_arrays.items():
if k not in ("start_index", "length", "identifier"):
new.add_array(k, shape=a.shape[1:], dtype=a.dtype, per="chunk")
@@ -694,9 +718,19 @@ def join(

Args:
store (:class:`.FlattenedStorage`): storage to join
lsuffix, rsuffix (str, optional): if either is given, rename *all* arrays by appending the suffixes to the
array name; `lsuffix` for arrays in this storage, `rsuffix` for arrays in
the added storage; in this case the arrays are no longer available under the
old name

Returns:
:class:`.FlattenedStorage`: self

Raises:
ValueError: if the two stores do not have the same number of chunks
ValueError: if the two stores do not have equal chunk lengths
ValueError: if lsuffix and rsuffix are equal and different from ""
ValueError: if the stores share array names but `lsuffix` and `rsuffix` are not given
"""
if len(self) != len(store):
raise ValueError(
@@ -727,13 +761,17 @@ def join(
for k, a in store._per_element_arrays.items():
if k in self._per_element_arrays and rename:
self._per_element_arrays[k + lsuffix] = self._per_element_arrays[k]
if lsuffix != "":
del self._per_element_arrays[k]
k += rsuffix
self._per_element_arrays[k] = a

for k, a in store._per_chunk_arrays.items():
if k not in ("start_index", "length", "identifier"):
if k in self._per_chunk_arrays and rename:
self._per_chunk_arrays[k + lsuffix] = self._per_chunk_arrays[k]
if lsuffix != "":
del self._per_chunk_arrays[k]
k += rsuffix
self._per_chunk_arrays[k] = a
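
A minimal sketch of the suffix handling described in the docstring above (array names and values are illustrative only):

>>> left = FlattenedStorage()
>>> left.add_chunk(2, foo=[1, 2])
>>> right = FlattenedStorage()
>>> right.add_chunk(2, foo=[3, 4])
>>> joined = left.copy().join(right, lsuffix="_l", rsuffix="_r")
>>> sorted(joined.list_arrays(only_user=True))
['foo_l', 'foo_r', 'identifier']

Because `lsuffix` is non-empty, the un-suffixed `foo` is deleted after renaming, which is what the added `del` statements above implement.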

@@ -767,10 +805,10 @@ def add_chunk(self, chunk_length, identifier=None, **arrays):
>>> container.get_array("pressure", 2).shape
(3, 3)

.. attention: Edge-case!
.. attention:: Edge-case!

This will not work when the chunk length is also 1 and the array does not exist yet! In this case the array
will be assumed to be per element and there is no way around explicitly calling :method:`.add_array()`.
will be assumed to be per element and there is no way around explicitly calling :meth:`.add_array()`.
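
For illustration, a sketch of this edge case (not part of the diff; `temperature` is just an example name). Declaring the array up front pins it to per-chunk semantics even for a chunk of length 1, while without the declaration it is inferred as per element:

>>> store = FlattenedStorage()
>>> store.add_array("temperature", per="chunk")
>>> store.add_chunk(1, temperature=300.0)
>>> store.has_array("temperature")["per"]
'chunk'

>>> other = FlattenedStorage()
>>> other.add_chunk(1, temperature=300.0)
>>> other.has_array("temperature")["per"]
'element'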


Args:
@@ -833,6 +871,22 @@ def add_chunk(self, chunk_length, identifier=None, **arrays):
# return last_chunk_index, last_element_index

def extend(self, other: "FlattenedStorage"):
"""
Add chunks from `other` to this storage.

Afterwards the number of chunks and elements are the sum of the respective previous values.

If `other` defines new arrays or doesn't define some of the arrays present here, the missing values are padded with the corresponding fill values.

Args:
other (:class:`.FlattenedStorage`): other storage to add

Raises:
ValueError: if fill values between both storages are not compatible

Returns:
FlattenedStorage: return this storage
"""
self._check_compatible_fill_values(other=other)

combined_num_chunks = self.num_chunks + other.num_chunks
@@ -875,6 +929,8 @@ def extend(self, other: "FlattenedStorage"):
self.current_chunk_index = self.num_chunks
self.current_element_index = self.num_elements

return self
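
A short sketch of the new return value (chunk contents are illustrative; everything else about `extend` is unchanged):

>>> first = FlattenedStorage()
>>> first.add_chunk(2, foo=[1, 2])
>>> second = FlattenedStorage()
>>> second.add_chunk(3, foo=[3, 4, 5])
>>> len(first.extend(second))  # extend now returns the storage it was called on
2
>>> first.get_array("foo")
array([1, 2, 3, 4, 5])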

def _check_compatible_fill_values(self, other: "FlattenedStorage"):
"""
Check if fill values of 2 FlattenedStorages match to prevent errors due to wrong fill values,
@@ -995,6 +1051,29 @@ def read_array(name, hdf):
if version >= "0.3.0":
self._fill_values = hdf["_fill_values"]

def to_pandas(self, explode=False, include_index=False) -> pd.DataFrame:
"""
Convert arrays to pandas dataframe.

Args:
explode (bool): If `False`, values of per element arrays are stored
in the dataframe as arrays, otherwise each row in the dataframe
corresponds to an element in the original storage.
include_index (bool): If `True`, keep the original chunk index as an
extra column when `explode` is given.

Returns:
:class:`pandas.DataFrame`: table of array values
"""
arrays = self.list_arrays(only_user=True)
df = pd.DataFrame({a: self.get_array_ragged(a) for a in arrays})
if explode:
elem_arrays = [a for a in arrays if self.has_array(a)["per"] == "element"]
df = (
df.explode(elem_arrays)
.infer_objects(copy=False)
.reset_index(drop=not include_index)
)
return df
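
A doctest-style sketch of both modes (array names and values are illustrative; column order follows `list_arrays(only_user=True)`):

>>> store = FlattenedStorage()
>>> store.add_chunk(2, energy=1.5, forces=[[0, 0, 0], [1, 1, 1]])
>>> store.add_chunk(3, energy=2.5, forces=[[2, 2, 2], [3, 3, 3], [4, 4, 4]])
>>> df = store.to_pandas()
>>> sorted(df.columns)
['energy', 'forces', 'identifier']
>>> len(df)  # one row per chunk
2
>>> len(store.to_pandas(explode=True))  # one row per element
5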


def get_dtype_and_fill(storage: FlattenedStorage, name: str) -> Tuple[np.generic, Any]:
fill = None
36 changes: 35 additions & 1 deletion tests/generic/test_flattenedstorage.py
@@ -290,10 +290,14 @@ def test_list_arrays(self):
store = FlattenedStorage()
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index"]),
"Array names of empty storage don't match default arrays!")
self.assertEqual(store.list_arrays(only_user=True), ["identifier"],
"User array names of empty storage contains more than `identifier`!")
store.add_array("energy", per="chunk")
store.add_array("forces", shape=(3,), per="element")
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index", "energy", "forces"]),
"Array names don't match added ones!")
self.assertEqual(sorted(store.list_arrays(only_user=True)), sorted(["identifier", "energy", "forces"]),
"Array names don't match added ones!")

def test_hdf_empty(self):
"""Writing an empty storage should result in an empty storage when reading."""
@@ -543,7 +547,7 @@ def test_extend(self):
foo.append(foo_val)
bar.append(bar_val)
store.add_chunk(i, identifier=f"ID{i}", foo=foo_val, bar=bar_val)

for i in range(3, 5):
# default length for identifiers is 20 chars, so we need to push it a bit more
foo_val = i
@@ -596,3 +600,33 @@ def test_del_array(self):
store.del_array("elem2")
self.assertTrue("elem2" not in store.list_arrays(),
"Per element array still present after del_array")

def test_to_pandas(self):
"""to_pandas should return a dataframe with user defined arrays."""

store = FlattenedStorage(
even=self.even,
odd=self.odd,
even_sum=self.even_sum,
odd_sum=self.odd_sum,
)

arrays = store.list_arrays(only_user=True)
dfc = store.to_pandas()
self.assertEqual(sorted(arrays), sorted(dfc.columns),
"Not all columns present in dataframe!")
for a in arrays:
with self.subTest(array=a):
for i, (elem_df, elem_st) in enumerate(zip(dfc[a], store.get_array_ragged(a))):
self.assertEqual(elem_df, elem_st,
f"Element {i} in dataframe not equal to original: {elem_df}!={elem_st}!")

dfe = store.to_pandas(explode=True)
for a in arrays:
with self.subTest(array=a):
if a == "identifier":
self.assertEqual(dfe[a].to_numpy().dtype, np.dtype("O"),
"dtype not conserved with explode=True!")
else:
self.assertEqual(dfe[a].to_numpy().dtype, store[a].dtype,
"dtype not conserved with explode=True!")