FlattenedStorage: Allow export to pandas dataframes #1207

Merged: 4 commits, Oct 9, 2023
Changes from 2 commits
49 changes: 47 additions & 2 deletions pyiron_base/storage/flattenedstorage.py
@@ -24,6 +24,7 @@

import numpy as np
import h5py
import pandas as pd
from pyiron_base.interfaces.has_hdf import HasHDF


@@ -269,6 +270,22 @@ def _init_arrays(self):
def __len__(self):
return self.current_chunk_index

def _internal_arrays(self) -> Tuple[str, ...]:
"""
Names of "internal" arrays, i.e. arrays needed for the correct inner
working of the flattened storage and that are not added by the
user via :meth:`.add_array`.

Subclasses can override this tuple by calling `super()` and appending
to it.

This exists mostly to support :meth:`.to_pandas()`.
"""
return (
'start_index',
'length',
)

def copy(self):
"""
Return a deep copy of the storage.
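
As a hedged illustration of the subclass hook described above (the `MyStorage` class and its `_extra_index` array are hypothetical, not part of this PR):

```python
from pyiron_base.storage.flattenedstorage import FlattenedStorage


class MyStorage(FlattenedStorage):
    """Hypothetical subclass that keeps an extra bookkeeping array."""

    def _internal_arrays(self):
        # Extend the parent tuple so the extra array is hidden from
        # list_arrays(only_user=True) and to_pandas().
        return super()._internal_arrays() + ("_extra_index",)
```
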
@@ -608,14 +625,21 @@ def has_array(self, name):
return None
return {"shape": a.shape[1:], "dtype": a.dtype, "per": per}

def list_arrays(self) -> List[str]:
def list_arrays(self, only_user=False) -> List[str]:
"""
Return a list of names of arrays inside the storage.

Args:
only_user (bool): If `True`, include only array names added by the
user via :meth:`.add_array` and the `identifier` array.

Returns:
list of str: array names
"""
return list(self._per_chunk_arrays) + list(self._per_element_arrays)
arrays = list(self._per_chunk_arrays) + list(self._per_element_arrays)
if only_user:
arrays = [a for a in arrays if a not in self._internal_arrays()]
return arrays

def sample(
self, selector: Callable[["FlattenedStorage", int], bool]
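
A short usage sketch of the new `only_user` flag, mirroring the updated `test_list_arrays` test further down:

```python
from pyiron_base.storage.flattenedstorage import FlattenedStorage

store = FlattenedStorage()
store.add_array("energy", per="chunk")                # user-defined, one value per chunk
store.add_array("forces", shape=(3,), per="element")  # user-defined, one value per element

# All arrays, including the internal bookkeeping ones:
print(sorted(store.list_arrays()))
# ['energy', 'forces', 'identifier', 'length', 'start_index']

# Only user-facing arrays (plus 'identifier'):
print(sorted(store.list_arrays(only_user=True)))
# ['energy', 'forces', 'identifier']
```
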
@@ -995,6 +1019,27 @@ def read_array(name, hdf):
if version >= "0.3.0":
self._fill_values = hdf["_fill_values"]

def to_pandas(self, explode=False, include_index=False) -> pd.DataFrame:
"""
Convert arrays to pandas dataframe.

Args:
explode (bool): If `False`, values of per element arrays are stored
in the dataframe as arrays, so that each row corresponds to a chunk;
otherwise each row in the dataframe corresponds to a single element
in the original storage.
include_index (bool): Only used when `explode` is `True`; if `True`,
add the original chunk index as a column to the exploded dataframe.

Returns:
:class:`pandas.DataFrame`: table of array values
"""
arrays = self.list_arrays(only_user=True)
df = pd.DataFrame(
{a: self.get_array_ragged(a) for a in arrays}
)
if explode:
elem_arrays = [a for a in arrays if self.has_array(a)["per"] == "element"]
df = df.explode(elem_arrays).infer_objects(copy=False).reset_index(drop=not include_index)
return df


def get_dtype_and_fill(storage: FlattenedStorage, name: str) -> Tuple[np.generic, Any]:
fill = None
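
A rough usage sketch of the new export, patterned on the `test_to_pandas` test below; the array names and values are illustrative, assuming the constructor infers per-chunk vs. per-element layout from the passed values as in the existing tests:

```python
import numpy as np

from pyiron_base.storage.flattenedstorage import FlattenedStorage

# Two chunks of different length: a per-element array and a per-chunk scalar.
even = [np.array([0, 2]), np.array([4, 6, 8])]
even_sum = [2, 18]
store = FlattenedStorage(even=even, even_sum=even_sum)

df = store.to_pandas()
# One row per chunk; per-element values stay as arrays inside the cells.

df_long = store.to_pandas(explode=True)
# One row per element; per-chunk values are repeated for every element of their chunk.
```
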
36 changes: 35 additions & 1 deletion tests/generic/test_flattenedstorage.py
@@ -290,10 +290,14 @@ def test_list_arrays(self):
store = FlattenedStorage()
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index"]),
"Array names of empty storage don't match default arrays!")
self.assertEqual(store.list_arrays(only_user=True), ["identifier"],
"User array names of empty storage contains more than `identifier`!")
store.add_array("energy", per="chunk")
store.add_array("forces", shape=(3,), per="element")
self.assertEqual(sorted(store.list_arrays()), sorted(["identifier", "length", "start_index", "energy", "forces"]),
"Array names don't match added ones!")
self.assertEqual(sorted(store.list_arrays(only_user=True)), sorted(["identifier", "energy", "forces"]),
"Array names don't match added ones!")

def test_hdf_empty(self):
"""Writing an empty storage should result in an empty storage when reading."""
@@ -543,7 +547,7 @@ def test_extend(self):
foo.append(foo_val)
bar.append(bar_val)
store.add_chunk(i, identifier=f"ID{i}", foo=foo_val, bar=bar_val)

for i in range(3, 5):
# default length for identifiers is 20 chars, so we need to push it a bit more
foo_val = i
@@ -596,3 +600,33 @@ def test_del_array(self):
store.del_array("elem2")
self.assertTrue("elem2" not in store.list_arrays(),
"Per element array still present after del_array")

def test_to_pandas(self):
"""to_pandas should return a dataframe with user defined arrays."""

store = FlattenedStorage(
even=self.even,
odd=self.odd,
even_sum=self.even_sum,
odd_sum=self.odd_sum,
)

arrays = store.list_arrays(only_user=True)
dfc = store.to_pandas()
self.assertEqual(sorted(arrays), sorted(dfc.columns),
"Not all columns present in dataframe!")
for a in arrays:
with self.subTest(array=a):
for i, (elem_df, elem_st) in enumerate(zip(dfc[a], store.get_array_ragged(a))):
self.assertEqual(elem_df, elem_st,
f"Element {i} in dataframe not equal to original: {elem_df}!={elem_st}!")

dfe = store.to_pandas(explode=True)
for a in arrays:
with self.subTest(array=a):
if a == "identifier":
self.assertEqual(dfe[a].to_numpy().dtype, np.dtype("O"),
"dtype not conserved with explode=True!")
else:
self.assertEqual(dfe[a].to_numpy().dtype, store[a].dtype,
"dtype not conserved with explode=True!")