
Commit 2964d4d
Merge pull request #1247 from pyiron/h5json
HDF5 interface: add write_dict_to_hdf() and read_dict_from_hdf()
jan-janssen authored Dec 20, 2023
2 parents 4ff3c6e + a22d4b6 commit 2964d4d
Showing 3 changed files with 307 additions and 80 deletions.
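
The commit title names two new public helpers. write_dict_to_hdf() is called later in this diff with file_name, h5_path, and data_dict keywords; read_dict_from_hdf() never appears in the rendered hunks, so the mirrored call below is an assumption. A minimal round-trip sketch:

```python
from pyiron_base.storage.helper_functions import write_dict_to_hdf, read_dict_from_hdf

# Write a flat dict of nodes under one HDF5 group, with the keyword
# arguments used elsewhere in this diff.
write_dict_to_hdf(
    file_name="example.h5",
    h5_path="/data",
    data_dict={"energy": -3.14, "forces": [[0.0, 0.1, 0.0]]},
)

# Assumed mirror call; the real signature lives in the helper_functions.py
# part of this commit, which is not rendered below.
data = read_dict_from_hdf(file_name="example.h5", h5_path="/data")
```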
2 changes: 1 addition & 1 deletion .ci_support/environment-old.yml
@@ -3,7 +3,7 @@ channels:
 dependencies:
 - cloudpickle =2.0.0
 - gitpython =3.1.23
-- h5io =0.1.2
+- h5io =0.1.8
 - h5py =3.6.0
 - jinja2 =2.11.3
 - numpy =1.23.5
100 changes: 22 additions & 78 deletions pyiron_base/storage/hdfio.py
@@ -17,13 +17,20 @@
 from typing import Union, Optional, Any, Tuple
 
 from pyiron_base.utils.deprecate import deprecate
-from pyiron_base.storage.helper_functions import read_hdf5, write_hdf5
+from pyiron_base.storage.helper_functions import (
+    get_h5_path,
+    open_hdf5,
+    read_hdf5,
+    list_groups_and_nodes,
+    write_hdf5_with_json_support,
+    write_dict_to_hdf,
+    _is_ragged_in_1st_dim_only,
+)
 from pyiron_base.interfaces.has_groups import HasGroups
 from pyiron_base.state import state
 from pyiron_base.jobs.dynamic import JOB_DYN_DICT, class_constructor
 from pyiron_base.jobs.job.util import _get_safe_job_name
 import pyiron_base.project.maintenance
-import warnings
 
 __author__ = "Joerg Neugebauer, Jan Janssen"
 __copyright__ = (
@@ -37,32 +44,6 @@
 __date__ = "Sep 1, 2017"
 
 
-def _is_ragged_in_1st_dim_only(value: Union[np.ndarray, list]) -> bool:
-    """
-    Checks whether array or list of lists is ragged in the first dimension.
-    That means all other dimensions (except the first one) still have to match.
-
-    Args:
-        value (ndarray/list): array to check
-
-    Returns:
-        bool: True if elements of value are not all of the same shape
-    """
-    if isinstance(value, np.ndarray) and value.dtype != np.dtype("O"):
-        return False
-    else:
-
-        def extract_dims(v):
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                s = np.shape(v)
-            return s[0], s[1:]
-
-        dim1, dim_other = zip(*map(extract_dims, value))
-        return len(set(dim1)) > 1 and len(set(dim_other)) == 1
-
-
 # for historic reasons we write str(class) into the HDF 'TYPE' field of objects, so we need to parse this back out
 def _extract_fully_qualified_name(type_field: str) -> str:
     return type_field.split("'")[1]
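
_is_ragged_in_1st_dim_only() moves out of hdfio.py into helper_functions, as the new import block above shows. Based on the removed body, a quick illustration of what it treats as ragged:

```python
import numpy as np

from pyiron_base.storage.helper_functions import _is_ragged_in_1st_dim_only

# Sub-lists share every dimension except the first one: ragged in dim 1 only.
print(_is_ragged_in_1st_dim_only([[1, 2, 3], [4, 5]]))         # True
# A regular numeric array short-circuits on the dtype check.
print(_is_ragged_in_1st_dim_only(np.array([[1, 2], [3, 4]])))  # False
```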
@@ -162,15 +143,6 @@ def _to_object(hdf, class_name=None, **kwargs):
     return obj
 
 
-def open_hdf5(filename, mode="r", swmr=False):
-    if swmr and mode != "r":
-        store = h5py.File(filename, mode=mode, libver="latest")
-        store.swmr = True
-        return store
-    else:
-        return h5py.File(filename, mode=mode, libver="latest", swmr=swmr)
-
-
 class FileHDFio(HasGroups, MutableMapping):
     """
     Class that provides all info to access a h5 file. This class is based on h5io.py, which allows to
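
open_hdf5() likewise moves to helper_functions unchanged. Going by the removed body, the default path opens read-only with libver="latest"; a minimal sketch (the file name is hypothetical):

```python
from pyiron_base.storage.helper_functions import open_hdf5

# mode="r" and swmr=False are the defaults shown in the removed code.
with open_hdf5("example.h5") as h5:
    print(list(h5.keys()))
```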
@@ -342,29 +314,8 @@ def __setitem__(self, key, value):
         ):
             value.to_hdf(self, key)
             return
-
-        use_json = True
-        if (
-            isinstance(value, (list, np.ndarray))
-            and len(value) > 0
-            and isinstance(value[0], (list, np.ndarray))
-            and len(value[0]) > 0
-            and not isinstance(value[0][0], str)
-            and _is_ragged_in_1st_dim_only(value)
-        ):
-            # if the sub-arrays in value all share shape[1:], h5io comes up with a more efficient storage format than
-            # just writing a dataset for each element, by concatenating along the first axis and storing the indices
-            # where to break the concatenated array again
-            value = np.array([np.asarray(v) for v in value], dtype=object)
-            use_json = False
-        elif isinstance(value, tuple):
-            value = list(value)
-        write_hdf5(
-            self.file_name,
-            value,
-            title=self._get_h5_path(key),
-            overwrite="update",
-            use_json=use_json,
+        write_hdf5_with_json_support(
+            value=value, path=self._get_h5_path(key), file_handle=self.file_name
         )
 
     def __delitem__(self, key):
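
The inline ragged-array and tuple handling collapses into a single write_hdf5_with_json_support() call, so dict-style assignment on a FileHDFio is unchanged for users. A sketch of the behavior this preserves (file and key names hypothetical):

```python
from pyiron_base.storage.hdfio import FileHDFio

hdf = FileHDFio(file_name="example.h5", h5_path="/data")
# Ragged in the first dimension only: previously special-cased inline,
# now handled inside write_hdf5_with_json_support().
hdf["trajectory"] = [[1.0, 2.0, 3.0], [4.0, 5.0]]
```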
@@ -773,22 +724,12 @@ def _list_all(self):
             dict: {'groups': [list of groups], 'nodes': [list of nodes]}
         """
         if self.file_exists:
-            groups = set()
-            nodes = set()
-            with open_hdf5(self.file_name) as h:
-                try:
-                    h = h[self.h5_path]
-                    for k in h.keys():
-                        if isinstance(h[k], h5py.Group):
-                            groups.add(k)
-                        else:
-                            nodes.add(k)
-                except KeyError:
-                    pass
-            iopy_nodes = self._filter_io_objects(groups)
+            with open_hdf5(self.file_name) as hdf:
+                groups, nodes = list_groups_and_nodes(hdf=hdf, h5_path=self.h5_path)
+            iopy_nodes = self._filter_io_objects(set(groups))
             return {
-                "groups": sorted(list(groups - iopy_nodes)),
-                "nodes": sorted(list((nodes - groups).union(iopy_nodes))),
+                "groups": sorted(list(set(groups) - iopy_nodes)),
+                "nodes": sorted(list((set(nodes) - set(groups)).union(iopy_nodes))),
             }
         else:
             return {"groups": [], "nodes": []}
@@ -897,8 +838,11 @@ def hd_copy(self, hdf_old, hdf_new, exclude_groups=None, exclude_nodes=None):
                 (set(hdf_old.list_nodes()) ^ set(check_nodes))
                 & set(hdf_old.list_nodes())
             )
-        for p in node_list:
-            hdf_new[p] = hdf_old[p]
+        write_dict_to_hdf(
+            file_name=hdf_new.file_name,
+            h5_path=hdf_new.h5_path,
+            data_dict={p: hdf_old[p] for p in node_list},
+        )
         for p in group_list:
            h_new = hdf_new.create_group(p)
            ex_n = [e[-1] for e in exclude_nodes_split if p == e[0] or len(e) == 1]
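
Rather than assigning node by node through __setitem__, hd_copy() now batches every node of a group into one write_dict_to_hdf() call, trading many small writes for a single one. A hedged standalone sketch of the same pattern (paths and node names hypothetical):

```python
from pyiron_base.storage.hdfio import FileHDFio
from pyiron_base.storage.helper_functions import write_dict_to_hdf

src = FileHDFio(file_name="old.h5", h5_path="/job/output")
dst = FileHDFio(file_name="new.h5", h5_path="/job/output")
# Read each node once, then write the whole batch in a single call.
write_dict_to_hdf(
    file_name=dst.file_name,
    h5_path=dst.h5_path,
    data_dict={name: src[name] for name in ["energy", "volume"]},
)
```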
@@ -1049,7 +993,7 @@ def _get_h5_path(self, name):
         Returns:
             str: combined path
         """
-        return posixpath.join(self.h5_path, name)
+        return get_h5_path(h5_path=self.h5_path, name=name)
 
     def _get_h5io_type(self, name):
         """
285 changes: 284 additions & 1 deletion pyiron_base/storage/helper_functions.py
Large diffs are not rendered by default.
