Skip to content

Commit

Permalink
First ideas of validation based on hdf tree traversal
Browse files Browse the repository at this point in the history
  • Loading branch information
domna committed May 17, 2024
1 parent 91c0420 commit 3483bd4
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 6 deletions.
31 changes: 28 additions & 3 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml
#
annotated-types==0.6.0
# via pydantic
anytree==2.12.1
# via pynxtools (pyproject.toml)
ase==3.22.1
# via pynxtools (pyproject.toml)
babel==2.14.0
# via mkdocs-material
build==1.1.1
# via pip-tools
cachetools==5.3.3
# via pynxtools (pyproject.toml)
certifi==2024.2.2
# via requests
cfgv==3.4.0
Expand All @@ -34,6 +40,8 @@ cycler==0.12.1
# via matplotlib
distlib==0.3.8
# via virtualenv
exceptiongroup==1.2.1
# via pytest
filelock==3.13.3
# via virtualenv
fonttools==4.50.0
Expand Down Expand Up @@ -130,6 +138,10 @@ pluggy==1.4.0
# via pytest
pre-commit==3.7.0
# via pynxtools (pyproject.toml)
pydantic==2.7.1
# via pynxtools (pyproject.toml)
pydantic-core==2.18.2
# via pydantic
pygments==2.17.2
# via mkdocs-material
pymdown-extensions==10.7.1
Expand Down Expand Up @@ -176,19 +188,32 @@ ruff==0.3.4
scipy==1.12.0
# via ase
six==1.16.0
# via python-dateutil
# via
# anytree
# python-dateutil
structlog==24.1.0
# via pynxtools (pyproject.toml)
termcolor==2.4.0
# via mkdocs-macros-plugin
tomli==2.0.1
# via
# build
# coverage
# mypy
# pip-tools
# pyproject-hooks
# pytest
types-pytz==2024.1.0.20240203
# via pynxtools (pyproject.toml)
types-pyyaml==6.0.12.20240311
# via pynxtools (pyproject.toml)
types-requests==2.31.0.20240311
# via pynxtools (pyproject.toml)
typing-extensions==4.10.0
# via mypy
# via
# mypy
# pydantic
# pydantic-core
tzdata==2024.1
# via pandas
urllib3==2.2.1
Expand Down
89 changes: 86 additions & 3 deletions pynxtools/dataconverter/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
from collections import defaultdict
from functools import reduce
from operator import getitem
from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union

import h5py
import lxml.etree as ET
import numpy as np
from anytree import Resolver
from cachetools import LRUCache, cached
from cachetools.keys import hashkey

from pynxtools.dataconverter.helpers import (
Collector,
Expand All @@ -42,20 +44,101 @@
from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit


def best_namefit_of_(
    name: str, concepts: Set[str], nx_class: Optional[str] = None
) -> Optional[str]:
    """
    Find the entry of `concepts` that fits `name` best.

    NOT IMPLEMENTED YET: this placeholder always returns None.

    Args:
        name: The name to find a fit for.
        concepts: The candidate concept names.
        nx_class: Optional class hint that the final implementation is
            meant to take into account (see TODO below).

    Returns:
        The best fitting entry of `concepts`, or None if there is no fit.
        Currently always None; callers must handle the None case.
    """
    # TODO: Find the best namefit of name in concepts
    # Consider nx_class if it is not None
    return None


def validate_hdf_group_against(appdef: str, data: h5py.Group):
    """
    Validate the members of an HDF5 group against an application definition.

    THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET!

    Every group and dataset below `data` (and each of their attributes) is
    resolved to a node of the tree built by `generate_tree_from(appdef)`.
    Paths that resolve are removed from the collection of required fields
    and attributes; whatever is left after the traversal is reported via
    `collector` as a missing required field or attribute.

    Args:
        appdef: The application definition passed to `generate_tree_from`.
        data: The HDF5 group whose members are traversed.
    """

    # Only cache based on path. That way we retain the nx_class information
    # in the tree
    # Allow for 10000 cache entries. This should be enough for most cases
    @cached(
        cache=LRUCache(maxsize=10000),
        # Accept (and ignore) an optional nx_class, positional or keyword,
        # so that one-argument calls do not fail while building the key.
        key=lambda path, *_args, **_kwargs: hashkey(path),
    )
    def find_node_for(path: str, nx_class: Optional[str] = None) -> Optional[NexusNode]:
        """Resolve `path` to a tree node, or None if it cannot be matched."""
        if path == "":
            return tree

        # rpartition also handles top-level names without a "/":
        # "entry" -> ("", "", "entry"), i.e. the parent is the tree root.
        prev_path, _, last_elem = path.rpartition("/")
        node = find_node_for(prev_path)
        if node is None:
            # The parent is not part of the tree, so the child cannot be either.
            return None

        best_child = best_namefit_of_(
            last_elem,
            # TODO: Consider renaming `get_all_children_names` to
            # `get_all_direct_children_names`. Because that's what it is.
            node.get_all_children_names(),
            nx_class,
        )
        if best_child is None:
            return None

        return node.search_child_with_name(best_child)

    def remove_from_req_fields(path: str):
        """Mark `path` as found by dropping it from the required names."""
        if path in required_fields:
            required_fields.remove(path)

    def handle_group(path: str, group: h5py.Group):
        node = find_node_for(path, group.attrs.get("NX_class"))
        if node is None:
            # TODO: Log undocumented
            return

        # TODO: Do actual group checks

    def handle_field(path: str, dataset: h5py.Dataset):
        node = find_node_for(path)
        if node is None:
            # TODO: Log undocumented
            return
        remove_from_req_fields(path)

        # TODO: Do actual field checks

    def handle_attributes(path: str, attribute_names: h5py.AttributeManager):
        for attr_name in attribute_names:
            # NOTE(review): the lookup uses "path/attr" while the required
            # name uses "path/@attr" -- confirm which form the tree expects.
            node = find_node_for(f"{path}/{attr_name}")
            if node is None:
                # TODO: Log undocumented
                continue
            remove_from_req_fields(f"{path}/@{attr_name}")

            # TODO: Do actual attribute checks

    def validate(path: str, item: Union[h5py.Group, h5py.Dataset]):
        # Namefit name against tree (use recursive caching)
        if isinstance(item, h5py.Group):
            handle_group(path, item)
        elif isinstance(item, h5py.Dataset):
            handle_field(path, item)

        handle_attributes(path, item.attrs)

    tree = generate_tree_from(appdef)
    required_fields = tree.required_fields_and_attrs_names()
    # h5py's recursive visitor is `visititems`; it calls validate(name, object)
    # for each member and continues as long as the callback returns None.
    data.visititems(validate)

    # Everything still listed as required was not found during the traversal.
    for req_field in required_fields:
        if "@" in req_field:
            collector.collect_and_log(
                req_field, ValidationProblem.MissingRequiredAttribute, None
            )
        else:
            collector.collect_and_log(
                req_field, ValidationProblem.MissingRequiredField, None
            )


def build_nested_dict_from(
mapping: Mapping[str, Any],
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies = [
"lxml>=4.9.1",
"anytree",
"pydantic",
"cachetools",
]

[project.urls]
Expand Down

0 comments on commit 3483bd4

Please sign in to comment.