From 3483bd43ef99e039050c808a371582bdedfccc5c Mon Sep 17 00:00:00 2001 From: domna Date: Fri, 17 May 2024 12:23:03 +0200 Subject: [PATCH] First ideas of validation based on hdf tree traversal --- dev-requirements.txt | 31 +++++++++- pynxtools/dataconverter/validation.py | 89 ++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 115 insertions(+), 6 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 6f5f67de1..aadbbfaee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,15 +1,21 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml # +annotated-types==0.6.0 + # via pydantic +anytree==2.12.1 + # via pynxtools (pyproject.toml) ase==3.22.1 # via pynxtools (pyproject.toml) babel==2.14.0 # via mkdocs-material build==1.1.1 # via pip-tools +cachetools==5.3.3 + # via pynxtools (pyproject.toml) certifi==2024.2.2 # via requests cfgv==3.4.0 @@ -34,6 +40,8 @@ cycler==0.12.1 # via matplotlib distlib==0.3.8 # via virtualenv +exceptiongroup==1.2.1 + # via pytest filelock==3.13.3 # via virtualenv fonttools==4.50.0 @@ -130,6 +138,10 @@ pluggy==1.4.0 # via pytest pre-commit==3.7.0 # via pynxtools (pyproject.toml) +pydantic==2.7.1 + # via pynxtools (pyproject.toml) +pydantic-core==2.18.2 + # via pydantic pygments==2.17.2 # via mkdocs-material pymdown-extensions==10.7.1 @@ -176,11 +188,21 @@ ruff==0.3.4 scipy==1.12.0 # via ase six==1.16.0 - # via python-dateutil + # via + # anytree + # python-dateutil structlog==24.1.0 # via pynxtools (pyproject.toml) termcolor==2.4.0 # via mkdocs-macros-plugin +tomli==2.0.1 + # via + # build + # coverage + # mypy + # pip-tools + # pyproject-hooks + # pytest types-pytz==2024.1.0.20240203 # via pynxtools (pyproject.toml) types-pyyaml==6.0.12.20240311 @@ -188,7 +210,10 @@ types-pyyaml==6.0.12.20240311 
types-requests==2.31.0.20240311 # via pynxtools (pyproject.toml) typing-extensions==4.10.0 - # via mypy + # via + # mypy + # pydantic + # pydantic-core tzdata==2024.1 # via pandas urllib3==2.2.1 diff --git a/pynxtools/dataconverter/validation.py b/pynxtools/dataconverter/validation.py index 26025eb04..0af37686b 100644 --- a/pynxtools/dataconverter/validation.py +++ b/pynxtools/dataconverter/validation.py @@ -20,12 +20,14 @@ from collections import defaultdict from functools import reduce from operator import getitem -from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union import h5py import lxml.etree as ET import numpy as np from anytree import Resolver +from cachetools import LRUCache, cached +from cachetools.keys import hashkey from pynxtools.dataconverter.helpers import ( Collector, @@ -42,6 +44,14 @@ from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit +def best_namefit_of_( + name: str, concepts: Set[str], nx_class: Optional[str] = None +) -> str: + # TODO: Find the best namefit of name in concepts + # Consider nx_class if it is not None + ... + + def validate_hdf_group_against(appdef: str, data: h5py.Group): """ Checks whether all the required paths from the template are returned in data dict. @@ -49,13 +59,86 @@ def validate_hdf_group_against(appdef: str, data: h5py.Group): THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET! """ - def validate(name: str, data: Union[h5py.Group, h5py.Dataset]): + # Only cache based on path. That way we retain the nx_class information + # in the tree + # Allow for 10000 cache entries. 
This should be enough for most cases @cached( cache=LRUCache(maxsize=10000), key=lambda path, nx_class=None: hashkey(path), ) def find_node_for(path: str, nx_class: Optional[str] = None) -> Optional[NexusNode]: if path == "": return tree prev_path, last_elem = path.rsplit("/", 1) node = find_node_for(prev_path) best_child = best_namefit_of_( last_elem, # TODO: Consider renaming `get_all_children_names` to # `get_all_direct_children_names`. Because that's what it is. node.get_all_children_names(), nx_class, ) if best_child is None: return None return node.search_child_with_name(best_child) def remove_from_req_fields(path: str): if path in required_fields: required_fields.remove(path) def handle_group(path: str, data: h5py.Group): node = find_node_for(path, data.attrs.get("NX_class")) if node is None: # TODO: Log undocumented return # TODO: Do actual group checks def handle_field(path: str, data: h5py.Dataset): node = find_node_for(path) if node is None: # TODO: Log undocumented return remove_from_req_fields(f"{path}") # TODO: Do actual field checks def handle_attributes(path: str, attribute_names: h5py.AttributeManager): for attr_name in attribute_names: node = find_node_for(f"{path}/{attr_name}") if node is None: # TODO: Log undocumented continue remove_from_req_fields(f"{path}/@{attr_name}") # TODO: Do actual attribute checks def validate(path: str, data: Union[h5py.Group, h5py.Dataset]): # Namefit name against tree (use recursive caching) - pass + if isinstance(data, h5py.Group): handle_group(path, data) elif isinstance(data, h5py.Dataset): handle_field(path, data) handle_attributes(path, data.attrs) tree = generate_tree_from(appdef) + required_fields = tree.required_fields_and_attrs_names() data.visititems(validate) + for req_field in required_fields: if "@" in req_field: collector.collect_and_log( req_field, ValidationProblem.MissingRequiredAttribute,
None + ) + continue + collector.collect_and_log( + req_field, ValidationProblem.MissingRequiredField, None + ) + def build_nested_dict_from( mapping: Mapping[str, Any], diff --git a/pyproject.toml b/pyproject.toml index 362475d3c..a0c097df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "lxml>=4.9.1", "anytree", "pydantic", + "cachetools", ] [project.urls]