From 3483bd43ef99e039050c808a371582bdedfccc5c Mon Sep 17 00:00:00 2001 From: domna Date: Fri, 17 May 2024 12:23:03 +0200 Subject: [PATCH] First ideas of validation based on hdf tree traversal --- dev-requirements.txt | 31 +++++++++- pynxtools/dataconverter/validation.py | 89 ++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 115 insertions(+), 6 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 6f5f67de1..aadbbfaee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,15 +1,21 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml # +annotated-types==0.6.0 + # via pydantic +anytree==2.12.1 + # via pynxtools (pyproject.toml) ase==3.22.1 # via pynxtools (pyproject.toml) babel==2.14.0 # via mkdocs-material build==1.1.1 # via pip-tools +cachetools==5.3.3 + # via pynxtools (pyproject.toml) certifi==2024.2.2 # via requests cfgv==3.4.0 @@ -34,6 +40,8 @@ cycler==0.12.1 # via matplotlib distlib==0.3.8 # via virtualenv +exceptiongroup==1.2.1 + # via pytest filelock==3.13.3 # via virtualenv fonttools==4.50.0 @@ -130,6 +138,10 @@ pluggy==1.4.0 # via pytest pre-commit==3.7.0 # via pynxtools (pyproject.toml) +pydantic==2.7.1 + # via pynxtools (pyproject.toml) +pydantic-core==2.18.2 + # via pydantic pygments==2.17.2 # via mkdocs-material pymdown-extensions==10.7.1 @@ -176,11 +188,21 @@ ruff==0.3.4 scipy==1.12.0 # via ase six==1.16.0 - # via python-dateutil + # via + # anytree + # python-dateutil structlog==24.1.0 # via pynxtools (pyproject.toml) termcolor==2.4.0 # via mkdocs-macros-plugin +tomli==2.0.1 + # via + # build + # coverage + # mypy + # pip-tools + # pyproject-hooks + # pytest types-pytz==2024.1.0.20240203 # via pynxtools (pyproject.toml) types-pyyaml==6.0.12.20240311 @@ -188,7 +210,10 @@ types-pyyaml==6.0.12.20240311 
types-requests==2.31.0.20240311 # via pynxtools (pyproject.toml) typing-extensions==4.10.0 - # via mypy + # via + # mypy + # pydantic + # pydantic-core tzdata==2024.1 # via pandas urllib3==2.2.1 diff --git a/pynxtools/dataconverter/validation.py b/pynxtools/dataconverter/validation.py index 26025eb04..0af37686b 100644 --- a/pynxtools/dataconverter/validation.py +++ b/pynxtools/dataconverter/validation.py @@ -20,12 +20,14 @@ from collections import defaultdict from functools import reduce from operator import getitem -from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union import h5py import lxml.etree as ET import numpy as np from anytree import Resolver +from cachetools import LRUCache, cached +from cachetools.keys import hashkey from pynxtools.dataconverter.helpers import ( Collector, @@ -42,6 +44,14 @@ from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit +def best_namefit_of_( + name: str, concepts: Set[str], nx_class: Optional[str] = None +) -> str: + # TODO: Find the best namefit of name in concepts + # Consider nx_class if it is not None + ... + + def validate_hdf_group_against(appdef: str, data: h5py.Group): """ Checks whether all the required paths from the template are returned in data dict. @@ -49,13 +59,86 @@ def validate_hdf_group_against(appdef: str, data: h5py.Group): THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET! """ - def validate(name: str, data: Union[h5py.Group, h5py.Dataset]): + # Only cache based on path. That way we retain the nx_class information + # in the tree + # Allow for 10000 cache entries. 
This should be enough for most cases @cached( cache=LRUCache(maxsize=10000), key=lambda path, nx_class=None: hashkey(path), ) def find_node_for(path: str, nx_class: Optional[str] = None) -> Optional[NexusNode]: if path == "": return tree prev_path, last_elem = path.rsplit("/", 1) node = find_node_for(prev_path) best_child = best_namefit_of_( last_elem, # TODO: Consider renaming `get_all_children_names` to # `get_all_direct_children_names`. Because that's what it is. node.get_all_children_names(), nx_class, ) if best_child is None: return None return node.search_child_with_name(best_child) def remove_from_req_fields(path: str): if path in required_fields: required_fields.remove(path) def handle_group(path: str, data: h5py.Group): node = find_node_for(path, data.attrs.get("NX_class")) if node is None: # TODO: Log undocumented return # TODO: Do actual group checks def handle_field(path: str, data: h5py.Dataset): node = find_node_for(path) if node is None: # TODO: Log undocumented return remove_from_req_fields(f"{path}") # TODO: Do actual field checks def handle_attributes(path: str, attribute_names: h5py.AttributeManager): for attr_name in attribute_names: node = find_node_for(f"{path}/{attr_name}") if node is None: # TODO: Log undocumented continue remove_from_req_fields(f"{path}/@{attr_name}") # TODO: Do actual attribute checks def validate(path: str, data: Union[h5py.Group, h5py.Dataset]): # Namefit name against tree (use recursive caching) - pass + if isinstance(data, h5py.Group): handle_group(path, data) elif isinstance(data, h5py.Dataset): handle_field(path, data) handle_attributes(path, data.attrs) tree = generate_tree_from(appdef) + required_fields = tree.required_fields_and_attrs_names() data.visititems(validate) + for req_field in required_fields: if "@" in req_field: collector.collect_and_log( req_field, ValidationProblem.MissingRequiredAttribute,
None + ) + continue + collector.collect_and_log( + req_field, ValidationProblem.MissingRequiredField, None + ) + def build_nested_dict_from( mapping: Mapping[str, Any], diff --git a/pyproject.toml b/pyproject.toml index 362475d3c..a0c097df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "lxml>=4.9.1", "anytree", "pydantic", + "cachetools", ] [project.urls]