Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validation based on hdf tree traversal #333

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
284333e
Read extends keyword from file
domna Jun 4, 2024
92a2be0
Insert extends parents into the inheritance chain
domna Jun 6, 2024
0e3b844
Automatically populate tree from appdef parents
domna Jun 6, 2024
d7140e3
Only populate tree if parents are present
domna Jun 6, 2024
9d64431
Docstring improvements
domna Jun 6, 2024
f2dfd9f
Fix exact match in NX_CLASS[path] notation
domna Jun 7, 2024
37f6497
If minOccurs == 0, set the group to optional
domna Jun 7, 2024
5cb15e4
Add extended NXtest
domna Jun 7, 2024
a96d2d2
First ideas of validation based on hdf tree traversal
domna May 17, 2024
9eab38c
Improved get_children_names function
domna May 21, 2024
a756a13
Update requirements
domna Jun 7, 2024
4ef58b8
Merge branch 'support-extends-in-nexus-tree' into hdf-based-validation
domna Jun 7, 2024
aa6ac27
Fix function name
domna Jun 7, 2024
71d0bff
Do actual field checks
domna Jun 7, 2024
25ee741
Add cli and units support
domna Jun 10, 2024
470bd7d
Include units files in package
domna Jun 12, 2024
7fcaadb
Add working validation
domna Jun 12, 2024
46fb58d
Use node_type in find_node_for
domna Jun 12, 2024
4c7584b
Fix tests
domna Jun 12, 2024
66d239a
Merge branch 'master' into hdf-based-validation
domna Jun 14, 2024
6affe10
Merge branch 'master' into hdf-based-validation
domna Jun 14, 2024
ab221b4
Merge branch 'master' into hdf-based-validation
domna Aug 12, 2024
533f0a0
Fixes from merging
domna Aug 12, 2024
007e3be
Re-add pint to the dependencies
domna Aug 12, 2024
f5d05c3
Remove distutils import
domna Aug 12, 2024
0ab02bc
Import ureg for units validation
domna Aug 12, 2024
561ef86
Adding some test files for hdf5_validator.
RubelMozumder Aug 13, 2024
2df5d72
pytest for hdf5 validation.
RubelMozumder Aug 14, 2024
de56834
test for nexus file validation.
RubelMozumder Aug 14, 2024
9bcad38
VisitingCode-1
RubelMozumder Oct 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ cython_debug/
!dev-requirements.txt
!mkdocs-requirements.txt
!src/pynxtools/nexus-version.txt
!src/pynxtools/dataconverter/units/constants_en.txt
!src/pynxtools/dataconverter/units/default_en.txt
build/
nexusparser.egg-info/PKG-INFO
.python-version
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ recursive-include src/pynxtools/definitions/applications/ *.xml
recursive-include src/pynxtools/definitions/contributed_definitions/ *.xml
include src/pynxtools/definitions/*.xsd
include src/pynxtools/nexus-version.txt
recursive-include src/pynxtools/dataconverter/units *.txt
include src/pynxtools/definitions/NXDL_VERSION

18 changes: 16 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
# uv pip compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml
anytree==2.12.1
# via pynxtools (pyproject.toml)
appdirs==1.4.4
# via pint
ase==3.23.0
# via pynxtools (pyproject.toml)
babel==2.15.0
# via mkdocs-material
cachetools==5.4.0
# via pynxtools (pyproject.toml)
certifi==2024.7.4
# via requests
cfgv==3.4.0
Expand Down Expand Up @@ -34,6 +38,10 @@ exceptiongroup==1.2.1
# via pytest
filelock==3.15.4
# via virtualenv
flexcache==0.3
# via pint
flexparser==0.3.1
# via pint
fonttools==4.53.1
# via matplotlib
ghp-import==2.1.0
Expand Down Expand Up @@ -123,6 +131,8 @@ pathspec==0.12.1
# via mkdocs
pillow==10.4.0
# via matplotlib
pint==0.24.3
# via pynxtools (pyproject.toml)
platformdirs==4.2.2
# via
# mkdocs-get-deps
Expand Down Expand Up @@ -169,7 +179,7 @@ regex==2024.5.15
# via mkdocs-material
requests==2.32.3
# via mkdocs-material
ruff==0.4.8
ruff==0.5.5
# via pynxtools (pyproject.toml)
scipy==1.14.0
# via ase
Expand All @@ -193,7 +203,11 @@ types-pyyaml==6.0.12.20240311
types-requests==2.32.0.20240622
# via pynxtools (pyproject.toml)
typing-extensions==4.12.2
# via mypy
# via
# flexcache
# flexparser
# mypy
# pint
tzdata==2024.1
# via pandas
urllib3==2.2.2
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ dependencies = [
"importlib-metadata",
"lxml>=4.9.1",
"anytree",
"cachetools",
"pint>=0.17",
]

[project.urls]
Expand Down Expand Up @@ -98,9 +100,11 @@ nexus_data_converter = "pynxtools.nomad.entrypoints:nexus_data_converter"
read_nexus = "pynxtools.nexus.nexus:main"
dataconverter = "pynxtools.dataconverter.convert:main_cli"
generate_eln = "pynxtools.eln_mapper.eln_mapper:get_eln"
verify_nexus = "pynxtools.dataconverter.verify:verify"

[tool.setuptools.package-data]
pynxtools = ["definitions/**/*.xml", "definitions/**/*.xsd"]
"pynxtools.dataconverter.units" = ["*.txt"]
"pynxtools.dataconverter.readers.hall" = ["enum_map.json"]
"pynxtools.dataconverter.readers.rii_database.formula_parser" = ["dispersion_function_grammar.lark"]

Expand Down
108 changes: 90 additions & 18 deletions src/pynxtools/dataconverter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@
import numpy as np
from ase.data import chemical_symbols

if np.lib.NumpyVersion(np.__version__) < "2.0.0":
from numpy import chararray
else:
from numpy.char import chararray
from pint import UndefinedUnitError

from pynxtools import get_nexus_version, get_nexus_version_hash
from pynxtools.dataconverter.units import ureg
from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
get_enums,
get_inherited_nodes,
Expand Down Expand Up @@ -91,10 +98,7 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
elif log_type == ValidationProblem.MissingRequiredGroup:
logger.warning(f"The required group, {path}, hasn't been supplied.")
elif log_type == ValidationProblem.MissingRequiredField:
logger.warning(
f"The data entry corresponding to {path} is required "
"and hasn't been supplied by the reader.",
)
logger.warning(f"Missing field: {path}")
elif log_type == ValidationProblem.InvalidType:
logger.warning(
f"The value at {path} should be one of: {value}"
Expand Down Expand Up @@ -299,24 +303,35 @@ def get_nxdl_root_and_path(nxdl: str):
special_names = {
"NXtest": os.path.join(data_path, "NXtest.nxdl.xml"),
"NXtest_extended": os.path.join(data_path, "NXtest_extended.nxdl.xml"),
"NXhdf5_validator_1": os.path.join(data_path, "NXhdf5_validator_1.nxdl.xml"),
"NXhdf5_validator_2": os.path.join(data_path, "NXhdf5_validator_2.nxdl.xml"),
}

probable_file_paths = [
os.path.join(definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"),
os.path.join(definitions_path, "applications", f"{nxdl}.nxdl.xml"),
os.path.join(definitions_path, "base_classes", f"{nxdl}.nxdl.xml"),
os.path.join(
definitions_path, "dev_tools/tests/test_nxdls", f"{nxdl}.nxdl.xml"
),
]
if nxdl in special_names:
nxdl_f_path = special_names[nxdl]
else:
nxdl_f_path = os.path.join(
definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
nxdl_f_path = os.path.join(
definitions_path, "applications", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
nxdl_f_path = os.path.join(
definitions_path, "base_classes", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.")
nxdl_f_path = next(x for x in probable_file_paths if os.path.exists(x))
# nxdl_f_path = os.path.join(
# definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# nxdl_f_path = os.path.join(
# definitions_path, "applications", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# nxdl_f_path = os.path.join(
# definitions_path, "base_classes", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.")

return ET.parse(nxdl_f_path).getroot(), nxdl_f_path

Expand Down Expand Up @@ -582,7 +597,7 @@ def is_value_valid_element_of_enum(value, elist) -> Tuple[bool, list]:
"ISO8601": (str,),
"NX_BINARY": (bytes, bytearray, np.byte, np.ubyte, np.ndarray),
"NX_BOOLEAN": (bool, np.ndarray, np.bool_),
"NX_CHAR": (str, np.ndarray, np.chararray),
"NX_CHAR": (str, np.ndarray, chararray),
"NX_DATE_TIME": (str,),
"NX_FLOAT": (float, np.ndarray, np.floating),
"NX_INT": (int, np.ndarray, np.signedinteger),
Expand Down Expand Up @@ -647,6 +662,63 @@ def convert_str_to_bool_safe(value):
return None


def clean_str_attr(
    attr: Optional[Union[str, bytes]], encoding: str = "utf-8"
) -> Optional[str]:
    """
    Cleans the string attribute which means it will decode bytes to str if necessary.
    If `attr` is not str, bytes or None it raises a TypeError.

    Args:
        attr: The attribute value to normalize. May already be ``str``,
            raw ``bytes`` (as typically read from an HDF5 file), or ``None``.
        encoding: Codec used to decode ``bytes`` values. Defaults to "utf-8".

    Returns:
        The decoded (or unchanged) string, or ``None`` if ``attr`` is ``None``.

    Raises:
        TypeError: If ``attr`` is neither ``None``, ``bytes`` nor ``str``.
    """
    if attr is None:
        return attr
    if isinstance(attr, bytes):
        return attr.decode(encoding)
    if isinstance(attr, str):
        return attr

    # Bug fix: the original message was a plain string containing a literal
    # "{type}" placeholder (missing f-prefix); report the actual offending type.
    raise TypeError(
        f"Invalid type {type(attr)} for attribute. Should be either None, bytes or str."
    )


def is_valid_unit(
    unit: str, nx_category: str, transformation_type: Optional[str]
) -> bool:
    """
    Check whether the provided unit belongs to the provided nexus unit category.

    Args:
        unit (str): The unit to check. Should be according to pint.
        nx_category (str): A nexus unit category, e.g. `NX_LENGTH`,
            or derived unit category, e.g., `NX_LENGTH ** 2`.
        transformation_type (Optional[str]):
            The transformation type of an NX_TRANSFORMATION.
            This parameter is ignored if the `nx_category` is not `NX_TRANSFORMATION`.
            If `transformation_type` is not present this should be set to None.

    Returns:
        bool: The unit belongs to the provided category.
    """
    unit = clean_str_attr(unit)
    try:
        # Bug fix: the original check was `nx_category in ("NX_ANY")`, which is
        # a substring test against the string "NX_ANY" (parentheses without a
        # comma do not create a tuple), so fragments like "NX_AN" also matched.
        if nx_category == "NX_ANY":
            ureg(unit)  # Check if unit is generally valid
            return True
        # Wrap each NX_* token in brackets so pint treats it as a dimension,
        # e.g. "NX_LENGTH ** 2" -> "[NX_LENGTH] ** 2".
        nx_category = re.sub(r"(NX_[A-Z]+)", r"[\1]", nx_category)
        if nx_category == "[NX_TRANSFORMATION]":
            # NX_TRANSFORMATIONS is a pseudo unit
            # and can be either an angle, a length or unitless
            # depending on the transformation type.
            if transformation_type is None:
                return ureg(unit).check("[NX_UNITLESS]")
            if transformation_type == "translation":
                return ureg(unit).check("[NX_LENGTH]")
            if transformation_type == "rotation":
                return ureg(unit).check("[NX_ANGLE]")
            return False
        return ureg(unit).check(nx_category)
    except UndefinedUnitError:
        return False


def is_valid_data_field(value, nxdl_type, path) -> bool:
"""Checks whether a given value is valid according to what is defined in the NXDL.

Expand Down
23 changes: 17 additions & 6 deletions src/pynxtools/dataconverter/nexus_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"""

from functools import reduce
from typing import Any, List, Literal, Optional, Set, Tuple, Union
from typing import Any, List, Literal, Optional, Set, Tuple

import lxml.etree as ET
from anytree.node.nodemixin import NodeMixin
Expand Down Expand Up @@ -142,18 +142,24 @@ class NexusNode(NodeMixin):
The inverse of the above `is_a`. In the example case
`DATA` `parent_of` `my_data`.
"""

# TODO rename type to nx_type in every place
name: str
type: Literal["group", "field", "attribute", "choice"]
optionality: Literal["required", "recommended", "optional"] = "required"
name_type: Literal["any", "partial"]
variadic: bool = False
inheritance: List[ET._Element]
is_a: List["NexusNode"]
parent_of: List["NexusNode"]
occurrence_limits: Tuple[
# TODO: Use Annotated[int, Field(strict=True, ge=0)] for py>3.8
Optional[int],
Optional[int],
] = (None, None)

def _set_optionality(self):
"""
Sets the optionality of the current node
Sets the optionality of the current node based on the inheritance chain.
if `recommended`, `required` or `optional` is set.
Also sets the field to optional if `maxOccurs == 0` or to required
if `maxOccurs > 0`.
Expand All @@ -179,6 +185,7 @@ def __init__(
type: Literal["group", "field", "attribute", "choice"],
optionality: Literal["required", "recommended", "optional"] = "required",
variadic: Optional[bool] = None,
name_type: Optional[Literal["any", "partial"]] = None,
parent: Optional["NexusNode"] = None,
inheritance: Optional[List[Any]] = None,
) -> None:
Expand All @@ -187,6 +194,7 @@ def __init__(
self.type = type
self.optionality = optionality
self.variadic = contains_uppercase(self.name)
self.name_type = name_type
if variadic is not None:
self.variadic = variadic
if inheritance is not None:
Expand Down Expand Up @@ -222,6 +230,9 @@ def get_path(self) -> str:
while current_node.parent is not None:
names.insert(0, current_node.name)
current_node = current_node.parent

if self.type == "attribute" and names:
names[-1] = f"@{names[-1]}"
return "/" + "/".join(names)

def search_add_child_for_multiple(
Expand Down Expand Up @@ -793,10 +804,10 @@ def _set_items(self):
if not self.dtype == "NX_CHAR":
return
for elem in self.inheritance:
enum = elem.find(f"nx:enumeration", namespaces=namespaces)
enum = elem.find("nx:enumeration", namespaces=namespaces)
if enum is not None:
self.items = []
for items in enum.findall(f"nx:item", namespaces=namespaces):
for items in enum.findall("nx:item", namespaces=namespaces):
self.items.append(items.attrib["value"])
return

Expand All @@ -806,7 +817,7 @@ def _set_shape(self):
The first vale found is used.
"""
for elem in self.inheritance:
dimension = elem.find(f"nx:dimensions", namespaces=namespaces)
dimension = elem.find("nx:dimensions", namespaces=namespaces)
if dimension is not None:
break
if not self.inheritance or dimension is None:
Expand Down
1 change: 0 additions & 1 deletion src/pynxtools/dataconverter/readers/example/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

#
"""An example reader implementation for the DataConverter."""

import json
Expand Down
24 changes: 24 additions & 0 deletions src/pynxtools/dataconverter/units/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""A unit registry for nexus units"""

import os

from pint import UnitRegistry

# Module-wide pint registry built from the bundled `default_en.txt` instead of
# pint's default definition file; presumably that file adds the NeXus unit
# categories (e.g. [NX_LENGTH]) used for validation — confirm against the
# packaged units data files.
ureg = UnitRegistry(os.path.join(os.path.dirname(__file__), "default_en.txt"))
Loading
Loading