Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validation based on hdf tree traversal #333

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
284333e
Read extends keyword from file
domna Jun 4, 2024
92a2be0
Insert extends parents into the inheritance chain
domna Jun 6, 2024
0e3b844
Automatically populate tree from appdef parents
domna Jun 6, 2024
d7140e3
Only populate tree if parents are present
domna Jun 6, 2024
9d64431
Docstring improvements
domna Jun 6, 2024
f2dfd9f
Fix exact match in NX_CLASS[path] notation
domna Jun 7, 2024
37f6497
If minOccurs == 0, set the group to optional
domna Jun 7, 2024
5cb15e4
Add extended NXtest
domna Jun 7, 2024
a96d2d2
First ideas of validation based on hdf tree traversal
domna May 17, 2024
9eab38c
Improved get_children_names function
domna May 21, 2024
a756a13
Update requirements
domna Jun 7, 2024
4ef58b8
Merge branch 'support-extends-in-nexus-tree' into hdf-based-validation
domna Jun 7, 2024
aa6ac27
Fix function name
domna Jun 7, 2024
71d0bff
Do actual field checks
domna Jun 7, 2024
25ee741
Add cli and units support
domna Jun 10, 2024
470bd7d
Include units files in package
domna Jun 12, 2024
7fcaadb
Add working validation
domna Jun 12, 2024
46fb58d
Use node_type in find_node_for
domna Jun 12, 2024
4c7584b
Fix tests
domna Jun 12, 2024
66d239a
Merge branch 'master' into hdf-based-validation
domna Jun 14, 2024
6affe10
Merge branch 'master' into hdf-based-validation
domna Jun 14, 2024
ab221b4
Merge branch 'master' into hdf-based-validation
domna Aug 12, 2024
533f0a0
Fixes from merging
domna Aug 12, 2024
007e3be
Re-add pint to the dependencies
domna Aug 12, 2024
f5d05c3
Remove distutils import
domna Aug 12, 2024
0ab02bc
Import ureg for units validation
domna Aug 12, 2024
561ef86
Adding some test files for hdf5_validator.
RubelMozumder Aug 13, 2024
2df5d72
pytest for hdf5 validation.
RubelMozumder Aug 14, 2024
de56834
test for nexus file validation.
RubelMozumder Aug 14, 2024
9bcad38
VisitingCode-1
RubelMozumder Oct 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ cython_debug/
!dev-requirements.txt
!mkdocs-requirements.txt
!src/pynxtools/nexus-version.txt
!src/pynxtools/dataconverter/units/constants_en.txt
!src/pynxtools/dataconverter/units/default_en.txt
build/
nexusparser.egg-info/PKG-INFO
.python-version
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ recursive-include src/pynxtools/definitions/applications/ *.xml
recursive-include src/pynxtools/definitions/contributed_definitions/ *.xml
include src/pynxtools/definitions/*.xsd
include src/pynxtools/nexus-version.txt
recursive-include src/pynxtools/dataconverter/units *.txt
include src/pynxtools/definitions/NXDL_VERSION

18 changes: 16 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
# uv pip compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml
anytree==2.12.1
# via pynxtools (pyproject.toml)
appdirs==1.4.4
# via pint
ase==3.23.0
# via pynxtools (pyproject.toml)
babel==2.15.0
# via mkdocs-material
cachetools==5.4.0
# via pynxtools (pyproject.toml)
certifi==2024.7.4
# via requests
cfgv==3.4.0
Expand Down Expand Up @@ -34,6 +38,10 @@ exceptiongroup==1.2.1
# via pytest
filelock==3.15.4
# via virtualenv
flexcache==0.3
# via pint
flexparser==0.3.1
# via pint
fonttools==4.53.1
# via matplotlib
ghp-import==2.1.0
Expand Down Expand Up @@ -123,6 +131,8 @@ pathspec==0.12.1
# via mkdocs
pillow==10.4.0
# via matplotlib
pint==0.24.3
# via pynxtools (pyproject.toml)
platformdirs==4.2.2
# via
# mkdocs-get-deps
Expand Down Expand Up @@ -169,7 +179,7 @@ regex==2024.5.15
# via mkdocs-material
requests==2.32.3
# via mkdocs-material
ruff==0.4.8
ruff==0.5.5
# via pynxtools (pyproject.toml)
scipy==1.14.0
# via ase
Expand All @@ -193,7 +203,11 @@ types-pyyaml==6.0.12.20240311
types-requests==2.32.0.20240622
# via pynxtools (pyproject.toml)
typing-extensions==4.12.2
# via mypy
# via
# flexcache
# flexparser
# mypy
# pint
tzdata==2024.1
# via pandas
urllib3==2.2.2
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ dependencies = [
"importlib-metadata",
"lxml>=4.9.1",
"anytree",
"cachetools",
"pint>=0.17",
]

[project.urls]
Expand Down Expand Up @@ -98,9 +100,11 @@ nexus_data_converter = "pynxtools.nomad.entrypoints:nexus_data_converter"
read_nexus = "pynxtools.nexus.nexus:main"
dataconverter = "pynxtools.dataconverter.convert:main_cli"
generate_eln = "pynxtools.eln_mapper.eln_mapper:get_eln"
verify_nexus = "pynxtools.dataconverter.verify:verify"

[tool.setuptools.package-data]
pynxtools = ["definitions/**/*.xml", "definitions/**/*.xsd"]
"pynxtools.dataconverter.units" = ["*.txt"]
"pynxtools.dataconverter.readers.hall" = ["enum_map.json"]
"pynxtools.dataconverter.readers.rii_database.formula_parser" = ["dispersion_function_grammar.lark"]

Expand Down
108 changes: 90 additions & 18 deletions src/pynxtools/dataconverter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@
import numpy as np
from ase.data import chemical_symbols

if np.lib.NumpyVersion(np.__version__) < "2.0.0":
from numpy import chararray
else:
from numpy.char import chararray
from pint import UndefinedUnitError

from pynxtools import get_nexus_version, get_nexus_version_hash
from pynxtools.dataconverter.units import ureg
from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
get_enums,
get_inherited_nodes,
Expand Down Expand Up @@ -91,10 +98,7 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar
elif log_type == ValidationProblem.MissingRequiredGroup:
logger.warning(f"The required group, {path}, hasn't been supplied.")
elif log_type == ValidationProblem.MissingRequiredField:
logger.warning(
f"The data entry corresponding to {path} is required "
"and hasn't been supplied by the reader.",
)
logger.warning(f"Missing field: {path}")
elif log_type == ValidationProblem.InvalidType:
logger.warning(
f"The value at {path} should be one of: {value}"
Expand Down Expand Up @@ -299,24 +303,35 @@ def get_nxdl_root_and_path(nxdl: str):
special_names = {
"NXtest": os.path.join(data_path, "NXtest.nxdl.xml"),
"NXtest_extended": os.path.join(data_path, "NXtest_extended.nxdl.xml"),
"NXhdf5_validator_1": os.path.join(data_path, "NXhdf5_validator_1.nxdl.xml"),
"NXhdf5_validator_2": os.path.join(data_path, "NXhdf5_validator_2.nxdl.xml"),
}

probable_file_paths = [
os.path.join(definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"),
os.path.join(definitions_path, "applications", f"{nxdl}.nxdl.xml"),
os.path.join(definitions_path, "base_classes", f"{nxdl}.nxdl.xml"),
os.path.join(
definitions_path, "dev_tools/tests/test_nxdls", f"{nxdl}.nxdl.xml"
),
]
if nxdl in special_names:
nxdl_f_path = special_names[nxdl]
else:
nxdl_f_path = os.path.join(
definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
nxdl_f_path = os.path.join(
definitions_path, "applications", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
nxdl_f_path = os.path.join(
definitions_path, "base_classes", f"{nxdl}.nxdl.xml"
)
if not os.path.exists(nxdl_f_path):
raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.")
nxdl_f_path = next(x for x in probable_file_paths if os.path.exists(x))
# nxdl_f_path = os.path.join(
# definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# nxdl_f_path = os.path.join(
# definitions_path, "applications", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# nxdl_f_path = os.path.join(
# definitions_path, "base_classes", f"{nxdl}.nxdl.xml"
# )
# if not os.path.exists(nxdl_f_path):
# raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.")

return ET.parse(nxdl_f_path).getroot(), nxdl_f_path

Expand Down Expand Up @@ -582,7 +597,7 @@ def is_value_valid_element_of_enum(value, elist) -> Tuple[bool, list]:
"ISO8601": (str,),
"NX_BINARY": (bytes, bytearray, np.byte, np.ubyte, np.ndarray),
"NX_BOOLEAN": (bool, np.ndarray, np.bool_),
"NX_CHAR": (str, np.ndarray, np.chararray),
"NX_CHAR": (str, np.ndarray, chararray),
"NX_DATE_TIME": (str,),
"NX_FLOAT": (float, np.ndarray, np.floating),
"NX_INT": (int, np.ndarray, np.signedinteger),
Expand Down Expand Up @@ -647,6 +662,63 @@ def convert_str_to_bool_safe(value):
return None


def clean_str_attr(
    attr: Optional[Union[str, bytes]], encoding: str = "utf-8"
) -> Optional[str]:
    """
    Cleans the string attribute which means it will decode bytes to str if necessary.
    If `attr` is not str, bytes or None it raises a TypeError.

    Args:
        attr: The attribute value to normalize. May already be ``str``,
            raw ``bytes`` (as typically read from an HDF5 file), or ``None``.
        encoding: Codec used to decode ``bytes`` values. Defaults to "utf-8".

    Returns:
        The decoded (or unchanged) string, or ``None`` if ``attr`` is ``None``.

    Raises:
        TypeError: If ``attr`` is neither ``None``, ``bytes`` nor ``str``.
    """
    if attr is None:
        return attr
    if isinstance(attr, bytes):
        return attr.decode(encoding)
    if isinstance(attr, str):
        return attr

    # Bug fix: the original message was a plain string containing a literal
    # "{type}" placeholder (missing f-prefix); report the actual offending type.
    raise TypeError(
        f"Invalid type {type(attr)} for attribute. Should be either None, bytes or str."
    )


def is_valid_unit(
    unit: str, nx_category: str, transformation_type: Optional[str]
) -> bool:
    """
    Check whether the provided unit belongs to the provided nexus unit category.

    Args:
        unit (str): The unit to check. Should be according to pint.
        nx_category (str): A nexus unit category, e.g. `NX_LENGTH`,
            or derived unit category, e.g., `NX_LENGTH ** 2`.
        transformation_type (Optional[str]):
            The transformation type of an NX_TRANSFORMATION.
            This parameter is ignored if the `nx_category` is not `NX_TRANSFORMATION`.
            If `transformation_type` is not present this should be set to None.

    Returns:
        bool: The unit belongs to the provided category.
    """
    unit = clean_str_attr(unit)
    try:
        # Bug fix: the original check was `nx_category in ("NX_ANY")`, which is
        # a substring test against the string "NX_ANY" (parentheses without a
        # comma do not create a tuple), so fragments like "NX_AN" also matched.
        if nx_category == "NX_ANY":
            ureg(unit)  # Check if unit is generally valid
            return True
        # Wrap each NX_* token in brackets so pint treats it as a dimension,
        # e.g. "NX_LENGTH ** 2" -> "[NX_LENGTH] ** 2".
        nx_category = re.sub(r"(NX_[A-Z]+)", r"[\1]", nx_category)
        if nx_category == "[NX_TRANSFORMATION]":
            # NX_TRANSFORMATIONS is a pseudo unit
            # and can be either an angle, a length or unitless
            # depending on the transformation type.
            if transformation_type is None:
                return ureg(unit).check("[NX_UNITLESS]")
            if transformation_type == "translation":
                return ureg(unit).check("[NX_LENGTH]")
            if transformation_type == "rotation":
                return ureg(unit).check("[NX_ANGLE]")
            return False
        return ureg(unit).check(nx_category)
    except UndefinedUnitError:
        return False


def is_valid_data_field(value, nxdl_type, path) -> bool:
"""Checks whether a given value is valid according to what is defined in the NXDL.

Expand Down
23 changes: 17 additions & 6 deletions src/pynxtools/dataconverter/nexus_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"""

from functools import reduce
from typing import Any, List, Literal, Optional, Set, Tuple, Union
from typing import Any, List, Literal, Optional, Set, Tuple

import lxml.etree as ET
from anytree.node.nodemixin import NodeMixin
Expand Down Expand Up @@ -142,18 +142,24 @@ class NexusNode(NodeMixin):
The inverse of the above `is_a`. In the example case
`DATA` `parent_of` `my_data`.
"""

# TODO rename type to nx_type in every place
name: str
type: Literal["group", "field", "attribute", "choice"]
optionality: Literal["required", "recommended", "optional"] = "required"
name_type: Literal["any", "partial"]
variadic: bool = False
inheritance: List[ET._Element]
is_a: List["NexusNode"]
parent_of: List["NexusNode"]
occurrence_limits: Tuple[
# TODO: Use Annotated[int, Field(strict=True, ge=0)] for py>3.8
Optional[int],
Optional[int],
] = (None, None)

def _set_optionality(self):
"""
Sets the optionality of the current node
Sets the optionality of the current node based on the inheritance chain.
if `recommended`, `required` or `optional` is set.
Also sets the field to optional if `maxOccurs == 0` or to required
if `maxOccurs > 0`.
Expand All @@ -179,6 +185,7 @@ def __init__(
type: Literal["group", "field", "attribute", "choice"],
optionality: Literal["required", "recommended", "optional"] = "required",
variadic: Optional[bool] = None,
name_type: Optional[Literal["any", "partial"]] = None,
parent: Optional["NexusNode"] = None,
inheritance: Optional[List[Any]] = None,
) -> None:
Expand All @@ -187,6 +194,7 @@ def __init__(
self.type = type
self.optionality = optionality
self.variadic = contains_uppercase(self.name)
self.name_type = name_type
if variadic is not None:
self.variadic = variadic
if inheritance is not None:
Expand Down Expand Up @@ -222,6 +230,9 @@ def get_path(self) -> str:
while current_node.parent is not None:
names.insert(0, current_node.name)
current_node = current_node.parent

if self.type == "attribute" and names:
names[-1] = f"@{names[-1]}"
return "/" + "/".join(names)

def search_add_child_for_multiple(
Expand Down Expand Up @@ -793,10 +804,10 @@ def _set_items(self):
if not self.dtype == "NX_CHAR":
return
for elem in self.inheritance:
enum = elem.find(f"nx:enumeration", namespaces=namespaces)
enum = elem.find("nx:enumeration", namespaces=namespaces)
if enum is not None:
self.items = []
for items in enum.findall(f"nx:item", namespaces=namespaces):
for items in enum.findall("nx:item", namespaces=namespaces):
self.items.append(items.attrib["value"])
return

Expand All @@ -806,7 +817,7 @@ def _set_shape(self):
The first vale found is used.
"""
for elem in self.inheritance:
dimension = elem.find(f"nx:dimensions", namespaces=namespaces)
dimension = elem.find("nx:dimensions", namespaces=namespaces)
if dimension is not None:
break
if not self.inheritance or dimension is None:
Expand Down
1 change: 0 additions & 1 deletion src/pynxtools/dataconverter/readers/example/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

#
"""An example reader implementation for the DataConverter."""

import json
Expand Down
24 changes: 24 additions & 0 deletions src/pynxtools/dataconverter/units/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""A unit registry for nexus units"""

import os

from pint import UnitRegistry

# Module-wide pint registry built from the bundled `default_en.txt` instead of
# pint's default definition file; presumably that file adds the NeXus unit
# categories (e.g. [NX_LENGTH]) used for validation — confirm against the
# packaged units data files.
ureg = UnitRegistry(os.path.join(os.path.dirname(__file__), "default_en.txt"))
Loading
Loading