diff --git a/.gitignore b/.gitignore index bd139eea0..109f11c48 100644 --- a/.gitignore +++ b/.gitignore @@ -202,6 +202,8 @@ cython_debug/ !dev-requirements.txt !mkdocs-requirements.txt !src/pynxtools/nexus-version.txt +!src/pynxtools/dataconverter/units/constants_en.txt +!src/pynxtools/dataconverter/units/default_en.txt build/ nexusparser.egg-info/PKG-INFO .python-version diff --git a/MANIFEST.in b/MANIFEST.in index 7b8255ac3..69c36fbb7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,5 +9,6 @@ recursive-include src/pynxtools/definitions/applications/ *.xml recursive-include src/pynxtools/definitions/contributed_definitions/ *.xml include src/pynxtools/definitions/*.xsd include src/pynxtools/nexus-version.txt +include src/pynxtools/dataconverter/units *.txt include src/pynxtools/definitions/NXDL_VERSION diff --git a/dev-requirements.txt b/dev-requirements.txt index 27205fe2c..637c54026 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,10 +2,14 @@ # uv pip compile --extra=dev --extra=docs --output-file=dev-requirements.txt pyproject.toml anytree==2.12.1 # via pynxtools (pyproject.toml) +appdirs==1.4.4 + # via pint ase==3.23.0 # via pynxtools (pyproject.toml) babel==2.15.0 # via mkdocs-material +cachetools==5.4.0 + # via pynxtools (pyproject.toml) certifi==2024.7.4 # via requests cfgv==3.4.0 @@ -34,6 +38,10 @@ exceptiongroup==1.2.1 # via pytest filelock==3.15.4 # via virtualenv +flexcache==0.3 + # via pint +flexparser==0.3.1 + # via pint fonttools==4.53.1 # via matplotlib ghp-import==2.1.0 @@ -123,6 +131,8 @@ pathspec==0.12.1 # via mkdocs pillow==10.4.0 # via matplotlib +pint==0.24.3 + # via pynxtools (pyproject.toml) platformdirs==4.2.2 # via # mkdocs-get-deps @@ -169,7 +179,7 @@ regex==2024.5.15 # via mkdocs-material requests==2.32.3 # via mkdocs-material -ruff==0.4.8 +ruff==0.5.5 # via pynxtools (pyproject.toml) scipy==1.14.0 # via ase @@ -193,7 +203,11 @@ types-pyyaml==6.0.12.20240311 types-requests==2.32.0.20240622 # via pynxtools (pyproject.toml) typing-extensions==4.12.2 - # via mypy + # via + # flexcache + # flexparser + # mypy + # pint tzdata==2024.1 # via pandas urllib3==2.2.2 diff --git a/pyproject.toml b/pyproject.toml index 307fb5918..85478ab23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,8 @@ dependencies = [ "importlib-metadata", "lxml>=4.9.1", "anytree", + "cachetools", + "pint>=0.17", ] [project.urls] @@ -98,9 +100,11 @@ nexus_data_converter = "pynxtools.nomad.entrypoints:nexus_data_converter" read_nexus = "pynxtools.nexus.nexus:main" dataconverter = "pynxtools.dataconverter.convert:main_cli" generate_eln = "pynxtools.eln_mapper.eln_mapper:get_eln" +verify_nexus = "pynxtools.dataconverter.verify:verify" [tool.setuptools.package-data] pynxtools = ["definitions/**/*.xml", "definitions/**/*.xsd"] +"pynxtools.dataconverter.units" = ["*.txt"] "pynxtools.dataconverter.readers.hall" = ["enum_map.json"] "pynxtools.dataconverter.readers.rii_database.formula_parser" = ["dispersion_function_grammar.lark"] diff --git a/src/pynxtools/dataconverter/helpers.py b/src/pynxtools/dataconverter/helpers.py index 27f051e22..b0bbd9103 100644 --- a/src/pynxtools/dataconverter/helpers.py +++ b/src/pynxtools/dataconverter/helpers.py @@ -31,7 +31,14 @@ import numpy as np from ase.data import chemical_symbols +if np.lib.NumpyVersion(np.__version__) < "2.0.0": + from numpy import chararray +else: + from numpy.char import chararray +from pint import UndefinedUnitError + from pynxtools import get_nexus_version, get_nexus_version_hash +from pynxtools.dataconverter.units import ureg from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( get_enums, get_inherited_nodes, @@ -91,10 +98,7 @@ def _log(self, path: str, log_type: ValidationProblem, value: Optional[Any], *ar elif log_type == ValidationProblem.MissingRequiredGroup: logger.warning(f"The required group, {path}, hasn't been supplied.") elif log_type == ValidationProblem.MissingRequiredField: - logger.warning( - f"The data entry corresponding to {path} is required " - "and hasn't been supplied by the reader.", - ) + logger.warning(f"Missing field: {path}") elif log_type == ValidationProblem.InvalidType: logger.warning( f"The value at {path} should be one of: {value}" @@ -299,24 +303,35 @@ def get_nxdl_root_and_path(nxdl: str): special_names = { "NXtest": os.path.join(data_path, "NXtest.nxdl.xml"), "NXtest_extended": os.path.join(data_path, "NXtest_extended.nxdl.xml"), + "NXhdf5_validator_1": os.path.join(data_path, "NXhdf5_validator_1.nxdl.xml"), + "NXhdf5_validator_2": os.path.join(data_path, "NXhdf5_validator_2.nxdl.xml"), } + probable_file_paths = [ + os.path.join(definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml"), + os.path.join(definitions_path, "applications", f"{nxdl}.nxdl.xml"), + os.path.join(definitions_path, "base_classes", f"{nxdl}.nxdl.xml"), + os.path.join( + definitions_path, "dev_tools/tests/test_nxdls", f"{nxdl}.nxdl.xml" + ), + ] if nxdl in special_names: nxdl_f_path = special_names[nxdl] else: - nxdl_f_path = os.path.join( - definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml" - ) - if not os.path.exists(nxdl_f_path): - nxdl_f_path = os.path.join( - definitions_path, "applications", f"{nxdl}.nxdl.xml" - ) - if not os.path.exists(nxdl_f_path): - nxdl_f_path = os.path.join( - definitions_path, "base_classes", f"{nxdl}.nxdl.xml" - ) - if not os.path.exists(nxdl_f_path): - raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.") + nxdl_f_path = next(x for x in probable_file_paths if os.path.exists(x)) + # nxdl_f_path = os.path.join( + # definitions_path, "contributed_definitions", f"{nxdl}.nxdl.xml" + # ) + # if not os.path.exists(nxdl_f_path): + # nxdl_f_path = os.path.join( + # definitions_path, "applications", f"{nxdl}.nxdl.xml" + # ) + # if not os.path.exists(nxdl_f_path): + # nxdl_f_path = os.path.join( + # definitions_path, "base_classes", f"{nxdl}.nxdl.xml" + # ) + # if not os.path.exists(nxdl_f_path): + # raise FileNotFoundError(f"The nxdl file, {nxdl}, was not found.") return ET.parse(nxdl_f_path).getroot(), nxdl_f_path @@ -582,7 +597,7 @@ def is_value_valid_element_of_enum(value, elist) -> Tuple[bool, list]: "ISO8601": (str,), "NX_BINARY": (bytes, bytearray, np.byte, np.ubyte, np.ndarray), "NX_BOOLEAN": (bool, np.ndarray, np.bool_), - "NX_CHAR": (str, np.ndarray, np.chararray), + "NX_CHAR": (str, np.ndarray, chararray), "NX_DATE_TIME": (str,), "NX_FLOAT": (float, np.ndarray, np.floating), "NX_INT": (int, np.ndarray, np.signedinteger), @@ -647,6 +662,63 @@ def convert_str_to_bool_safe(value): return None +def clean_str_attr( + attr: Optional[Union[str, bytes]], encoding="utf-8" +) -> Optional[str]: + """ + Cleans the string attribute which means it will decode bytes to str if necessary. + If `attr` is not str, bytes or None it raises a TypeError. + """ + if attr is None: + return attr + if isinstance(attr, bytes): + return attr.decode(encoding) + if isinstance(attr, str): + return attr + + raise TypeError( + "Invalid type {type} for attribute. Should be either None, bytes or str." + ) + + +def is_valid_unit( + unit: str, nx_category: str, transformation_type: Optional[str] +) -> bool: + """ + The provided unit belongs to the provided nexus unit category. + Args: + unit (str): The unit to check. Should be according to pint. + nx_category (str): A nexus unit category, e.g. `NX_LENGTH`, + or derived unit category, e.g., `NX_LENGTH ** 2`. + transformation_type (Optional[str]): + The transformation type of an NX_TRANSFORMATION. + This parameter is ignored if the `nx_category` is not `NX_TRANSFORMATION`. + If `transformation_type` is not present this should be set to None. + Returns: + bool: The unit belongs to the provided category + """ + unit = clean_str_attr(unit) + try: + if nx_category in ("NX_ANY"): + ureg(unit) # Check if unit is generally valid + return True + nx_category = re.sub(r"(NX_[A-Z]+)", r"[\1]", nx_category) + if nx_category == "[NX_TRANSFORMATION]": + # NX_TRANSFORMATIONS is a pseudo unit + # and can be either an angle, a length or unitless + # depending on the transformation type. + if transformation_type is None: + return ureg(unit).check("[NX_UNITLESS]") + if transformation_type == "translation": + return ureg(unit).check("[NX_LENGTH]") + if transformation_type == "rotation": + return ureg(unit).check("[NX_ANGLE]") + return False + return ureg(unit).check(f"{nx_category}") + except UndefinedUnitError: + return False + + def is_valid_data_field(value, nxdl_type, path) -> bool: """Checks whether a given value is valid according to what is defined in the NXDL. diff --git a/src/pynxtools/dataconverter/nexus_tree.py b/src/pynxtools/dataconverter/nexus_tree.py index bbba22c09..24cb91753 100644 --- a/src/pynxtools/dataconverter/nexus_tree.py +++ b/src/pynxtools/dataconverter/nexus_tree.py @@ -29,7 +29,7 @@ """ from functools import reduce -from typing import Any, List, Literal, Optional, Set, Tuple, Union +from typing import Any, List, Literal, Optional, Set, Tuple import lxml.etree as ET from anytree.node.nodemixin import NodeMixin @@ -142,18 +142,24 @@ class NexusNode(NodeMixin): The inverse of the above `is_a`. In the example case `DATA` `parent_of` `my_data`. """ - + # TODO rename type to nx_type in every place name: str type: Literal["group", "field", "attribute", "choice"] optionality: Literal["required", "recommended", "optional"] = "required" + name_type: Literal["any", "partial"] variadic: bool = False inheritance: List[ET._Element] is_a: List["NexusNode"] parent_of: List["NexusNode"] + occurrence_limits: Tuple[ + # TODO: Use Annotated[int, Field(strict=True, ge=0)] for py>3.8 + Optional[int], + Optional[int], + ] = (None, None) def _set_optionality(self): """ - Sets the optionality of the current node + Sets the optionality of the current node based on the inheritance chain. if `recommended`, `required` or `optional` is set. Also sets the field to optional if `maxOccurs == 0` or to required if `maxOccurs > 0`. @@ -179,6 +185,7 @@ def __init__( type: Literal["group", "field", "attribute", "choice"], optionality: Literal["required", "recommended", "optional"] = "required", variadic: Optional[bool] = None, + name_type: Optional[Literal["any", "partial"]] = None, parent: Optional["NexusNode"] = None, inheritance: Optional[List[Any]] = None, ) -> None: @@ -187,6 +194,7 @@ def __init__( self.type = type self.optionality = optionality self.variadic = contains_uppercase(self.name) + self.name_type = name_type if variadic is not None: self.variadic = variadic if inheritance is not None: @@ -222,6 +230,9 @@ def get_path(self) -> str: while current_node.parent is not None: names.insert(0, current_node.name) current_node = current_node.parent + + if self.type == "attribute" and names: + names[-1] = f"@{names[-1]}" return "/" + "/".join(names) def search_add_child_for_multiple( @@ -793,10 +804,10 @@ def _set_items(self): if not self.dtype == "NX_CHAR": return for elem in self.inheritance: - enum = elem.find(f"nx:enumeration", namespaces=namespaces) + enum = elem.find("nx:enumeration", namespaces=namespaces) if enum is not None: self.items = [] - for items in enum.findall(f"nx:item", namespaces=namespaces): + for items in enum.findall("nx:item", namespaces=namespaces): self.items.append(items.attrib["value"]) return @@ -806,7 +817,7 @@ def _set_shape(self): The first vale found is used. """ for elem in self.inheritance: - dimension = elem.find(f"nx:dimensions", namespaces=namespaces) + dimension = elem.find("nx:dimensions", namespaces=namespaces) if dimension is not None: break if not self.inheritance or dimension is None: diff --git a/src/pynxtools/dataconverter/readers/example/reader.py b/src/pynxtools/dataconverter/readers/example/reader.py index 34d0aae1c..3e3fd09af 100644 --- a/src/pynxtools/dataconverter/readers/example/reader.py +++ b/src/pynxtools/dataconverter/readers/example/reader.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# """An example reader implementation for the DataConverter.""" import json diff --git a/src/pynxtools/dataconverter/units/__init__.py b/src/pynxtools/dataconverter/units/__init__.py new file mode 100644 index 000000000..90230a882 --- /dev/null +++ b/src/pynxtools/dataconverter/units/__init__.py @@ -0,0 +1,24 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""A unit registry for nexus units""" + +import os + +from pint import UnitRegistry + +ureg = UnitRegistry(os.path.join(os.path.dirname(__file__), "default_en.txt")) diff --git a/src/pynxtools/dataconverter/units/constants_en.txt b/src/pynxtools/dataconverter/units/constants_en.txt new file mode 100644 index 000000000..6ec8d2dbc --- /dev/null +++ b/src/pynxtools/dataconverter/units/constants_en.txt @@ -0,0 +1,72 @@ +# Default Pint constants definition file +# Based on the International System of Units +# Language: english +# Source: https://physics.nist.gov/cuu/Constants/ +# https://physics.nist.gov/PhysRefData/XrayTrans/Html/search.html +# :copyright: 2013,2019 by Pint Authors, see AUTHORS for more details. + +#### MATHEMATICAL CONSTANTS #### +# As computed by Maxima with fpprec:50 + +pi = 3.1415926535897932384626433832795028841971693993751 = π # pi +tansec = 4.8481368111333441675396429478852851658848753880815e-6 # tangent of 1 arc-second ~ arc_second/radian +ln10 = 2.3025850929940456840179914546843642076011014886288 # natural logarithm of 10 +wien_x = 4.9651142317442763036987591313228939440555849867973 # solution to (x-5)*exp(x)+5 = 0 => x = W(5/exp(5))+5 +wien_u = 2.8214393721220788934031913302944851953458817440731 # solution to (u-3)*exp(u)+3 = 0 => u = W(3/exp(3))+3 + +#### DEFINED EXACT CONSTANTS #### + +speed_of_light = 299792458 m/s = c = c_0 # since 1983 +planck_constant = 6.62607015e-34 J s = h # since May 2019 +elementary_charge = 1.602176634e-19 C = e # since May 2019 +avogadro_number = 6.02214076e23 # since May 2019 +boltzmann_constant = 1.380649e-23 J K^-1 = k = k_B # since May 2019 +standard_gravity = 9.80665 m/s^2 = g_0 = g0 = g_n = gravity # since 1901 +standard_atmosphere = 1.01325e5 Pa = atm = atmosphere # since 1954 +conventional_josephson_constant = 4.835979e14 Hz / V = K_J90 # since Jan 1990 +conventional_von_klitzing_constant = 2.5812807e4 ohm = R_K90 # since Jan 1990 + +#### DERIVED EXACT CONSTANTS #### +# Floating-point conversion may introduce inaccuracies + +zeta = c / (cm/s) = ζ +dirac_constant = h / (2 * π) = ħ = h_bar = atomic_unit_of_action = a_u_action +avogadro_constant = avogadro_number * mol^-1 = N_A +molar_gas_constant = k * N_A = R +faraday_constant = e * N_A +conductance_quantum = 2 * e ** 2 / h = G_0 +magnetic_flux_quantum = h / (2 * e) = Φ_0 = Phi_0 +josephson_constant = 2 * e / h = K_J +von_klitzing_constant = h / e ** 2 = R_K +stefan_boltzmann_constant = 2 / 15 * π ** 5 * k ** 4 / (h ** 3 * c ** 2) = σ = sigma +first_radiation_constant = 2 * π * h * c ** 2 = c_1 +second_radiation_constant = h * c / k = c_2 +wien_wavelength_displacement_law_constant = h * c / (k * wien_x) +wien_frequency_displacement_law_constant = wien_u * k / h + +#### MEASURED CONSTANTS #### +# Recommended CODATA-2018 values +# To some extent, what is measured and what is derived is a bit arbitrary. +# The choice of measured constants is based on convenience and on available uncertainty. +# The uncertainty in the last significant digits is given in parentheses as a comment. + +newtonian_constant_of_gravitation = 6.67430e-11 m^3/(kg s^2) = _ = gravitational_constant # (15) +rydberg_constant = 1.0973731568160e7 * m^-1 = R_∞ = R_inf # (21) +electron_g_factor = -2.00231930436256 = g_e # (35) +atomic_mass_constant = 1.66053906660e-27 kg = m_u # (50) +electron_mass = 9.1093837015e-31 kg = m_e = atomic_unit_of_mass = a_u_mass # (28) +proton_mass = 1.67262192369e-27 kg = m_p # (51) +neutron_mass = 1.67492749804e-27 kg = m_n # (95) +K_alpha_Cu_d_220 = 0.80232719 # (22) +K_alpha_Mo_d_220 = 0.36940604 # (19) +K_alpha_W_d_220 = 0.108852175 # (98) + +#### DERIVED CONSTANTS #### + +fine_structure_constant = (2 * h * R_inf / (m_e * c)) ** 0.5 = α = alpha +vacuum_permeability = 2 * α * h / (e ** 2 * c) = µ_0 = mu_0 = mu0 = magnetic_constant +vacuum_permittivity = e ** 2 / (2 * α * h * c) = ε_0 = epsilon_0 = eps_0 = eps0 = electric_constant +impedance_of_free_space = 2 * α * h / e ** 2 = Z_0 = characteristic_impedance_of_vacuum +coulomb_constant = α * h_bar * c / e ** 2 = k_C +classical_electron_radius = α * h_bar / (m_e * c) = r_e +thomson_cross_section = 8 / 3 * π * r_e ** 2 = σ_e = sigma_e diff --git a/src/pynxtools/dataconverter/units/default_en.txt b/src/pynxtools/dataconverter/units/default_en.txt new file mode 100644 index 000000000..dc9c58685 --- /dev/null +++ b/src/pynxtools/dataconverter/units/default_en.txt @@ -0,0 +1,634 @@ +# Default Pint units definition file +# Based on the International System of Units +# Language: english +# :copyright: 2013,2019 by Pint Authors, see AUTHORS for more details. + +# Syntax +# ====== +# Units +# ----- +# = [= ] [= ] [ = ] [...] +# +# The canonical name and aliases should be expressed in singular form. +# Pint automatically deals with plurals built by adding 's' to the singular form; plural +# forms that don't follow this rule should be instead explicitly listed as aliases. +# +# If a unit has no symbol and one wants to define aliases, then the symbol should be +# conventionally set to _. +# +# Example: +# millennium = 1e3 * year = _ = millennia +# +# +# Prefixes +# -------- +# - = [= ] [= ] [ = ] [...] +# +# Example: +# deca- = 1e+1 = da- = deka- +# +# +# Derived dimensions +# ------------------ +# [dimension name] = +# +# Example: +# [density] = [mass] / [volume] +# +# Note that primary dimensions don't need to be declared; they can be +# defined for the first time in a unit definition. +# E.g. see below `meter = [length]` +# +# +# Additional aliases +# ------------------ +# @alias = [ = ] [...] +# +# Used to add aliases to already existing unit definitions. +# Particularly useful when one wants to enrich definitions +# from defaults_en.txt with custom aliases. +# +# Example: +# @alias meter = my_meter + +# See also: https://pint.readthedocs.io/en/latest/defining.html + +@defaults + group = international + system = mks +@end + + +#### PREFIXES #### + +# decimal prefixes +yocto- = 1e-24 = y- +zepto- = 1e-21 = z- +atto- = 1e-18 = a- +femto- = 1e-15 = f- +pico- = 1e-12 = p- +nano- = 1e-9 = n- +micro- = 1e-6 = µ- = u- +milli- = 1e-3 = m- +centi- = 1e-2 = c- +deci- = 1e-1 = d- +deca- = 1e+1 = da- = deka- +hecto- = 1e2 = h- +kilo- = 1e3 = k- +mega- = 1e6 = M- +giga- = 1e9 = G- +tera- = 1e12 = T- +peta- = 1e15 = P- +exa- = 1e18 = E- +zetta- = 1e21 = Z- +yotta- = 1e24 = Y- + +# binary_prefixes +kibi- = 2**10 = Ki- +mebi- = 2**20 = Mi- +gibi- = 2**30 = Gi- +tebi- = 2**40 = Ti- +pebi- = 2**50 = Pi- +exbi- = 2**60 = Ei- +zebi- = 2**70 = Zi- +yobi- = 2**80 = Yi- + +#### BASE UNITS #### + +meter = [length] = m = metre +second = [time] = s = sec +ampere = [current] = A = amp +candela = [luminosity] = cd = candle +gram = [mass] = g +mole = [substance] = mol +kelvin = [temperature]; offset: 0 = K = degK = °K = degree_Kelvin = degreeK # older names supported for compatibility +radian = [angle] = rad +bit = [information] +pixel = [digital_image_resolution] = px = pel + +#### NEXUS UNIT CATEGORIES #### +[NX_ANGLE] = [angle] +[NX_ANY] = [] +[NX_AREA] = [area] +[NX_CHARGE] = [charge] +[NX_COUNT] = [] +[NX_CROSS_SECTION] = [area] +[NX_CURRENT] = [current] +[NX_DIMENSIONLESS] = [] +[NX_EMITTANCE] = [length] * [angle] +[NX_ENERGY] = [energy] +[NX_FLUX] = 1 / [time] / [length] ** 2 +[NX_FREQUENCY] = [frequency] +[NX_LENGTH] = [length] +[NX_MASS] = [mass] +[NX_MASS_DENSITY] = [mass] / [length] ** 3 +[NX_MOLECULAR_WEIGHT] = [mass] / [substance] +[NX_PERIOD] = [time] +[NX_PER_AREA] = 1 / [length] ** 2 +[NX_PER_LENGTH] = 1 / [length] +[NX_POWER] = [power] +[NX_PRESSURE] = [pressure] +[NX_PULSES] = [] +[NX_SCATTERING_LENGTH_DENSITY] = 1 / [length] ** 2 +[NX_SOLID_ANGLE] = [angle] ** 2 +[NX_TEMPERATURE] = [temperature] +[NX_TIME] = [time] +[NX_TIME_OF_FLIGHT] = [time] +[NX_UNITLESS] = [] +[NX_VOLTAGE] = [electric_potential] +[NX_VOLUME] = [length] ** 3 +[NX_WAVELENGTH] = [length] +[NX_WAVENUMBER] = [wavenumber] + + +#### CONSTANTS #### + +@import constants_en.txt + + +#### UNITS #### +# Common and less common, grouped by quantity. +# Conversion factors are exact (except when noted), +# although floating-point conversion may introduce inaccuracies + +# Unitless +counts = [] + +# Angle +degree = π / 180 * radian = ° = deg = arcdeg = arcdegree = angular_degree +arcminute = degree / 60 = arcmin = arc_minute = angular_minute +arcsecond = arcminute / 60 = arcsec = arc_second = angular_second +milliarcsecond = 1e-3 * arcsecond = mas +grade = π / 200 * radian = grad = gon +mil = π / 32000 * radian + +# Solid angle +steradian = radian ** 2 = sr +square_degree = (π / 180) ** 2 * sr = sq_deg = sqdeg + +# Information +byte = 8 * bit = B = octet +baud = bit / second = Bd = bps + +# Length +angstrom = 1e-10 * meter = Å = ångström = Å +micron = micrometer = µ +fermi = femtometer +atomic_unit_of_length = h_bar / (alpha * m_e * c) = bohr = a_0 = a0 = bohr_radius = a_u_length +planck_length = (h_bar * gravitational_constant / c ** 3) ** 0.5 + +# Mass +metric_ton = 1e3 * kilogram = tonne +unified_atomic_mass_unit = atomic_mass_constant = u = amu +dalton = atomic_mass_constant = Da +grain = 64.79891 * milligram = gr +gamma_mass = microgram +carat = 200 * milligram = ct = karat +planck_mass = (h_bar * c / gravitational_constant) ** 0.5 + +# Time +minute = 60 * second +hour = 60 * minute = hr +atomic_unit_of_time = h_bar / E_h = a_u_time +planck_time = (h_bar * gravitational_constant / c ** 5) ** 0.5 + +# Temperature +degree_Celsius = kelvin; offset: 273.15 = °C = celsius = degC = degreeC +degree_Rankine = 5 / 9 * kelvin; offset: 0 = °R = rankine = degR = degreeR +degree_Fahrenheit = 5 / 9 * kelvin; offset: 233.15 + 200 / 9 = °F = fahrenheit = degF = degreeF +degree_Reaumur = 4 / 5 * kelvin; offset: 273.15 = °Re = reaumur = degRe = degreeRe = degree_Réaumur = réaumur +atomic_unit_of_temperature = E_h / k = a_u_temp +planck_temperature = (h_bar * c ** 5 / gravitational_constant / k ** 2) ** 0.5 + +# Area +[area] = [length] ** 2 +barn = 1e-28 * meter ** 2 = b +darcy = centipoise * centimeter ** 2 / (second * atmosphere) + +# Volume +[volume] = [length] ** 3 +liter = decimeter ** 3 = l = L = litre +lambda = microliter = λ + +# Frequency +[frequency] = 1 / [time] +hertz = 1 / second = Hz +revolutions_per_minute = 1 / minute = rpm +revolutions_per_second = 1 / second = rps +counts_per_second = 1 / second = cps + +# Wavenumber +[wavenumber] = 1 / [length] +reciprocal_centimeter = 1 / cm = cm_1 = kayser + +# Speed +[speed] = [length] / [time] +mile_per_hour = mile / hour = mph = MPH +kilometer_per_hour = kilometer / hour = kph = KPH +kilometer_per_second = kilometer / second = kps +meter_per_second = meter / second = mps +foot_per_second = foot / second = fps + +# Acceleration +[acceleration] = [speed] / [time] +galileo = centimeter / second ** 2 = Gal + +# Force +[force] = [mass] * [acceleration] +newton = kilogram * meter / second ** 2 = N +dyne = gram * centimeter / second ** 2 = dyn +force_kilogram = g_0 * kilogram = pond +force_gram = g_0 * gram = gf = gram_force +force_metric_ton = g_0 * metric_ton = tf = metric_ton_force = force_t = t_force +atomic_unit_of_force = E_h / a_0 = a_u_force + +# Energy +[energy] = [force] * [length] +joule = newton * meter = J +erg = dyne * centimeter +watt_hour = watt * hour = Wh = watthour +electron_volt = e * volt = eV +rydberg = h * c * R_inf = Ry +hartree = 2 * rydberg = Ha = E_h = hartree_energy = atomic_unit_of_energy = a_u_energy +calorie = 4.184 * joule = cal = thermochemical_calorie = cal_th +international_calorie = 4.1868 * joule = cal_it = international_steam_table_calorie +fifteen_degree_calorie = 4.1855 * joule = cal_15 + +# Power +[power] = [energy] / [time] +watt = joule / second = W +volt_ampere = volt * ampere = VA +horsepower = 550 * foot * force_pound / second = hp = UK_horsepower = hydraulic_horsepower +metric_horsepower = 75 * force_kilogram * meter / second +electrical_horsepower = 746 * watt +standard_liter_per_minute = atmosphere * liter / minute = slpm = slm +conventional_watt_90 = K_J90 ** 2 * R_K90 / (K_J ** 2 * R_K) * watt = W_90 + +# Momentum +[momentum] = [length] * [mass] / [time] + +# Density (as auxiliary for pressure) +[density] = [mass] / [volume] +mercury = 13.5951 * kilogram / liter = Hg = Hg_0C = Hg_32F = conventional_mercury +water = 1.0 * kilogram / liter = H2O = conventional_water +mercury_60F = 13.5568 * kilogram / liter = Hg_60F # approximate +water_39F = 0.999972 * kilogram / liter = water_4C # approximate +water_60F = 0.999001 * kilogram / liter # approximate + +# Pressure +[pressure] = [force] / [area] +pascal = newton / meter ** 2 = Pa +barye = dyne / centimeter ** 2 = Ba = barie = barad = barrie = baryd +bar = 1e5 * pascal +torr = atm / 760 +pound_force_per_square_inch = force_pound / inch ** 2 = psi +kip_per_square_inch = kip / inch ** 2 = ksi +millimeter_Hg = millimeter * Hg * g_0 = mmHg = mm_Hg = millimeter_Hg_0C +centimeter_Hg = centimeter * Hg * g_0 = cmHg = cm_Hg = centimeter_Hg_0C +inch_Hg = inch * Hg * g_0 = inHg = in_Hg = inch_Hg_32F +inch_Hg_60F = inch * Hg_60F * g_0 +inch_H2O_39F = inch * water_39F * g_0 +inch_H2O_60F = inch * water_60F * g_0 +foot_H2O = foot * water * g_0 = ftH2O = feet_H2O +centimeter_H2O = centimeter * water * g_0 = cmH2O = cm_H2O +atomic_unit_of_pressure = E_h / bohr_radius ** 3 = a_u_pressure + +# Viscosity +[viscosity] = [pressure] * [time] +poise = 0.1 * Pa * second = P +reyn = psi * second + +# Kinematic viscosity +[kinematic_viscosity] = [area] / [time] +stokes = centimeter ** 2 / second = St + +# Fluidity +[fluidity] = 1 / [viscosity] +rhe = 1 / poise + +# Amount of substance +particle = 1 / N_A = _ = molec = molecule + +# Concentration +[concentration] = [substance] / [volume] +molar = mole / liter = M + +# Catalytic activity +[activity] = [substance] / [time] +katal = mole / second = kat +enzyme_unit = micromole / minute = U = enzymeunit + +# Entropy +[entropy] = [energy] / [temperature] +clausius = calorie / kelvin = Cl + +# Molar entropy +[molar_entropy] = [entropy] / [substance] +entropy_unit = calorie / kelvin / mole = eu + +# Radiation +becquerel = counts_per_second = Bq +curie = 3.7e10 * becquerel = Ci +rutherford = 1e6 * becquerel = Rd +gray = joule / kilogram = Gy +sievert = joule / kilogram = Sv +rem = 0.01 * sievert +roentgen = 2.58e-4 * coulomb / kilogram = _ = röntgen # approximate, depends on medium + +# Luminance +[luminance] = [luminosity] / [area] +nit = candela / meter ** 2 +stilb = candela / centimeter ** 2 +lambert = 1 / π * candela / centimeter ** 2 + +# Luminous flux +[luminous_flux] = [luminosity] * [angle] ** 2 +lumen = candela * steradian = lm + +# Illuminance +[illuminance] = [luminous_flux] / [area] +lux = lumen / meter ** 2 = lx + +# Intensity +[intensity] = [power] / [area] +atomic_unit_of_intensity = 0.5 * ε_0 * c * atomic_unit_of_electric_field ** 2 = a_u_intensity + +# Current +biot = 10 * ampere = Bi +abampere = biot = abA +atomic_unit_of_current = e / atomic_unit_of_time = a_u_current +mean_international_ampere = mean_international_volt / mean_international_ohm = A_it +US_international_ampere = US_international_volt / US_international_ohm = A_US +conventional_ampere_90 = K_J90 * R_K90 / (K_J * R_K) * ampere = A_90 +planck_current = (c ** 6 / gravitational_constant / k_C) ** 0.5 + +# Charge +[charge] = [current] * [time] +coulomb = ampere * second = C +abcoulomb = 10 * C = abC +faraday = e * N_A * mole +conventional_coulomb_90 = K_J90 * R_K90 / (K_J * R_K) * coulomb = C_90 + +# Electric potential +[electric_potential] = [energy] / [charge] +volt = joule / coulomb = V +abvolt = 1e-8 * volt = abV +mean_international_volt = 1.00034 * volt = V_it # approximate +US_international_volt = 1.00033 * volt = V_US # approximate +conventional_volt_90 = K_J90 / K_J * volt = V_90 + +# Electric field +[electric_field] = [electric_potential] / [length] +atomic_unit_of_electric_field = e * k_C / a_0 ** 2 = a_u_electric_field + +# Electric displacement field +[electric_displacement_field] = [charge] / [area] + +# Resistance +[resistance] = [electric_potential] / [current] +ohm = volt / ampere = Ω +abohm = 1e-9 * ohm = abΩ +mean_international_ohm = 1.00049 * ohm = Ω_it = ohm_it # approximate +US_international_ohm = 1.000495 * ohm = Ω_US = ohm_US # approximate +conventional_ohm_90 = R_K / R_K90 * ohm = Ω_90 = ohm_90 + +# Resistivity +[resistivity] = [resistance] * [length] + +# Conductance +[conductance] = [current] / [electric_potential] +siemens = ampere / volt = S = mho +absiemens = 1e9 * siemens = abS = abmho + +# Capacitance +[capacitance] = [charge] / [electric_potential] +farad = coulomb / volt = F +abfarad = 1e9 * farad = abF +conventional_farad_90 = R_K90 / R_K * farad = F_90 + +# Inductance +[inductance] = [magnetic_flux] / [current] +henry = weber / ampere = H +abhenry = 1e-9 * henry = abH +conventional_henry_90 = R_K / R_K90 * henry = H_90 + +# Magnetic flux +[magnetic_flux] = [electric_potential] * [time] +weber = volt * second = Wb +unit_pole = µ_0 * biot * centimeter + +# Magnetic field +[magnetic_field] = [magnetic_flux] / [area] +tesla = weber / meter ** 2 = T +gamma = 1e-9 * tesla = γ +gauss = 1e-4 * tesla = G + +# Magnetic field strength +[magnetic_field_strength] = [current] / [length] + +# Electric dipole moment +[electric_dipole] = [charge] * [length] +debye = 1e-9 / ζ * coulomb * angstrom = D # formally 1 D = 1e-10 Fr*Å, but we generally want to use it outside the Gaussian context + +# Electric quadrupole moment +[electric_quadrupole] = [charge] * [area] +buckingham = debye * angstrom + +# Magnetic dipole moment +[magnetic_dipole] = [current] * [area] +bohr_magneton = e * h_bar / (2 * m_e) = µ_B = mu_B +nuclear_magneton = e * h_bar / (2 * m_p) = µ_N = mu_N + +# Pixel density +[pixel_density] = [digital_image_resolution] / [length] +pixels_per_inch = px / inch = PPI = ppi +pixels_per_centimeter = px / cm = PPCM = ppcm + +#### UNIT GROUPS #### +# Mostly for length, area, volume, mass, force +# (customary or specialized units) + +@group USCSLengthInternational + thou = 1e-3 * inch = th = mil_length + inch = yard / 36 = in = international_inch = inches = international_inches + hand = 4 * inch + foot = yard / 3 = ft = international_foot = feet = international_feet + yard = 0.9144 * meter = yd = international_yard # since Jul 1959 + mile = 1760 * yard = mi = international_mile + + square_inch = inch ** 2 = sq_in = square_inches + square_foot = foot ** 2 = sq_ft = square_feet + square_yard = yard ** 2 = sq_yd + square_mile = mile ** 2 = sq_mi + + cubic_inch = in ** 3 = cu_in + cubic_foot = ft ** 3 = cu_ft = cubic_feet + cubic_yard = yd ** 3 = cu_yd +@end + +@group USCSLengthSurvey + link = 1e-2 * chain = li = survey_link + survey_foot = 1200 / 3937 * meter = sft + fathom = 6 * survey_foot + rod = 16.5 * survey_foot = rd = pole = perch + chain = 4 * rod + furlong = 40 * rod = fur + cables_length = 120 * fathom + survey_mile = 5280 * survey_foot = smi = us_statute_mile + league = 3 * survey_mile + + square_rod = rod ** 2 = sq_rod = sq_pole = sq_perch + acre = 10 * chain ** 2 + square_survey_mile = survey_mile ** 2 = _ = section + square_league = league ** 2 + + acre_foot = acre * survey_foot = _ = acre_feet +@end + +@group USCSLiquidVolume + minim = pint / 7680 + fluid_dram = pint / 128 = fldr = fluidram = US_fluid_dram = US_liquid_dram + fluid_ounce = pint / 16 = floz = US_fluid_ounce = US_liquid_ounce + gill = pint / 4 = gi = liquid_gill = US_liquid_gill + pint = quart / 2 = liquid_pint = US_pint + fifth = gallon / 5 = _ = US_liquid_fifth + quart = gallon / 4 = qt = liquid_quart = US_liquid_quart + gallon = 231 * cubic_inch = gal = liquid_gallon = US_liquid_gallon +@end + +@group Avoirdupois + dram = pound / 256 = dr = avoirdupois_dram = avdp_dram = drachm + ounce = pound / 16 = oz = avoirdupois_ounce = avdp_ounce + pound = 7e3 * grain = lb = avoirdupois_pound = avdp_pound + stone = 14 * pound + quarter = 28 * stone + bag = 94 * pound + hundredweight = 100 * pound = cwt = short_hundredweight + long_hundredweight = 112 * pound + ton = 2e3 * pound = _ = short_ton + long_ton = 2240 * pound + slug = g_0 * pound * second ** 2 / foot + slinch = g_0 * pound * second ** 2 / inch = blob = slugette + + force_ounce = g_0 * ounce = ozf = ounce_force + force_pound = g_0 * pound = lbf = pound_force + force_ton = g_0 * ton = _ = ton_force = force_short_ton = short_ton_force + force_long_ton = g_0 * long_ton = _ = long_ton_force + kip = 1e3 * force_pound + poundal = pound * foot / second ** 2 = pdl +@end + +@group AvoirdupoisUK using Avoirdupois + UK_hundredweight = long_hundredweight = UK_cwt + UK_ton = long_ton + UK_force_ton = force_long_ton = _ = UK_ton_force +@end + +@group AvoirdupoisUS using Avoirdupois + US_hundredweight = hundredweight = US_cwt + US_ton = ton + US_force_ton = force_ton = _ = US_ton_force +@end + +@group Troy + pennyweight = 24 * grain = dwt + troy_ounce = 480 * grain = toz = ozt + troy_pound = 12 * troy_ounce = tlb = lbt +@end + +@group ImperialVolume + imperial_minim = imperial_fluid_ounce / 480 + imperial_fluid_scruple = imperial_fluid_ounce / 24 + imperial_fluid_drachm = imperial_fluid_ounce / 8 = imperial_fldr = imperial_fluid_dram + imperial_fluid_ounce = imperial_pint / 20 = imperial_floz = UK_fluid_ounce + imperial_gill = imperial_pint / 4 = imperial_gi = UK_gill + imperial_cup = imperial_pint / 2 = imperial_cp = UK_cup + imperial_pint = imperial_gallon / 8 = imperial_pt = UK_pint + imperial_quart = imperial_gallon / 4 = imperial_qt = UK_quart + imperial_gallon = 4.54609 * liter = imperial_gal = UK_gallon + imperial_peck = 2 * imperial_gallon = imperial_pk = UK_pk + imperial_bushel = 8 * imperial_gallon = imperial_bu = UK_bushel + imperial_barrel = 36 * imperial_gallon = imperial_bbl = UK_bbl +@end + + +#### CONVERSION CONTEXTS #### + +@context(n=1) spectroscopy = sp + # n index of refraction of the medium. + [length] <-> [frequency]: speed_of_light / n / value + [frequency] -> [energy]: planck_constant * value + [energy] -> [frequency]: value / planck_constant + # allow wavenumber / kayser + [wavenumber] <-> [length]: 1 / value +@end + +@context boltzmann + [temperature] -> [energy]: boltzmann_constant * value + [energy] -> [temperature]: value / boltzmann_constant +@end + +@context energy + [energy] -> [energy] / [substance]: value * N_A + [energy] / [substance] -> [energy]: value / N_A + [energy] -> [mass]: value / c ** 2 + [mass] -> [energy]: value * c ** 2 +@end + +@context(mw=0,volume=0,solvent_mass=0) chemistry = chem + # mw is the molecular weight of the species + # volume is the volume of the solution + # solvent_mass is the mass of solvent in the solution + + # moles -> mass require the molecular weight + [substance] -> [mass]: value * mw + [mass] -> [substance]: value / mw + + # moles/volume -> mass/volume and moles/mass -> mass/mass + # require the molecular weight + [substance] / [volume] -> [mass] / [volume]: value * mw + [mass] / [volume] -> [substance] / [volume]: value / mw + [substance] / [mass] -> [mass] / [mass]: value * mw + [mass] / [mass] -> [substance] / [mass]: value / mw + + # moles/volume -> moles requires the solution volume + [substance] / [volume] -> [substance]: value * volume + [substance] -> [substance] / [volume]: value / volume + + # moles/mass -> moles requires the solvent (usually water) mass + [substance] / [mass] -> [substance]: value * solvent_mass + [substance] -> [substance] / [mass]: value / solvent_mass + + # moles/mass -> moles/volume require the solvent mass and the volume + [substance] / [mass] -> [substance]/[volume]: value * solvent_mass / volume + [substance] / [volume] -> [substance] / [mass]: value / solvent_mass * volume + +@end + + +#### SYSTEMS OF UNITS #### + +@system SI + second + meter + kilogram + ampere + kelvin + mole + candela +@end + +@system mks using international + meter + kilogram + second +@end + +@system atomic using international + # based on unit m_e, e, h_bar, k_C, k + bohr: meter + electron_mass: gram + atomic_unit_of_time: second + atomic_unit_of_current: ampere + atomic_unit_of_temperature: kelvin +@end \ No newline at end of file diff --git a/src/pynxtools/dataconverter/validation.py b/src/pynxtools/dataconverter/validation.py index 63a715f6a..7e03cf1ae 100644 --- a/src/pynxtools/dataconverter/validation.py +++ b/src/pynxtools/dataconverter/validation.py @@ -20,11 +20,13 @@ from collections import defaultdict from functools import reduce from operator import getitem -from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union import h5py import lxml.etree as ET import numpy as np +from cachetools import LRUCache, cached +from cachetools.keys import hashkey from pynxtools.dataconverter.helpers import ( Collector, @@ -32,6 +34,7 @@ collector, convert_nexus_to_caps, is_valid_data_field, + is_valid_unit, ) from pynxtools.dataconverter.nexus_tree import ( NexusEntity, @@ -42,19 +45,145 @@ from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit -def validate_hdf_group_against(appdef: str, data: h5py.Group): +def best_namefit_of_(name: str, concepts: Set[str]) -> str: + if not concepts: + return None + + if name in concepts: + return name + + best_match, score = max( + map(lambda x: (x, get_nx_namefit(name, x)), concepts), key=lambda x: x[1] + ) + if score < 0: + return None + + return best_match + + +def validate_hdf_group_against(appdef: str, data: h5py.Group) -> bool: """ - Checks whether all the required paths from the template are returned in data dict. + Validate an HDF5 group against the Nexus tree for the application definition `appdef`. - THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET! + Args: + appdef (str): The application definition to validate against. + data (h5py.Group): The h5py group to validate. + + Returns: + bool: True if the group is valid according to `appdef`, False otherwise. """ - def validate(name: str, data: Union[h5py.Group, h5py.Dataset]): + # Only cache based on path. That way we retain the nx_class information + # in the tree + # Allow for 10000 cache entries. This should be enough for most cases + @cached( + cache=LRUCache(maxsize=10000), + key=lambda path, node_type=None, nx_class=None: hashkey(path), + ) + def find_node_for( + path: str, node_type: Optional[str] = None, nx_class: Optional[str] = None + ) -> Optional[NexusNode]: + if path == "": + return tree + + *prev_path, last_elem = path.rsplit("/", 1) + node = find_node_for(prev_path[0]) if prev_path else tree + + if node is None: + return None + + best_child = best_namefit_of_( + last_elem, + node.get_all_direct_children_names(nx_class=nx_class, node_type=node_type), + ) + if best_child is None: + return None + + return node.search_add_child_for(best_child) + + def remove_from_req_fields(path: str): + if path in required_fields: + required_fields.remove(path) + + def handle_group(path: str, data: h5py.Group): + node = find_node_for( + path, node_type="group", nx_class=data.attrs.get("NX_class") + ) + if node is None: + collector.collect_and_log( + path, ValidationProblem.MissingDocumentation, None + ) + return + + # TODO: Do actual group checks + + def handle_field(path: str, data: h5py.Dataset): + node = find_node_for(path, node_type="field") + if node is None: + collector.collect_and_log( + path, ValidationProblem.MissingDocumentation, None + ) + return + remove_from_req_fields(node.get_path()) + is_valid_data_field(data[()], node.dtype, path) + + units = data.attrs.get("units") + if node.unit is not None: + if units is None: + collector.collect_and_log( + f"{path}/@units", ValidationProblem.MissingUnit, node.unit + ) + return + remove_from_req_fields(f"{node.get_path()}/@units") + is_valid_unit(units, node.unit, None) + elif units is not None: + collector.collect_and_log( + f"{entry_name}/{path}/@units", + ValidationProblem.MissingDocumentation, + path, + ) + + def handle_attributes(path: str, attrs: h5py.AttributeManager): + for attr_name in attrs: + if attr_name in ("NX_class", "units"): + # Ignore special attrs + continue + + node = find_node_for(f"{path}/{attr_name}", node_type="attribute") + if node is None: + collector.collect_and_log( + f"{path}/@{attr_name}", ValidationProblem.MissingDocumentation, None + ) + continue + remove_from_req_fields(node.get_path()) + is_valid_data_field(attrs.get(attr_name), node.dtype, node.get_path()) + + def validate(path: str, data: Union[h5py.Group, h5py.Dataset]): # Namefit name against tree (use recursive caching) - pass + if isinstance(data, h5py.Group): + handle_group(path, data) + elif isinstance(data, h5py.Dataset): + handle_field(path, data) - tree = generate_tree_from(appdef) - data.visitems(validate) + handle_attributes(path, data.attrs) + + appdef = generate_tree_from(appdef) + required_fields = appdef.required_fields_and_attrs_names() + tree = appdef.search_add_child_for("ENTRY") + entry_name = data.name + data.visititems(validate) + + for req_field in required_fields: + if "@" in req_field: + collector.collect_and_log( + req_field, ValidationProblem.MissingRequiredAttribute, None + ) + continue + collector.collect_and_log( + req_field, ValidationProblem.MissingRequiredField, None + ) + + return not collector.has_validation_problems() def build_nested_dict_from( @@ -174,7 +303,7 @@ def validate_dict_against( appdef (str): The appdef name to validate against. mapping (Mapping[str, Any]): The mapping containing the data to validate. - This should be a dict of `/` separated paths. + This should be a dict of `/` separated paths elements. Attributes are denoted with `@` in front of the last element. ignore_undocumented (bool, optional): Ignore all undocumented keys in the verification diff --git a/src/pynxtools/dataconverter/verify.py b/src/pynxtools/dataconverter/verify.py new file mode 100644 index 000000000..dd2a3c5a1 --- /dev/null +++ b/src/pynxtools/dataconverter/verify.py @@ -0,0 +1,84 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Verifies a nxs file""" + +import logging +import os +import sys +import xml.etree.ElementTree as ET +from os import path +from typing import Dict, Union + +import click +from h5py import File, is_hdf5 + +from pynxtools.dataconverter import helpers +from pynxtools.dataconverter.validation import validate_hdf_group_against + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def _get_def_map(file: str) -> Dict[str, str]: + def_map: Dict[str, str] = {} + with File(file, "r") as h5file: + for entry_name, dataset in h5file.items(): + if ( + helpers.clean_str_attr(dataset.attrs.get("NX_class")) == "NXentry" + and f"/{entry_name}/definition" in h5file + ): + def_map.update( + {entry_name: h5file[f"/{entry_name}/definition"][()].decode("utf8")} + ) + + return def_map + + +@click.command() +@click.argument("file") +def verify(file: str): + """Verifies a nexus file""" + + if not path.exists(file): + raise click.FileError(file, hint=f'File "{file}" does not exist.') + + if not path.isfile(file): + raise click.FileError(file, hint=f'"{file}" is not a file.') + + if not is_hdf5(file): + raise click.FileError(file, hint=f'"{file}" is not a valid HDF5 file.') + + def_map = _get_def_map(file) + + if not def_map: + logger.info(f"Could not find any valid entry in file {file}") + + with File(file, "r") as h5file: + for entry, nxdl in def_map.items(): + is_valid = validate_hdf_group_against(nxdl, h5file[entry]) + + if is_valid: + logger.info( + f"The entry `{entry}` in file `{file}` is a valid file" + f" according to the `{nxdl}` application definition.", + ) + else: + logger.info( + f"Invalid: The entry `{entry}` in file `{file}` is NOT a valid file" + f" according to the `{nxdl}` application definition.", + ) diff --git a/src/pynxtools/definitions b/src/pynxtools/definitions index f75a29836..40a2e8ad5 160000 --- a/src/pynxtools/definitions +++ b/src/pynxtools/definitions @@ -1 +1 @@ -Subproject commit f75a29836431f35d68df6174e3868a0418523397 +Subproject commit 40a2e8ad57facb6f16839ef47c4d3eedcc2e4b37 diff --git a/tests/dataconverter/test_helpers.py b/tests/dataconverter/test_helpers.py index ddbcb2351..84eb61506 100644 --- a/tests/dataconverter/test_helpers.py +++ b/tests/dataconverter/test_helpers.py @@ -336,11 +336,7 @@ def fixture_filled_test_data(template, tmp_path): "/ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", "required", ), - ( - "The data entry corresponding to /ENTRY[my_entry]/NXODD_name[nxodd_name]" - "/bool_value is" - " required and hasn't been supplied by the reader." - ), + "Missing field: /ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", id="empty-required-field", ), pytest.param( @@ -349,11 +345,7 @@ def fixture_filled_test_data(template, tmp_path): "/ENTRY[my_entry]/NXODD_name[nxodd_two_name]/bool_value", "required", ), - ( - "The data entry corresponding to /ENTRY[my_entry]/" - "NXODD_name[nxodd_two_name]/bool_value is" - " required and hasn't been supplied by the reader." - ), + "Missing field: /ENTRY[my_entry]/NXODD_name[nxodd_two_name]/bool_value", id="empty-required-field", ), pytest.param( @@ -366,11 +358,7 @@ def fixture_filled_test_data(template, tmp_path): "/ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", "required", ), - ( - "The data entry corresponding to /ENTRY[my_entry]/NXODD_name[nxodd_name]" - "/bool_value is" - " required and hasn't been supplied by the reader." - ), + "Missing field: /ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", id="empty-required-field", ), pytest.param( @@ -432,10 +420,7 @@ def fixture_filled_test_data(template, tmp_path): set_to_none_in_dict( TEMPLATE, "/ENTRY[my_entry]/optional_parent/required_child", "optional" ), - ( - "The data entry corresponding to /ENTRY[my_entry]/optional_parent/" - "required_child is required and hasn't been supplied by the reader." - ), + "Missing field: /ENTRY[my_entry]/optional_parent/required_child", id="atleast-one-required-child-not-provided-optional-parent", ), pytest.param( @@ -444,11 +429,7 @@ def fixture_filled_test_data(template, tmp_path): "/ENTRY[my_entry]/OPTIONAL_group[my_group]/required_field", "required", ), - ( - "The data entry corresponding to /ENTRY[my_entry]/" - "OPTIONAL_group[my_group]/required_field " - "is required and hasn't been supplied by the reader." - ), + "Missing field: /ENTRY[my_entry]/OPTIONAL_group[my_group]/required_field", id="required-field-not-provided-in-variadic-optional-group", ), pytest.param( diff --git a/tests/dataconverter/test_validation.py b/tests/dataconverter/test_validation.py index 135d0476c..a8dde5ffa 100644 --- a/tests/dataconverter/test_validation.py +++ b/tests/dataconverter/test_validation.py @@ -17,11 +17,17 @@ # limitations under the License. # import logging +import os from typing import Any, Dict, List, Tuple, Union import numpy as np import pytest +from click.testing import CliRunner +from pynxtools.dataconverter.helpers import get_nxdl_root_and_path +from pynxtools.dataconverter.template import Template from pynxtools.dataconverter.validation import validate_dict_against +from pynxtools.dataconverter.verify import verify +from pynxtools.dataconverter.writer import Writer def get_data_dict(): @@ -109,7 +115,7 @@ def test_valid_data_dict(caplog, data_dict): remove_from_dict( "/ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", get_data_dict() ), - "The data entry corresponding to /ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value is required and hasn't been supplied by the reader.", + "Missing field: /ENTRY[my_entry]/NXODD_name[nxodd_name]/bool_value", id="missing-required-value", ) ], @@ -119,3 +125,68 @@ def test_validation_shows_warning(caplog, data_dict, error_message): assert not validate_dict_against("NXtest", data_dict) assert error_message in caplog.text + + +data_dict_list = [ + ( + { + "/ENTRY[entry]/definition": "NXhdf5_validator_2", + "/ENTRY[entry]/version": "no version", + "/ENTRY[entry]/experiment_result/hdf5_validator_2_intensity": np.array( + [[11, 12, 13], [21, 22, 23]] + ), + "/ENTRY[entry]/hdf5_validator_1_program_name": "hdf5_file_validator", + "/ENTRY[entry]/hdf5_validator_1_required/required_field": "Required_field_from nxdl-1", + "/ENTRY[entry]/hdf5_validator_2_users_req/required_field": "Required_field_from_nxdl-2", + }, + { + "error_messages": [ + "WARNING: Field version written without documentation.", + 'WARNING: Missing attribute: "/ENTRY/experiment_result/@long_name"', + 'WARNING: Missing attribute: "/ENTRY/experiment_result/@AXISNAME_indices"', + 'WARNING: Missing attribute: "/ENTRY/experiment_result/@axes"', + 'WARNING: Missing attribute: "/ENTRY/experiment_result/@auxiliary_signals"', + "WARNING: Missing field: /ENTRY/experiment_result/DATA", + 'WARNING: Missing attribute: "/ENTRY/experiment_result/DATA/@units"', + "WARNING: Missing field: /ENTRY/experiment_result/AXISNAME", + 'WARNING: Missing attribute: "/ENTRY/experiment_result/AXISNAME/@units"', + 'WARNING: Missing attribute: "/ENTRY/experiment_result/@signal"', + 'WARNING: Missing attribute: "/ENTRY/definition/@version"', + "is NOT a valid file according to the `NXhdf5_validator_2` application definition.", + ] + }, + ), + ({}, {}), +] + + +@pytest.mark.parametrize("data_dict, error_massages", data_dict_list) +def test_nexus_file_validate(data_dict, error_massages, tmp_path, caplog): + if not data_dict and not error_massages: + return + + caplog_level = "INFO" + template = Template() + + for key, val in data_dict.items(): + template[key] = val + + nxdl_name = "NXhdf5_validator_2" + _, nxdl_path = get_nxdl_root_and_path(nxdl=nxdl_name) + hdf_file_path = tmp_path / "hdf5_validator_test.nxs" + Writer(data=template, nxdl_f_path=nxdl_path, output_path=hdf_file_path).write() + with caplog.at_level(caplog_level): + _ = CliRunner().invoke(verify, [str(hdf_file_path)]) + error_massages = error_massages["error_messages"] + for record in caplog.records: + try: + assert ( + record.msg in error_massages + ), f"Error message not found: {record.msg}" + except AssertionError: + # Only for detecting entry or missing application definition massage + assert ( + error_massages[-1] in record.msg + ), f"Error message not found: {record.msg}" + + os.remove(hdf_file_path)