From 29f1ac9c734a01e75eab579c4a31730f084900eb Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:06:25 +0200 Subject: [PATCH 01/10] save current state --- .gitmodules | 2 +- src/pynxtools/dataconverter/writer.py | 148 ++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 71907ead7..ed00dea93 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "src/pynxtools/definitions"] path = src/pynxtools/definitions - url = https://github.com/FAIRmat-NFDI/nexus_definitions.git \ No newline at end of file + url = https://github.com/FAIRmat-NFDI/nexus_definitions.git diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index d22307c88..467554173 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -237,6 +237,134 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: return elem.attrib + def get_nxdl_docs(self, path: str = "/", attr: bool = False) -> dict: + """Get the NXDL docs for a path in the data.""" + + def extract_and_format_docs(elem: ET.Element) -> str: + """Get the docstring for a given element in the NDXL tree.""" + docs = elem.findall(f"{self.nxs_namespace}doc") + if docs: + return docs[0].text.strip().replace("\\n", "\n") + return "" + + nxdl_path = helpers.convert_data_converter_dict_to_nxdl_path(path) + + try: + elem = get_node_at_nxdl_path(nxdl_path, elem=copy.deepcopy(self.nxdl_data)) + except NxdlAttributeNotFoundError: + return None + + if not attr: + return extract_and_format_docs(elem) + else: + from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( + get_inherited_nodes, + ) + + (class_path, nxdlpath, elist) = get_inherited_nodes( + nxdl_path, elem=copy.deepcopy(self.nxdl_data) + ) + print(class_path, nxdlpath, elist) + if "data/@signal" in path: + print((class_path, nxdlpath, elist)) + elem = get_node_at_nxdl_path( + nxdl_path, elem=copy.deepcopy(self.nxdl_data) + ) + print(path, nxdl_path, elem) + + # return get_nxdl_attr_doc(elem) + + # def append_docs(elem, doc: str): + # """Append docs to existing docs.""" + # if isinstance(elem, (h5py.Group, h5py.Dataset)): + # existing_doc = elem.attrs.get("doc", "") + # # if isinstance(elem, (h5py.Attribute)): + # # existing_doc = str(elem) + + # if doc: + # if isinstance(existing_doc, str): + # return existing_doc + doc + # else: + # return doc + + # def get_nxdl_attr_doc( # pylint: disable=too-many-arguments,too-many-locals + # elem, elist, attr, hdf_node, logger, doc, nxdl_path, req_str, path, hdf_info + # ): + # """Get nxdl documentation for an attribute""" + # new_elem = [] + # old_elem = elem + # attr_inheritance_chain = [] + # for elem_index, act_elem1 in enumerate(elist): + # act_elem = act_elem1 + # # NX_class is a compulsory attribute for groups in a nexus file + # # which should match the type of the corresponding NXDL element + # if ( + # attr == "NX_class" + # and not isinstance(hdf_node, h5py.Dataset) + # and elem_index == 0 + # ): + # elem = None + # logger, doc, attr = write_doc_string(logger, doc, attr) + # new_elem = elem + # break + # # units category is a compulsory attribute for any fields + # if attr == "units" and isinstance(hdf_node, h5py.Dataset): + # req_str = "<>" + # logger, act_elem, attr_inheritance_chain, doc, attr = try_find_units( + # logger, act_elem, attr_inheritance_chain, doc, attr + # ) + # # units for attributes can be given as ATTRIBUTENAME_units + # elif attr.endswith("_units"): + # logger, act_elem, attr_inheritance_chain, doc, attr, req_str = ( + # check_attr_name_nxdl( + # (logger, act_elem, attr_inheritance_chain, doc, attr, req_str) + # ) + # ) + # # default is allowed for groups + # elif attr == "default" and not isinstance(hdf_node, h5py.Dataset): + # req_str = "<>" + # # try to find if default is defined as a child of the NXDL element + # act_elem = get_nxdl_child( + # act_elem, attr, nexus_type="attribute", go_base=False + # ) + # logger, act_elem, attr_inheritance_chain, doc, attr = try_find_default( + # logger, act_elem1, act_elem, attr_inheritance_chain, doc, attr + # ) + # else: # other attributes + # act_elem = get_nxdl_child( + # act_elem, attr, nexus_type="attribute", go_base=False + # ) + # if act_elem is not None: + # logger, act_elem, attr_inheritance_chain, doc, attr = other_attrs( + # logger, act_elem1, act_elem, attr_inheritance_chain, doc, attr + # ) + # if act_elem is not None: + # new_elem.append(act_elem) + # if req_str is None: + # req_str = get_required_string(act_elem) # check for being required + # if doc: + # logger.debug(req_str) + # variables = [logger, act_elem, path] + # ( + # logger, + # elem, + # path, + # doc, + # elist, + # attr, + # hdf_node, + # ) = check_deprecation_enum_axis(variables, doc, elist, attr, hdf_node) + # elem = old_elem + # if req_str is None and doc: + # if attr != "NX_class": + # logger.debug("@" + attr + " - IS NOT IN SCHEMA") + # logger.debug("") + + # # Add the lowest child element to the nxdl_path + # if attr_inheritance_chain: + # nxdl_path.append(attr_inheritance_chain[0]) + # return (req_str, get_nxdl_entry(hdf_info), nxdl_path) + def ensure_and_get_parent_node(self, path: str, undocumented_paths) -> h5py.Group: """Returns the parent if it exists for a given path else creates the parent group.""" parent_path = path[0 : path.rindex("/")] or "/" @@ -249,6 +377,11 @@ def ensure_and_get_parent_node(self, path: str, undocumented_paths) -> h5py.Grou if attrs is not None: grp.attrs["NX_class"] = attrs["type"] + + docs = self.get_nxdl_docs(parent_path) + if docs: + grp.attrs["doc"] = docs + return grp return self.output_nexus[parent_path_hdf5] @@ -263,6 +396,11 @@ def add_units_key(dataset, path): dataset.attrs["units"] = self.data[units_key] for path, value in self.data.items(): + if path.split("/")[-1][0] == "@": + docs = self.get_nxdl_docs(path, attr=True) + else: + docs = self.get_nxdl_docs(path) + try: if path[path.rindex("/") + 1 :] == "@units": continue @@ -279,17 +417,24 @@ def add_units_key(dataset, path): grp = self.ensure_and_get_parent_node( path, self.data.undocumented.keys() ) + if isinstance(data, dict): if "compress" in data.keys(): dataset = handle_dicts_entries( data, grp, entry_name, self.output_path, path ) + if docs: + dataset.attrs["doc"] = docs + else: hdf5_links_for_later.append( [data, grp, entry_name, self.output_path, path] ) else: dataset = grp.create_dataset(entry_name, data=data) + if docs: + dataset.attrs["doc"] = docs + except InvalidDictProvided as exc: print(str(exc)) except Exception as exc: @@ -305,6 +450,7 @@ def add_units_key(dataset, path): del self.data[links[-1]] for path, value in self.data.items(): + docs = self.get_nxdl_docs(path) try: if path[path.rindex("/") + 1 :] == "@units": continue @@ -327,6 +473,8 @@ def add_units_key(dataset, path): path, self.data.undocumented.keys() ) dataset.attrs[entry_name[1:]] = data + if docs: + dataset.attrs["doc"] = docs except Exception as exc: raise IOError( f"Unknown error occured writing the path: {path} " From 24c3b4138286db687e067eacad62cdc7198fa78a Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:09:51 +0200 Subject: [PATCH 02/10] fix everything except for attribute doc retrieval --- src/pynxtools/data/NXtest.nxdl.xml | 1 + src/pynxtools/dataconverter/convert.py | 15 +- src/pynxtools/dataconverter/writer.py | 195 ++++++++----------------- tests/dataconverter/test_writer.py | 20 +++ 4 files changed, 93 insertions(+), 138 deletions(-) diff --git a/src/pynxtools/data/NXtest.nxdl.xml b/src/pynxtools/data/NXtest.nxdl.xml index 8695a20c9..01e39214d 100644 --- a/src/pynxtools/data/NXtest.nxdl.xml +++ b/src/pynxtools/data/NXtest.nxdl.xml @@ -14,6 +14,7 @@ This is a dummy NXDL to test out the dataconverter. + This is the version of the definition. diff --git a/src/pynxtools/dataconverter/convert.py b/src/pynxtools/dataconverter/convert.py index 508071906..0ca50fc44 100644 --- a/src/pynxtools/dataconverter/convert.py +++ b/src/pynxtools/dataconverter/convert.py @@ -237,7 +237,11 @@ def convert( ) helpers.add_default_root_attributes(data=data, filename=os.path.basename(output)) - Writer(data=data, nxdl_f_path=nxdl_f_path, output_path=output).write() + + write_docs = kwargs.pop("write_docs", False) + Writer(data=data, nxdl_f_path=nxdl_f_path, output_path=output).write( + write_docs=write_docs + ) logger.info(f"The output file generated: {output}.") @@ -350,6 +354,12 @@ def main_cli(): default=None, help="A json config file for the reader", ) +@click.option( + "--write-docs", + is_flag=True, + default=False, + help="Write docs for the individual NeXus concepts as HDF5 attributes.", +) # pylint: disable=too-many-arguments def convert_cli( files: Tuple[str, ...], @@ -390,6 +400,9 @@ def convert_cli( if config_file: kwargs["config_file"] = config_file + if write_docs: + kwargs["write_docs"] = write_docs + file_list = [] for file in files: if os.path.isdir(file): diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index 467554173..e7cd667b6 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -21,7 +21,6 @@ import copy import logging -import sys import xml.etree.ElementTree as ET import h5py @@ -32,6 +31,7 @@ from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( NxdlAttributeNotFoundError, get_node_at_nxdl_path, + get_inherited_nodes, ) logger = logging.getLogger("pynxtools") # pylint: disable=C0103 @@ -109,7 +109,7 @@ def handle_shape_entries(data, file, path): # pylint: disable=too-many-locals, inconsistent-return-statements -def handle_dicts_entries(data, grp, entry_name, output_path, path): +def handle_dicts_entries(data, grp, entry_name, output_path, path, docs): """Handle function for dictionaries found as value of the nexus file. Several cases can be encoutered: @@ -118,12 +118,14 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path): - Internal links - External links - compression label""" + if "link" in data: file, path = split_link(data, output_path) # generate virtual datasets from slices if "shape" in data.keys(): layout = handle_shape_entries(data, file, path) - grp.create_virtual_dataset(entry_name, layout) + dataset = grp.create_virtual_dataset(entry_name, layout) + # multiple datasets to concatenate elif "link" in data.keys() and isinstance(data["link"], list): total_length = 0 @@ -141,7 +143,7 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path): for vsource in sources: layout[offset : offset + vsource.shape[0]] = vsource offset += vsource.shape[0] - grp.create_virtual_dataset(entry_name, layout, fillvalue=0) + dataset = grp.create_virtual_dataset(entry_name, layout, fillvalue=0) # internal and external links elif "link" in data.keys(): if ":/" not in data["link"]: @@ -159,7 +161,7 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path): ) if accept is True: strength = data["strength"] - grp.create_dataset( + dataset = grp.create_dataset( entry_name, data=data["compress"], compression="gzip", @@ -167,13 +169,20 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path): compression_opts=strength, ) else: - grp.create_dataset(entry_name, data=data["compress"]) + dataset = grp.create_dataset(entry_name, data=data["compress"]) else: raise InvalidDictProvided( "A dictionary was provided to the template but it didn't" " fall into any of the know cases of handling" " dictionaries. This occured for: " + entry_name ) + + if docs: + try: + dataset.attrs["docs"] = docs + except NameError: + pass + # Check whether link has been stabilished or not try: return grp[entry_name] @@ -198,10 +207,14 @@ class Writer: output_nexus (h5py.File): The h5py file object to manipulate output file. nxdl_data (dict): Stores xml data from given nxdl file to use during conversion. nxs_namespace (str): The namespace used in the NXDL tags. Helps search for XML children. + write_docs (bool): Write docs for the individual NeXus concepts as HDF5 attributes. """ def __init__( - self, data: dict = None, nxdl_f_path: str = None, output_path: str = None + self, + data: dict = None, + nxdl_f_path: str = None, + output_path: str = None, ): """Constructs the necessary objects required by the Writer class.""" self.data = data @@ -211,6 +224,8 @@ def __init__( self.nxdl_data = ET.parse(self.nxdl_f_path).getroot() self.nxs_namespace = get_namespace(self.nxdl_data) + self.write_docs: bool = False + def __nxdl_to_attrs(self, path: str = "/") -> dict: """ Return a dictionary of all the attributes at the given path in the NXDL and @@ -237,7 +252,7 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: return elem.attrib - def get_nxdl_docs(self, path: str = "/", attr: bool = False) -> dict: + def __nxdl_docs(self, path: str = "/") -> dict: """Get the NXDL docs for a path in the data.""" def extract_and_format_docs(elem: ET.Element) -> str: @@ -247,123 +262,27 @@ def extract_and_format_docs(elem: ET.Element) -> str: return docs[0].text.strip().replace("\\n", "\n") return "" + def get_nxdl_attr_doc(nxdl_path): + return "" + + docs: str = "" + + if not self.write_docs: + return docs + nxdl_path = helpers.convert_data_converter_dict_to_nxdl_path(path) - try: - elem = get_node_at_nxdl_path(nxdl_path, elem=copy.deepcopy(self.nxdl_data)) - except NxdlAttributeNotFoundError: - return None + _, _, elist = get_inherited_nodes(nxdl_path, elem=copy.deepcopy(self.nxdl_data)) - if not attr: - return extract_and_format_docs(elem) - else: - from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( - get_inherited_nodes, - ) + for elem in elist: + if not docs: + # Only use docs from superclasses if they are not extended. + docs += extract_and_format_docs(elem) - (class_path, nxdlpath, elist) = get_inherited_nodes( - nxdl_path, elem=copy.deepcopy(self.nxdl_data) - ) - print(class_path, nxdlpath, elist) - if "data/@signal" in path: - print((class_path, nxdlpath, elist)) - elem = get_node_at_nxdl_path( - nxdl_path, elem=copy.deepcopy(self.nxdl_data) - ) - print(path, nxdl_path, elem) - - # return get_nxdl_attr_doc(elem) - - # def append_docs(elem, doc: str): - # """Append docs to existing docs.""" - # if isinstance(elem, (h5py.Group, h5py.Dataset)): - # existing_doc = elem.attrs.get("doc", "") - # # if isinstance(elem, (h5py.Attribute)): - # # existing_doc = str(elem) - - # if doc: - # if isinstance(existing_doc, str): - # return existing_doc + doc - # else: - # return doc - - # def get_nxdl_attr_doc( # pylint: disable=too-many-arguments,too-many-locals - # elem, elist, attr, hdf_node, logger, doc, nxdl_path, req_str, path, hdf_info - # ): - # """Get nxdl documentation for an attribute""" - # new_elem = [] - # old_elem = elem - # attr_inheritance_chain = [] - # for elem_index, act_elem1 in enumerate(elist): - # act_elem = act_elem1 - # # NX_class is a compulsory attribute for groups in a nexus file - # # which should match the type of the corresponding NXDL element - # if ( - # attr == "NX_class" - # and not isinstance(hdf_node, h5py.Dataset) - # and elem_index == 0 - # ): - # elem = None - # logger, doc, attr = write_doc_string(logger, doc, attr) - # new_elem = elem - # break - # # units category is a compulsory attribute for any fields - # if attr == "units" and isinstance(hdf_node, h5py.Dataset): - # req_str = "<>" - # logger, act_elem, attr_inheritance_chain, doc, attr = try_find_units( - # logger, act_elem, attr_inheritance_chain, doc, attr - # ) - # # units for attributes can be given as ATTRIBUTENAME_units - # elif attr.endswith("_units"): - # logger, act_elem, attr_inheritance_chain, doc, attr, req_str = ( - # check_attr_name_nxdl( - # (logger, act_elem, attr_inheritance_chain, doc, attr, req_str) - # ) - # ) - # # default is allowed for groups - # elif attr == "default" and not isinstance(hdf_node, h5py.Dataset): - # req_str = "<>" - # # try to find if default is defined as a child of the NXDL element - # act_elem = get_nxdl_child( - # act_elem, attr, nexus_type="attribute", go_base=False - # ) - # logger, act_elem, attr_inheritance_chain, doc, attr = try_find_default( - # logger, act_elem1, act_elem, attr_inheritance_chain, doc, attr - # ) - # else: # other attributes - # act_elem = get_nxdl_child( - # act_elem, attr, nexus_type="attribute", go_base=False - # ) - # if act_elem is not None: - # logger, act_elem, attr_inheritance_chain, doc, attr = other_attrs( - # logger, act_elem1, act_elem, attr_inheritance_chain, doc, attr - # ) - # if act_elem is not None: - # new_elem.append(act_elem) - # if req_str is None: - # req_str = get_required_string(act_elem) # check for being required - # if doc: - # logger.debug(req_str) - # variables = [logger, act_elem, path] - # ( - # logger, - # elem, - # path, - # doc, - # elist, - # attr, - # hdf_node, - # ) = check_deprecation_enum_axis(variables, doc, elist, attr, hdf_node) - # elem = old_elem - # if req_str is None and doc: - # if attr != "NX_class": - # logger.debug("@" + attr + " - IS NOT IN SCHEMA") - # logger.debug("") - - # # Add the lowest child element to the nxdl_path - # if attr_inheritance_chain: - # nxdl_path.append(attr_inheritance_chain[0]) - # return (req_str, get_nxdl_entry(hdf_info), nxdl_path) + if not elist: + return get_nxdl_attr_doc(nxdl_path) + + return docs def ensure_and_get_parent_node(self, path: str, undocumented_paths) -> h5py.Group: """Returns the parent if it exists for a given path else creates the parent group.""" @@ -378,9 +297,9 @@ def ensure_and_get_parent_node(self, path: str, undocumented_paths) -> h5py.Grou if attrs is not None: grp.attrs["NX_class"] = attrs["type"] - docs = self.get_nxdl_docs(parent_path) + docs = self.__nxdl_docs(parent_path) if docs: - grp.attrs["doc"] = docs + grp.attrs["docs"] = docs return grp return self.output_nexus[parent_path_hdf5] @@ -396,10 +315,7 @@ def add_units_key(dataset, path): dataset.attrs["units"] = self.data[units_key] for path, value in self.data.items(): - if path.split("/")[-1][0] == "@": - docs = self.get_nxdl_docs(path, attr=True) - else: - docs = self.get_nxdl_docs(path) + docs = self.__nxdl_docs(path) try: if path[path.rindex("/") + 1 :] == "@units": @@ -421,19 +337,17 @@ def add_units_key(dataset, path): if isinstance(data, dict): if "compress" in data.keys(): dataset = handle_dicts_entries( - data, grp, entry_name, self.output_path, path + data, grp, entry_name, self.output_path, path, docs ) - if docs: - dataset.attrs["doc"] = docs else: hdf5_links_for_later.append( - [data, grp, entry_name, self.output_path, path] + [data, grp, entry_name, self.output_path, path, docs] ) else: dataset = grp.create_dataset(entry_name, data=data) if docs: - dataset.attrs["doc"] = docs + dataset.attrs["docs"] = docs except InvalidDictProvided as exc: print(str(exc)) @@ -450,7 +364,7 @@ def add_units_key(dataset, path): del self.data[links[-1]] for path, value in self.data.items(): - docs = self.get_nxdl_docs(path) + docs = self.__nxdl_docs(path) try: if path[path.rindex("/") + 1 :] == "@units": continue @@ -458,6 +372,7 @@ def add_units_key(dataset, path): entry_name = helpers.get_name_from_data_dict_entry( path[path.rindex("/") + 1 :] ) + if is_not_data_empty(value): data = value else: @@ -468,21 +383,27 @@ def add_units_key(dataset, path): add_units_key(self.output_nexus[path_hdf5], path) else: - # consider changing the name here the lvalue can also be group! dataset = self.ensure_and_get_parent_node( path, self.data.undocumented.keys() ) dataset.attrs[entry_name[1:]] = data if docs: - dataset.attrs["doc"] = docs + # Write docs for attributes like __docs + dataset.attrs[f"{entry_name[1:]}__docs"] = docs except Exception as exc: raise IOError( f"Unknown error occured writing the path: {path} " f"with the following message: {str(exc)}" ) from exc - def write(self): - """Writes the NeXus file with previously validated data from the reader with NXDL attrs.""" + def write(self, write_docs=False): + """ + Writes the NeXus file with previously validated data from the reader with NXDL attrs. + + Args: + write_docs (bool): Write docs for the individual NeXus concepts as HDF5 attributes. The default is False. + """ + self.write_docs = write_docs try: self._put_data_into_hdf5() finally: diff --git a/tests/dataconverter/test_writer.py b/tests/dataconverter/test_writer.py index acc84d8d5..6f6f7c898 100644 --- a/tests/dataconverter/test_writer.py +++ b/tests/dataconverter/test_writer.py @@ -59,6 +59,26 @@ def test_write(writer): assert test_nxs["/my_entry/nxodd_name/posint_value"].shape == (3,) # pylint: disable=no-member +def test_write_docs(writer): + """Test for the Writer's write_docs option. Checks whether docs are written for NeXus concepts.""" + writer.write(write_docs=True) + test_nxs = h5py.File(writer.output_path, "r") + # print(writer.output_path) + assert ( + test_nxs["/my_entry"].attrs["docs"] + == "This is a dummy NXDL to test out the dataconverter." + ) + # assert test_nxs["/my_entry/definition"].attrs["version__docs"] == "This is the version of the definition." + assert ( + test_nxs["/my_entry/nxodd_name/int_value"].attrs["docs"] + == "A dummy entry for an int value." + ) + assert ( + test_nxs["/my_entry/required_group"].attrs["docs"] + == "This is a required yet empty group." + ) + + def test_write_link(writer): """Test for the Writer's write function. From a8132430d2d10d620f1ebf8eb920e711c712010d Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:02:54 +0200 Subject: [PATCH 03/10] mypy fixes --- src/pynxtools/dataconverter/convert.py | 1 + src/pynxtools/dataconverter/writer.py | 10 ++-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/pynxtools/dataconverter/convert.py b/src/pynxtools/dataconverter/convert.py index 0ca50fc44..8cf6cd0a6 100644 --- a/src/pynxtools/dataconverter/convert.py +++ b/src/pynxtools/dataconverter/convert.py @@ -373,6 +373,7 @@ def convert_cli( mapping: str, config_file: str, fail: bool, + write_docs: bool, **kwargs, ): """This command allows you to use the converter functionality of the dataconverter.""" diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index e7cd667b6..d4b44f46d 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -118,14 +118,12 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, docs): - Internal links - External links - compression label""" - if "link" in data: file, path = split_link(data, output_path) # generate virtual datasets from slices if "shape" in data.keys(): layout = handle_shape_entries(data, file, path) dataset = grp.create_virtual_dataset(entry_name, layout) - # multiple datasets to concatenate elif "link" in data.keys() and isinstance(data["link"], list): total_length = 0 @@ -211,10 +209,7 @@ class Writer: """ def __init__( - self, - data: dict = None, - nxdl_f_path: str = None, - output_path: str = None, + self, data: dict = None, nxdl_f_path: str = None, output_path: str = None ): """Constructs the necessary objects required by the Writer class.""" self.data = data @@ -252,7 +247,7 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: return elem.attrib - def __nxdl_docs(self, path: str = "/") -> dict: + def __nxdl_docs(self, path: str = "/") -> str: """Get the NXDL docs for a path in the data.""" def extract_and_format_docs(elem: ET.Element) -> str: @@ -372,7 +367,6 @@ def add_units_key(dataset, path): entry_name = helpers.get_name_from_data_dict_entry( path[path.rindex("/") + 1 :] ) - if is_not_data_empty(value): data = value else: From 19f6364f7015dd506002f77bcc066a1e86ac7570 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Fri, 30 Aug 2024 10:26:07 +0200 Subject: [PATCH 04/10] add special logic for appdef dos and attributes --- src/pynxtools/dataconverter/writer.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index d4b44f46d..f52157f64 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -257,16 +257,19 @@ def extract_and_format_docs(elem: ET.Element) -> str: return docs[0].text.strip().replace("\\n", "\n") return "" - def get_nxdl_attr_doc(nxdl_path): - return "" - docs: str = "" if not self.write_docs: - return docs + return nxdl_path = helpers.convert_data_converter_dict_to_nxdl_path(path) + if nxdl_path == "/ENTRY": + # Special case for docs of application definition + app_def_docs = extract_and_format_docs(self.nxdl_data) + if app_def_docs: + return app_def_docs + _, _, elist = get_inherited_nodes(nxdl_path, elem=copy.deepcopy(self.nxdl_data)) for elem in elist: @@ -275,7 +278,14 @@ def get_nxdl_attr_doc(nxdl_path): docs += extract_and_format_docs(elem) if not elist: - return get_nxdl_attr_doc(nxdl_path) + # Handle docs for attributeS + (_, inherited_nodes, _) = get_inherited_nodes( + nxdl_path, elem=copy.deepcopy(self.nxdl_data) + ) + attrs = inherited_nodes[-1].findall(f"{self.nxs_namespace}attribute") + for attr in attrs: + if attr.attrib["name"] == path.split("@")[-1]: + docs += extract_and_format_docs(attr) return docs From 1d24501e241d8f901b760e6a0bb3676ec6fe9663 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Fri, 30 Aug 2024 10:28:19 +0200 Subject: [PATCH 05/10] test for attribute docs --- tests/dataconverter/test_writer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/dataconverter/test_writer.py b/tests/dataconverter/test_writer.py index 6f6f7c898..a6502c519 100644 --- a/tests/dataconverter/test_writer.py +++ b/tests/dataconverter/test_writer.py @@ -68,7 +68,10 @@ def test_write_docs(writer): test_nxs["/my_entry"].attrs["docs"] == "This is a dummy NXDL to test out the dataconverter." ) - # assert test_nxs["/my_entry/definition"].attrs["version__docs"] == "This is the version of the definition." + assert ( + test_nxs["/my_entry/definition"].attrs["version__docs"] + == "This is the version of the definition." + ) assert ( test_nxs["/my_entry/nxodd_name/int_value"].attrs["docs"] == "A dummy entry for an int value." From b289e49cf8615944cdcde77ab7d44da864438934 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:05:19 +0200 Subject: [PATCH 06/10] mypy fix --- src/pynxtools/dataconverter/writer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index f52157f64..45a499d62 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -22,6 +22,7 @@ import copy import logging import xml.etree.ElementTree as ET +from typing import Optional import h5py import numpy as np @@ -247,7 +248,7 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: return elem.attrib - def __nxdl_docs(self, path: str = "/") -> str: + def __nxdl_docs(self, path: str = "/") -> Optional[str]: """Get the NXDL docs for a path in the data.""" def extract_and_format_docs(elem: ET.Element) -> str: @@ -260,7 +261,7 @@ def extract_and_format_docs(elem: ET.Element) -> str: docs: str = "" if not self.write_docs: - return + return None nxdl_path = helpers.convert_data_converter_dict_to_nxdl_path(path) From 81edf905c27cb8be7717b41e4f197ad1daa8c1db Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:35:24 +0200 Subject: [PATCH 07/10] fix attribute docs in NXtest.nxdl.xml --- src/pynxtools/data/NXtest.nxdl.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pynxtools/data/NXtest.nxdl.xml b/src/pynxtools/data/NXtest.nxdl.xml index 01e39214d..04d9dd1b8 100644 --- a/src/pynxtools/data/NXtest.nxdl.xml +++ b/src/pynxtools/data/NXtest.nxdl.xml @@ -13,8 +13,9 @@ This is a dummy NXDL to test out the dataconverter. - + This is the version of the definition. + From 10c2987b0209bb4a43df0863591f77f9345f2d98 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:06:09 +0200 Subject: [PATCH 08/10] allow for different docstyles --- src/pynxtools/dataconverter/convert.py | 24 ++++++++++++++++-- src/pynxtools/dataconverter/writer.py | 34 ++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/pynxtools/dataconverter/convert.py b/src/pynxtools/dataconverter/convert.py index 8cf6cd0a6..66aecff29 100644 --- a/src/pynxtools/dataconverter/convert.py +++ b/src/pynxtools/dataconverter/convert.py @@ -239,8 +239,10 @@ def convert( helpers.add_default_root_attributes(data=data, filename=os.path.basename(output)) write_docs = kwargs.pop("write_docs", False) + docs_format = kwargs.pop("docs_format", None) Writer(data=data, nxdl_f_path=nxdl_f_path, output_path=output).write( - write_docs=write_docs + write_docs=write_docs, + docs_format=docs_format, ) logger.info(f"The output file generated: {output}.") @@ -360,7 +362,15 @@ def main_cli(): default=False, help="Write docs for the individual NeXus concepts as HDF5 attributes.", ) -# pylint: disable=too-many-arguments +@click.option( + "--docs-format", + type=click.Choice(["default", "html", "html5", "xml", "pseudoxml"]), + default=None, + help=( + "Optionally specify the format in which the docs for the individual NeXus concepts is generated. " + "By default, the docs are formatted as in the NXDL file." + ), +) def convert_cli( files: Tuple[str, ...], input_file: Tuple[str, ...], @@ -374,6 +384,7 @@ def convert_cli( config_file: str, fail: bool, write_docs: bool, + docs_format: str, **kwargs, ): """This command allows you to use the converter functionality of the dataconverter.""" @@ -403,6 +414,15 @@ def convert_cli( if write_docs: kwargs["write_docs"] = write_docs + if not docs_format: + kwargs["docs_format"] = "default" + else: + kwargs["docs_format"] = docs_format + + elif docs_format is not None: + raise click.UsageError( + "Error: --docs-format can only be used with --write-docs." + ) file_list = [] for file in files: diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index 45a499d62..1945e5ab6 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -26,6 +26,7 @@ import h5py import numpy as np +from docutils.core import publish_string from pynxtools.dataconverter import helpers from pynxtools.dataconverter.exceptions import InvalidDictProvided @@ -210,7 +211,10 @@ class Writer: """ def __init__( - self, data: dict = None, nxdl_f_path: str = None, output_path: str = None + self, + data: dict = None, + nxdl_f_path: str = None, + output_path: str = None, ): """Constructs the necessary objects required by the Writer class.""" self.data = data @@ -221,6 +225,7 @@ def __init__( self.nxs_namespace = get_namespace(self.nxdl_data) self.write_docs: bool = False + self.docs_format: str = "default" def __nxdl_to_attrs(self, path: str = "/") -> dict: """ @@ -251,11 +256,29 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: def __nxdl_docs(self, path: str = "/") -> Optional[str]: """Get the NXDL docs for a path in the data.""" + def rst_to_html(rst_text: str) -> str: + """ + Convert reStructuredText to HTML using Docutils. + + Args: + rst_text (str): The input RST text to be converted. + + Returns: + str: The resulting HTML content. + """ + return publish_string(rst_text, writer_name="html").decode("utf-8") + def extract_and_format_docs(elem: ET.Element) -> str: """Get the docstring for a given element in the NDXL tree.""" - docs = elem.findall(f"{self.nxs_namespace}doc") - if docs: - return docs[0].text.strip().replace("\\n", "\n") + docs_elements = elem.findall(f"{self.nxs_namespace}doc") + if docs_elements: + docs = docs_elements[0].text + if self.docs_format != "default": + docs = publish_string(docs, writer_name=self.docs_format).decode( + "utf-8" + ) + print(docs.strip().replace("\\n", "\n")) + return docs.strip().replace("\\n", "\n") return "" docs: str = "" @@ -401,7 +424,7 @@ def add_units_key(dataset, path): f"with the following message: {str(exc)}" ) from exc - def write(self, write_docs=False): + def write(self, write_docs: bool = False, docs_format: str = "default"): """ Writes the NeXus file with previously validated data from the reader with NXDL attrs. @@ -409,6 +432,7 @@ def write(self, write_docs=False): write_docs (bool): Write docs for the individual NeXus concepts as HDF5 attributes. The default is False. """ self.write_docs = write_docs + self.docs_format = docs_format try: self._put_data_into_hdf5() finally: From 1245dbd5241cb36c66c49b0cca209e7cd219a474 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:22:13 +0200 Subject: [PATCH 09/10] allow for different docstyles --- src/pynxtools/dataconverter/writer.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index 1945e5ab6..26a653929 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -19,6 +19,7 @@ # pylint: disable=R0912 +import io import copy import logging import xml.etree.ElementTree as ET @@ -120,6 +121,8 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, docs): - Internal links - External links - compression label""" + + # print(data, grp, entry_name, output_path, path, docs) if "link" in data: file, path = split_link(data, output_path) # generate virtual datasets from slices @@ -256,28 +259,17 @@ def __nxdl_to_attrs(self, path: str = "/") -> dict: def __nxdl_docs(self, path: str = "/") -> Optional[str]: """Get the NXDL docs for a path in the data.""" - def rst_to_html(rst_text: str) -> str: - """ - Convert reStructuredText to HTML using Docutils. - - Args: - rst_text (str): The input RST text to be converted. - - Returns: - str: The resulting HTML content. - """ - return publish_string(rst_text, writer_name="html").decode("utf-8") - def extract_and_format_docs(elem: ET.Element) -> str: """Get the docstring for a given element in the NDXL tree.""" docs_elements = elem.findall(f"{self.nxs_namespace}doc") if docs_elements: docs = docs_elements[0].text if self.docs_format != "default": - docs = publish_string(docs, writer_name=self.docs_format).decode( - "utf-8" - ) - print(docs.strip().replace("\\n", "\n")) + docs = publish_string( + docs, + writer_name=self.docs_format, + settings_overrides={"warning_stream": io.StringIO()}, + ).decode("utf-8") return docs.strip().replace("\\n", "\n") return "" @@ -417,7 +409,7 @@ def add_units_key(dataset, path): dataset.attrs[entry_name[1:]] = data if docs: # Write docs for attributes like __docs - dataset.attrs[f"{entry_name[1:]}__docs"] = docs + dataset.attrs[f"{entry_name[1:]}_docs"] = docs except Exception as exc: raise IOError( f"Unknown error occured writing the path: {path} " From 55ea46a8f2104006dfe3b7ff7a084cda4d9b8147 Mon Sep 17 00:00:00 2001 From: Lukas Pielsticker <50139597+lukaspie@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:58:18 +0200 Subject: [PATCH 10/10] implement functionality for doc with concept path --- src/pynxtools/dataconverter/helpers.py | 58 ++++++++++++++++++++++++++ src/pynxtools/dataconverter/writer.py | 24 ++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/pynxtools/dataconverter/helpers.py b/src/pynxtools/dataconverter/helpers.py index e9b35dffb..87c5691f0 100644 --- a/src/pynxtools/dataconverter/helpers.py +++ b/src/pynxtools/dataconverter/helpers.py @@ -790,6 +790,64 @@ def get_concept_basepath(path: str) -> str: return "/" + "/".join(concept_path) +def get_concept_path_from_elem(elem: ET.Element) -> str: + """ + Process individual XML element to generate the NeXus concept path. + + Output is e.g. "NXexperiment:/NXentry/NXinstrument/NXdetector". + """ + + name = elem.attrib.get("name", "") + elem_type = elem.attrib.get("type", "") + nxdlbase = elem.attrib.get("nxdlbase", "") # .split("/")[-1] + nxdlbase_class = elem.attrib.get("nxdlbase_class", "") + nxdlpath = elem.attrib.get("nxdlpath", "") + category = elem.attrib.get("category", "") + # optional = elem.attrib.get("optional", "") + # extends = elem.attrib.get("extends", "") + + # print(f"tag: {tag}") + # print(f"name: {name}") + # print(f"elem_type: {elem_type}") + # print(f"nxdlbase: {nxdlbase}") + # print(f"nxdlbase_class: {nxdlbase_class}") + # print(f"nxdlpath: {nxdlpath}") + # # print(f"optional: {optional}") + # # print(f"extends: {extends}") + # print("\n") + + concept_path = "" + + if elem.tag.endswith("group"): + if nxdlbase_class and nxdlbase_class == "application": + concept_path += "NXmpes:" + concept_path += nxdlpath # + = f"(elem_type)" + + else: + if nxdlbase: + concept_path += nxdlbase.replace(".nxdl.xml", "").split(os.path.sep)[-1] + concept_path += nxdlpath # + = f"(elem_type)" + + elif elem.tag.endswith("field"): + pass + + elif elem.tag.endswith("attribute"): + pass + elif elem.tag.endswith("definition"): + concept_path += name + + return concept_path + + # if nxdlpath: + # # Split the nxdlpath and construct the string + # path_parts = nxdlpath.strip("/").split("/") + # formatted_path = "/".join(path_parts) + # return f"{formatted_path}({elem_type})" + # else: + # # For elements with no path, return the name and type + # return f"{name}({elem_type})" + + def remove_namespace_from_tag(tag): """Helper function to remove the namespace from an XML tag.""" diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index 26a653929..83e0201e1 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -286,12 +286,34 @@ def extract_and_format_docs(elem: ET.Element) -> str: if app_def_docs: return app_def_docs - _, _, elist = get_inherited_nodes(nxdl_path, elem=copy.deepcopy(self.nxdl_data)) + class_path, nxdl_elem_path, elist = get_inherited_nodes( + nxdl_path, elem=copy.deepcopy(self.nxdl_data) + ) + path_to_check = "/ENTRY/INSTRUMENT/ELECTRONANALYSER/energy_resolution" # /physical_quantity" # == "/ENTRY/SAMPLE/flood_gun_current_env/flood_gun" + + if nxdl_path == path_to_check: + for thing in [ + # path, + # nxdl_path, + # class_path, + # nxdl_elem_path, + # elist + ]: + print(thing, "\n") for elem in elist: + if nxdl_path == path_to_check: + # print(elem.tag) + # print("\t elem.attrib:", elem.attrib.keys()) + + if elem.tag.endswith(("group", "field", "attribute", "definition")): + concept_path = helpers.get_concept_path_from_elem(elem), "\n" + # print(concept_path) + if not docs: # Only use docs from superclasses if they are not extended. docs += extract_and_format_docs(elem) + # print("\n") if not elist: # Handle docs for attributeS