FAIRmat-NFDI · lukaspie · Aug 23, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 30, 2024
diff --git a/src/pynxtools/dataconverter/convert.py b/src/pynxtools/dataconverter/convert.py
@@ -239,8 +239,10 @@ def convert(
     helpers.add_default_root_attributes(data=data, filename=os.path.basename(output))
 
     write_docs = kwargs.pop("write_docs", False)
+    docs_format = kwargs.pop("docs_format", None)
     Writer(data=data, nxdl_f_path=nxdl_f_path, output_path=output).write(
-        write_docs=write_docs
+        write_docs=write_docs,
+        docs_format=docs_format,
     )
 
     logger.info(f"The output file generated: {output}.")
@@ -360,7 +362,15 @@ def main_cli():
     default=False,
     help="Write docs for the individual NeXus concepts as HDF5 attributes.",
 )
-# pylint: disable=too-many-arguments
+@click.option(
+    "--docs-format",
+    type=click.Choice(["default", "html", "html5", "xml", "pseudoxml"]),
+    default=None,
+    help=(
+        "Optionally specify the format in which the docs for the individual NeXus concepts is generated. "
+        "By default, the docs are formatted as in the NXDL file."
+    ),
+)
 def convert_cli(
     files: Tuple[str, ...],
     input_file: Tuple[str, ...],
@@ -374,6 +384,7 @@ def convert_cli(
     config_file: str,
     fail: bool,
     write_docs: bool,
+    docs_format: str,
     **kwargs,
 ):
     """This command allows you to use the converter functionality of the dataconverter."""
@@ -403,6 +414,15 @@ def convert_cli(
 
     if write_docs:
         kwargs["write_docs"] = write_docs
+        if not docs_format:
+            kwargs["docs_format"] = "default"
+        else:
+            kwargs["docs_format"] = docs_format
+
+    elif docs_format is not None:
+        raise click.UsageError(
+            "Error: --docs-format can only be used with --write-docs."
+        )
 
     file_list = []
     for file in files:

diff --git a/src/pynxtools/dataconverter/helpers.py b/src/pynxtools/dataconverter/helpers.py
@@ -790,6 +790,64 @@ def get_concept_basepath(path: str) -> str:
     return "/" + "/".join(concept_path)
 
 
+def get_concept_path_from_elem(elem: ET.Element) -> str:
+    """
+    Process individual XML element to generate the NeXus concept path.
+
+    Only ``group`` and ``definition`` elements are resolved; every other tag
+    (including ``field`` and ``attribute``) yields an empty string.
+
+    For groups, the name of the defining NXDL file is taken from the file name
+    in the ``nxdlbase`` attribute (without the ``.nxdl.xml`` suffix) and the
+    ``nxdlpath`` attribute is appended to it. If ``nxdlbase_class`` is
+    ``"application"``, the two parts are separated by a colon.
+    If no file name can be derived from ``nxdlbase``, only ``nxdlpath`` is
+    returned for groups.
+
+    Output is e.g. "NXexperiment:/NXentry/NXinstrument/NXdetector" for a group
+    of an application definition.
+
+    Note:
+        ``nxdlbase`` is interpreted as a file path of the running platform.
+
+    Args:
+        elem (ET.Element): Element of an NXDL tree. The ``nxdlbase``,
+            ``nxdlbase_class`` and ``nxdlpath`` attributes are read if present
+            (presumably set while resolving inheritance; verify with caller).
+
+    Returns:
+        str: The concept path, or an empty string if none can be built for
+        this element.
+    """
+    tag = elem.tag
+
+    if tag.endswith("definition"):
+        return elem.attrib.get("name", "")
+
+    if not tag.endswith("group"):
+        # Fields and attributes (and any other tag) are not resolved yet.
+        return ""
+
+    # Attributes are optional; missing ones are treated as empty strings.
+    nxdlbase = elem.attrib.get("nxdlbase", "")
+    nxdlpath = elem.attrib.get("nxdlpath", "")
+    is_application = elem.attrib.get("nxdlbase_class", "") == "application"
+
+    # Derive the definition name from the NXDL file name instead of hard-coding
+    # a single application definition. os.path.basename copes with the path
+    # separators of the current platform.
+    suffix = ".nxdl.xml"
+    definition_name = os.path.basename(nxdlbase)
+    if definition_name.endswith(suffix):
+        definition_name = definition_name[: -len(suffix)]
+
+    if is_application and definition_name:
+        # Application definitions are separated from the path by a colon.
+        definition_name += ":"
+
+    return definition_name + nxdlpath
+
+
 def remove_namespace_from_tag(tag):
     """Helper function to remove the namespace from an XML tag."""
 

diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py
@@ -19,13 +19,15 @@
 
 # pylint: disable=R0912
 
+import io
 import copy
 import logging
 import xml.etree.ElementTree as ET
 from typing import Optional
 
 import h5py
 import numpy as np
+from docutils.core import publish_string
 
 from pynxtools.dataconverter import helpers
 from pynxtools.dataconverter.exceptions import InvalidDictProvided
@@ -119,6 +121,8 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, docs):
     - Internal links
     - External links
     - compression label"""
+
+    # print(data, grp, entry_name, output_path, path, docs)
     if "link" in data:
         file, path = split_link(data, output_path)
     # generate virtual datasets from slices
@@ -210,7 +214,10 @@ class Writer:
     """
 
     def __init__(
-        self, data: dict = None, nxdl_f_path: str = None, output_path: str = None
+        self,
+        data: dict = None,
+        nxdl_f_path: str = None,
+        output_path: str = None,
     ):
         """Constructs the necessary objects required by the Writer class."""
         self.data = data
@@ -221,6 +228,7 @@ def __init__(
         self.nxs_namespace = get_namespace(self.nxdl_data)
 
         self.write_docs: bool = False
+        self.docs_format: str = "default"
 
     def __nxdl_to_attrs(self, path: str = "/") -> dict:
         """
@@ -253,9 +261,16 @@ def __nxdl_docs(self, path: str = "/") -> Optional[str]:
 
         def extract_and_format_docs(elem: ET.Element) -> str:
             """Get the docstring for a given element in the NDXL tree."""
-            docs = elem.findall(f"{self.nxs_namespace}doc")
-            if docs:
-                return docs[0].text.strip().replace("\\n", "\n")
+            docs_elements = elem.findall(f"{self.nxs_namespace}doc")
+            if docs_elements:
+                docs = docs_elements[0].text
+                if self.docs_format != "default":
+                    docs = publish_string(
+                        docs,
+                        writer_name=self.docs_format,
+                        settings_overrides={"warning_stream": io.StringIO()},
+                    ).decode("utf-8")
+                return docs.strip().replace("\\n", "\n")
             return ""
 
         docs: str = ""
@@ -271,12 +286,34 @@ def extract_and_format_docs(elem: ET.Element) -> str:
             if app_def_docs:
                 return app_def_docs
 
-        _, _, elist = get_inherited_nodes(nxdl_path, elem=copy.deepcopy(self.nxdl_data))
+        class_path, nxdl_elem_path, elist = get_inherited_nodes(
+            nxdl_path, elem=copy.deepcopy(self.nxdl_data)
+        )
 
+        path_to_check = "/ENTRY/INSTRUMENT/ELECTRONANALYSER/energy_resolution"  # /physical_quantity" # == "/ENTRY/SAMPLE/flood_gun_current_env/flood_gun"
+
+        if nxdl_path == path_to_check:
+            for thing in [
+                # path,
+                # nxdl_path,
+                # class_path,
+                # nxdl_elem_path,
+                # elist
+            ]:
+                print(thing, "\n")
         for elem in elist:
+            if nxdl_path == path_to_check:
+                # print(elem.tag)
+                # print("\t elem.attrib:", elem.attrib.keys())
+
+                if elem.tag.endswith(("group", "field", "attribute", "definition")):
+                    concept_path = helpers.get_concept_path_from_elem(elem), "\n"
+                #     print(concept_path)
+
             if not docs:
                 # Only use docs from superclasses if they are not extended.
                 docs += extract_and_format_docs(elem)
+        # print("\n")
 
         if not elist:
             # Handle docs for attributeS
@@ -394,21 +431,22 @@ def add_units_key(dataset, path):
                     dataset.attrs[entry_name[1:]] = data
                     if docs:
                         # Write docs for attributes like <attr>__docs
-                        dataset.attrs[f"{entry_name[1:]}__docs"] = docs
+                        dataset.attrs[f"{entry_name[1:]}_docs"] = docs
             except Exception as exc:
                 raise IOError(
                     f"Unknown error occured writing the path: {path} "
                     f"with the following message: {str(exc)}"
                 ) from exc
 
-    def write(self, write_docs=False):
+    def write(self, write_docs: bool = False, docs_format: str = "default"):
         """
         Writes the NeXus file with previously validated data from the reader with NXDL attrs.
 
         Args:
             write_docs (bool): Write docs for the individual NeXus concepts as HDF5 attributes. The default is False.
         """
         self.write_docs = write_docs
+        self.docs_format = docs_format
         try:
             self._put_data_into_hdf5()
         finally: