FAIRmat-NFDI · lukaspie · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024
diff --git a/src/pynxtools/dataconverter/readers/multi/reader.py b/src/pynxtools/dataconverter/readers/multi/reader.py
@@ -140,6 +140,7 @@ def resolve_special_keys(
     key: str,
     value: Any,
     optional_groups_to_remove: List[str],
+    optional_groups_to_remove_from_links: List[str],
     callbacks: ParseJsonCallbacks,
     suppress_warning: bool = False,
 ) -> None:
@@ -188,6 +189,42 @@ def parse_config_value(value: str) -> Tuple[str, Any]:
             return ("", value)
         return prefixes[0]
 
+    def extract_inner_path(key):
+        """
+        Convert a Nexus concept path into a normalized HDF5 target path.
+
+        Bracketed names replace the concept name in front of them; a
+        segment without brackets is kept as-is unless it is written in
+        upper case (a bare concept name), in which case it is dropped.
+
+        Args:
+            key (str): The Nexus concept path, containing bracketed
+            sections for variable parts.
+
+        Returns:
+            str: The normalized HDF5 target path, without a leading slash.
+
+        Example:
+            For a Nexus path like
+            'ENTRY[entry_name]/GROUP[group]/DATA[data]/attribute',
+            the function will return a normalized HDF5 path like
+            'entry_name/group/data/attribute'.
+        """
+        # Drop only a literal leading "ENTRY". str.lstrip("ENTRY") would strip
+        # any run of the characters E, N, T, R, Y and mangle e.g. "Energy".
+        if key.startswith("ENTRY"):
+            key = key[len("ENTRY") :]
+
+        # Each match is a (bracketed_name, plain_segment) pair, one side empty
+        parts = re.findall(r"\[([^\]]+)\]|([^/\[]+)", key)
+
+        return "/".join(
+            inner or plain
+            for inner, plain in parts
+            # keep bracketed names and plain segments that are not upper case
+            if inner or (plain and not plain.isupper())
+        )
+
     # Handle non-keyword values
     if not isinstance(value, str) or "@" not in str(value):
         new_entry_dict[key] = value
@@ -212,12 +249,38 @@ def parse_config_value(value: str) -> Tuple[str, Any]:
         if new_entry_dict[key] is not None:
             break
 
+    if isinstance(new_entry_dict[key], dict) and "link" in new_entry_dict[key]:
+        keys_as_hdf5_paths = {extract_inner_path(key) for key in new_entry_dict.keys()}
+
+        link_target = new_entry_dict[key]["link"]
+
+        if link_target.lstrip("/") not in keys_as_hdf5_paths:
+            if value.startswith("!"):
+                group_to_delete = key.rsplit("/", 1)[0]
+                if not suppress_warning:
+                    logger.info(
+                        f"Main element {key} not provided (broken link  at {link_target}). "
+                        f"Removing the parent group {group_to_delete}."
+                    )
+                optional_groups_to_remove_from_links.append(group_to_delete)
+
+            if not suppress_warning:
+                logger.info(
+                    f"There was no target at {link_target} "
+                    f"for the optional link defined for {key}. "
+                    f"Removing the link."
+                )
+            del new_entry_dict[key]
+
+            return
+
     if value.startswith("!") and new_entry_dict[key] is None:
         group_to_delete = key.rsplit("/", 1)[0]
-        logger.info(
-            f"Main element {key} not provided. "
-            f"Removing the parent group {group_to_delete}."
-        )
+        if not suppress_warning:
+            logger.info(
+                f"Main element {key} not provided. "
+                f"Removing the parent group {group_to_delete}."
+            )
         optional_groups_to_remove.append(group_to_delete)
         return
 
@@ -256,28 +319,54 @@ def has_missing_main(key: str) -> bool:
                 return True
         return False
 
-    def dict_sort_key(keyval: Tuple[str, Any]) -> bool:
+    def dict_sort_key(keyval: Tuple[str, Any]) -> Tuple[int, str]:
         """
-        The function to sort the dict by.
-        This just sets False for keys starting with "!" to put them at the beginning.
-        Besides, pythons sorted is stable, so this will keep the order of the keys
-        which have the same sort key.
+        Build the sort key ``(rank, key)`` for one ``(key, value)`` item:
+        - rank 2 (last): string values starting with "@link:" or "!@link:",
+          so links are processed after the entries they may target.
+        - rank 0 (first): other string values starting with "!".
+        - rank 1 (middle): everything else, including non-string values.
         """
-        if isinstance(keyval[1], str):
-            return not keyval[1].startswith("!")
-        return True
+        rank, text = 1, keyval[1]  # Middle unless a prefix below matches
+        if isinstance(text, str):
+            if text.startswith(("!@link:", "@link:")):
+                rank = 2  # Last
+            elif text.startswith("!"):
+                rank = 0  # First
+        return (rank, keyval[0])
+
+    def remove_keys_matching_prefixes(d: dict, prefixes: List[str]) -> dict:
+        """
+        Return a copy of ``d`` without the keys that start with any given prefix.
+
+        Matching is a plain ``str.startswith`` test, so a prefix also matches
+        sibling paths that merely share its leading characters.
+
+        Args:
+            d (dict): The original dictionary; it is not modified.
+            prefixes (List[str]): Prefixes to test each key against.
+
+        Returns:
+            dict: A new dictionary with the matching keys removed.
+        """
+        # str.startswith accepts a tuple, so every prefix is tested in one call
+        unwanted = tuple(prefixes)
+        return {key: value for key, value in d.items() if not key.startswith(unwanted)}
 
     if callbacks is None:
         # Use default callbacks if none are explicitly provided
         callbacks = ParseJsonCallbacks()
 
     optional_groups_to_remove: List[str] = []
+    optional_groups_to_remove_from_links: List[str] = []
     new_entry_dict = {}
+
+    # Process '!...' keys first, but optional links last
+    sorted_keys = dict(sorted(config_dict.items(), key=dict_sort_key))
+
     for entry_name in entry_names:
         callbacks.entry_name = entry_name
 
-        # Process '!...' keys first
-        sorted_keys = dict(sorted(config_dict.items(), key=dict_sort_key))
         for key in sorted_keys:
             value = config_dict[key]
             key = key.replace("/ENTRY/", f"/ENTRY[{entry_name}]/")
@@ -295,6 +384,7 @@ def dict_sort_key(keyval: Tuple[str, Any]) -> bool:
                         k,
                         v,
                         optional_groups_to_remove,
+                        optional_groups_to_remove_from_links,
                         callbacks,
                         suppress_warning,
                     )
@@ -306,11 +396,15 @@ def dict_sort_key(keyval: Tuple[str, Any]) -> bool:
                 key,
                 value,
                 optional_groups_to_remove,
+                optional_groups_to_remove_from_links,
                 callbacks,
                 suppress_warning,
             )
 
-    return new_entry_dict
+    # Drop every key under a group whose "!"-prefixed link had no target
+    return remove_keys_matching_prefixes(
+        new_entry_dict, optional_groups_to_remove_from_links
+    )
 
 
 class MultiFormatReader(BaseReader):