
String parsing and expression evaluation in json_map reader #252

Open · wants to merge 4 commits into base: master
60 changes: 59 additions & 1 deletion pynxtools/dataconverter/readers/json_map/README.md
@@ -33,7 +33,7 @@ This file is designed to let you fill in the requirements of a NeXus Application
The mapping files will always be based on the Template the dataconverter generates. See above on how to generate a mapping file.
The right hand side values of the Template keys are what you can modify.

Here are the three different ways you can fill the right hand side of the Template keys:
Here are the different ways you can fill the right hand side of the Template keys:
* Write the nested path in your datafile. This is indicated by a leading `/` before the word `entry` to make `/entry/data/current_295C` below.
Example:

@@ -59,5 +59,63 @@ Note: This only works for HDF5 files currently.
"/ENTRY[entry]/DATA[data]/current_300C": {"link": "current.nxs:/entry/data/current_300C"},
```

* Convert a custom date and time string to NeXus-compliant ISO format.
The following entry parses the date and time in a string array `/logs/logs`
with items like `22/10/22 15:18:26.0164 - Starting...`.

```json
"/ENTRY[entry]/end_time": {
"parse_string": "/logs/logs",
"index": "-1",
"regexp": "[0-9.:/]+ [0-9.:/]+",
"dateutil": "dmy",
"timezone": "Europe/Berlin"
}
```

The properties correspond to operations that are applied to input data, in the order given below.
The `datetime`, `dateutil` and `timestamp` properties are mutually exclusive.

"parse_string": (required) Data path of the string (array) like for regular datasets.
"index": (optional) Element index to extract from string array.
The original data must be a string array.
If this option is not specified, the original data must be a singular string.
"regexp": (optional) Match regular expression, keeping only the matching part.
If the expression contains groups, the result will be a space-delimited concatenation of the matching groups.
If the expression does not contain explicit groups, the whole match is used.
"datetime": (optional) Format string for datetime.datetime.strptime function.
If specified, use datetime.datetime.strptime for date parsing.
"dateutil": (optional) Date ordering for the dateutil.parser.parse function.
Possible values are 'YMD', 'MDY', 'DMY' (or lower case).
The dateutil parser recognizes many date and time formats, but may need to be told the order of year, month and day.
If specified, use dateutil.parser.parse for date parsing.
"timestamp": (optional) Interpret the data item as POSIX timestamp.
"timezone": (optional) Specify the time zone if the date-time string does not include a UTC offset.
The time zone must be in a dateutil-supported format, e.g. "Europe/Berlin".
By default, the local time zone is used.

The resulting string replaces the mapped value (dictionary) in the mapping dictionary.
If date parsing is enabled, the resulting string is ISO-formatted as required by the NeXus standard.
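
For the example above, the regular expression keeps `22/10/22 15:18:26.0164` from the log entry, and `dateutil` parsing with the `dmy` ordering should yield an ISO string like `2022-10-22T15:18:26.016400+02:00`.

A fixed-format timestamp can also be parsed with a `datetime` format string instead of `dateutil`. The entry below is a hypothetical sketch (the data path `/logs/acquisition_start` is made up for illustration); it also shows that a regular expression with groups yields the space-delimited concatenation of the matched groups, which is then parsed by `strptime`:

```json
"/ENTRY[entry]/start_time": {
    "parse_string": "/logs/acquisition_start",
    "regexp": "([0-9-]+)T([0-9:.]+)",
    "datetime": "%Y-%m-%d %H:%M:%S.%f",
    "timezone": "Europe/Berlin"
}
```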

* Evaluate a Python expression.
The following entry creates an axis array from scalar values in the input file.

```json
"/ENTRY[entry]/DATA[image]/angular0": {
"eval": "np.linspace(arg0[0], arg1[0], int(arg2[0]))",
"arg0": "/scan1/attrs/ScientaSliceBegin",
"arg1": "/scan1/attrs/ScientaSliceEnd",
"arg2": "/scan1/attrs/ScientaNumSlices"
},
```

The properties of the mapping declare the expression and its arguments.

"eval": (required) Python expression to be evaluated by the `eval` built-in.
The expression can use the built-in and numpy (as np) namespaces
as well as the datasets declared by the `argXxx` values.
"argXxx", where Xxx is an integer number: (optional) path of dataset to read from the input data
and to be used in the expression under the same name.
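
Arguments that are not data paths are converted to numbers where possible (the reader attempts a `float()` conversion), so literal values can be mixed with dataset paths. The entry below is a hypothetical sketch illustrating this; only `/scan1/attrs/ScientaNumSlices` is taken from the example above, while the start and stop values are literals:

```json
"/ENTRY[entry]/DATA[image]/angular0": {
    "eval": "np.linspace(arg0, arg1, int(arg2[0]))",
    "arg0": "-15.0",
    "arg1": "15.0",
    "arg2": "/scan1/attrs/ScientaNumSlices"
}
```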

## Contact person in FAIRmat for this reader
Sherjeel Shabih
129 changes: 129 additions & 0 deletions pynxtools/dataconverter/readers/json_map/reader.py
@@ -17,9 +17,13 @@
#
"""An example reader implementation for the DataConverter."""
from typing import Tuple, Any
import datetime
import dateutil.parser
import dateutil.tz
import json
import pickle
import numpy as np
import re
import xarray
from mergedeep import merge

@@ -152,6 +156,129 @@ def get_map_from_partials(partials, template, data):
    return mapping


def parse_strings(mapping, data):
"""
Parse strings, notably date and time, from custom formats

The function can do the following operations, in the given order, on string data.
The result of each operation is passed on as input of the next one.

1. Extract element from array by index.
2. Match a regular expression.
3. Parse date and time using the datetime or dateutil parser.

The resulting string replaces the mapped value (dictionary) in the mapping dictionary.
If date parsing is enabled, the resulting string is ISO-formatted as required by the NeXus standard.
The operations are selected and tuned by the following dictionary items:

"parse_string": (required) Data path of the string (array) like for regular datasets.
If this item is missing, string parsing is skipped altogether.
"index": (optional) Element index to extract from string array.
The original data must be a string array.
If this option is not specified, the original data must be a singular string.
"regexp": (optional) Match regular expression, keeping only the matching part.
If the expression contains groups, the result will be a space-delimited concatenation of the matching groups.
If the expression does not contain explicit groups, the whole match is used.
"datetime": (optional) Format string for datetime.datetime.strptime function.
The "datetime" and "dateutil" options are mutually exclusive.
"dateutil": (optional) Date ordering for the dateutil.parser.parse function.
Possible values are 'YMD', 'MDY', 'DMY' (or lower case).
The dateutil parser recognizes many date and time formats, but may need to be told the order of year, month and day.
The "datetime" and "dateutil" options are mutually exclusive.
"timestamp": (optional) Interpret the data item as POSIX timestamp.
"timezone": (optional) Specify the time zone if the date-time string does not include a UTC offset.
The time zone must be in a dateutil-supported format, e.g. "Europe/Berlin".
By default, the local time zone is used.
"""

    for key in mapping:
        parse_opts = mapping[key]

        try:
            value = parse_opts["parse_string"]
            if is_path(value):
                value = get_val_nested_keystring_from_dict(value[1:], data)
        except (KeyError, TypeError):
            continue
[Review comment by @sherjeelshabih (Collaborator), Feb 22, 2024, on lines +197 to +202]
Most of the cases here will result in an exception and hit continue. It will be better to replace this try/except block with an if statement. It should perform better in this case.
Suggested change:
        if "parse_string" not in parse_opts:
            continue
        value = parse_opts["parse_string"]
        if is_path(value):
            value = get_val_nested_keystring_from_dict(value[1:], data)

if "index" in parse_opts:
value = value[int(parse_opts["index"])]

if "regexp" in parse_opts:
match = re.match(parse_opts["regexp"], value)
groups = match.groups('')
[Review comment by a Collaborator on the line above]
Just for consistency you can do this:
Suggested change:
            groups = match.groups("")
            if groups:
                value = " ".join(match.groups(""))
            else:
                value = match.group(0)

        if "timezone" in parse_opts:
            tz = dateutil.tz.gettz(parse_opts["timezone"])
        else:
            tz = dateutil.tz.gettz()

        if "datetime" in parse_opts:
            dt = datetime.datetime.strptime(value, parse_opts["datetime"])
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=tz)
            value = dt.isoformat()
        elif "dateutil" in parse_opts:
            order = parse_opts["dateutil"].lower()
            # Derive the yearfirst/dayfirst flags for dateutil from the relative
            # positions of 'y', 'm' and 'd' in the order string (e.g. "dmy").
            y = order.index("y")
            m = order.index("m")
            d = order.index("d")
            dt = dateutil.parser.parse(value, yearfirst=y < m, dayfirst=d < m)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=tz)
            value = dt.isoformat()
        elif "timestamp" in parse_opts:
            dt = datetime.datetime.fromtimestamp(float(value), tz=tz)
            value = dt.isoformat()

        mapping[key] = value


def eval_expressions(mapping, data):
"""
Evaluate Python expressions in mapping.

If a mapping entry contains a dictionary with an `eval` key,
the `eval` expression is evaluated using the Python built-in `eval`.
The expression can use built-in functions, numpy functions in namespace `np`,
and argXxx variables that are defined in the mapping and can refer to dataset paths.

The result of the expression replaces the value of the mapping.

:param mapping: Mapping dictionary
:param data: Data dictionary
:return: None
"""

    for key in mapping:
        eval_args = mapping[key]

        try:
            expression = eval_args["eval"]
        except (KeyError, TypeError):
            continue

        args = {}
        for arg, value in eval_args.items():
            if arg[0:3] == "arg":
                if is_path(value):
                    value = get_val_nested_keystring_from_dict(value[1:], data)
                else:
                    try:
                        value = float(value)
                    except (TypeError, ValueError):
                        # Non-numeric literals (e.g. plain strings) are passed through unchanged.
                        pass

                args[arg] = value

        value = eval(expression, {"np": np}, args)
        mapping[key] = value


class JsonMapReader(BaseReader):
"""A reader that takes a mapping json file and a data file/object to return a template."""

@@ -217,6 +344,8 @@ def read(
)

new_template = Template()
parse_strings(mapping, data)
eval_expressions(mapping, data)
convert_shapes_to_slice_objects(mapping)

fill_documented(new_template, mapping, template, data)