Merge pull request #112 from h2020charisma/nexusformat

NeXus file format
h2020charisma · Apr 23, 2024 · d81c194 · d81c194
2 parents 56ea4bb + fe4911f
commit d81c194
Show file tree

Hide file tree

Showing 13 changed files with 866 additions and 40 deletions.
diff --git a/examples/nexusformat.ipynb b/examples/nexusformat.ipynb
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ramanchada2 as rc2\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SpeMetadataModel(__root__={'Original file': SpeMetadataFieldModel(__root__='PST10_iR785_OP03_8000msx2.txt')})"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "kwargs = {\"sample\":['PST'], \"provider\" : ['FNMT'], \"OP\":['03'], \"laser_wl\":['785']}\n",
+    "spe = rc2.spectrum.from_test_spe(**kwargs)\n",
+    "spe.meta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{   'assay_uuid': None,\n",
+      "    'citation': Citation(year='2022', title='Round Robin 1', owner='FNMT'),\n",
+      "    'effects': [   EffectArray(endpoint='Raman spectrum', endpointtype=None, result=None, conditions=None, idresult=None, endpointGroup=None, endpointSynonyms=None, sampleID=None, signal=ValueArray(unit=None, values=array([1354.36, 1355.04, 1349.8 , ..., 1031.89, 1031.96, 1031.44]), errQualifier=None, errorValue=None), axes={'x': ValueArray(unit='cm-1', values=array([ 120.111,  120.505,  120.899, ..., 2499.82 , 2499.94 , 2499.98 ]), errQualifier=None, errorValue=None)})],\n",
+      "    'interpretationCriteria': None,\n",
+      "    'interpretationResult': None,\n",
+      "    'investigation_uuid': None,\n",
+      "    'owner': SampleLink(substance=Sample(uuid='PST'), company=Company(uuid=None, name='Default company')),\n",
+      "    'parameters': {'E.method': 'Raman spectrometry', 'wavelength': 785},\n",
+      "    'protocol': Protocol(topcategory='P-CHEM', category=EndpointCategory(code='ANALYTICAL_METHODS_SECTION', term=None, title=None), endpoint=None, guideline=None),\n",
+      "    'updated': None,\n",
+      "    'uuid': 'cbf04397-9352-4382-89e1-81be6d99f473'}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "NXroot('spectrum')"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pynanomapper.datamodel.ambit as mx\n",
+    "import numpy as np\n",
+    "from typing import Dict, Optional, Union, List\n",
+    "from pynanomapper.datamodel.nexus_utils import to_nexus\n",
+    "import json \n",
+    "import nexusformat.nexus.tree as nx\n",
+    "import pprint\n",
+    "import uuid\n",
+    "pp = pprint.PrettyPrinter(indent=4)\n",
+    "\n",
+    "\n",
+    "data_dict: Dict[str, mx.ValueArray] = {\n",
+    "    'x': mx.ValueArray(values = spe.x, unit=\"cm-1\")\n",
+    "}\n",
+    "ea = mx.EffectArray(endpoint=\"Raman spectrum\",  \n",
+    "                                signal = mx.ValueArray(values = spe.y),\n",
+    "                                axes = data_dict)\n",
+    "#ea.to_json()\n",
+    "effect_list: List[Union[mx.EffectRecord,mx.EffectArray]] = []\n",
+    "effect_list.append(ea)\n",
+    "papp = mx.ProtocolApplication(protocol=mx.Protocol(topcategory=\"P-CHEM\",category=mx.EndpointCategory(code=\"ANALYTICAL_METHODS_SECTION\")),\n",
+    "                           effects=effect_list)\n",
+    "papp.citation = mx.Citation(owner=\"FNMT\",title=\"Round Robin 1\",year=2022)\n",
+    "papp.parameters = {\"E.method\" : \"Raman spectrometry\" , \"wavelength\" : 785}\n",
+    "\n",
+    "papp.uuid = str(uuid.uuid4())\n",
+    "company=mx.Company(name = \"FNMT\")\n",
+    "substance = mx.Sample(uuid = \"PST\")\n",
+    "papp.owner = mx.SampleLink(substance = substance)\n",
+    "#papp.to_json()\n",
+    "study = mx.Study(study=[papp])\n",
+    "\n",
+    "pp.pprint(papp.__dict__)\n",
+    "#print(papp.to_json())\n",
+    "#print(study.to_json())\n",
+    "nxroot = nx.NXroot()\n",
+    "study.to_nexus(nxroot)\n",
+    "nxroot.save(\"spectrum.nxs\",mode=\"w\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def spe2pap(spe:rc2.spectrum.Spectrum):\n",
+    "    effect_list: List[EffectRecord] = []\n",
+    "    effect_list.append(EffectRecord(endpoint=\"Endpoint 1\", unit=\"Unit 1\", loValue=5.0))\n",
+    "    protocol = Protocol(topcategory=\"P-CHEM\",category=EndpointCategory(code=\"ANALYTICAL_METHODS_SECTION\"))\n",
+    "    papp = ProtocolApplication(protocol=protocol,effects=effect_list)\n",
+    "    return papp\n",
+    "\n",
+    "\n",
+    "spe2pap(spe)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spe.plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spe.write_nexus(\"nexus_test.cha\",\"entry\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "from pynanomapper.datamodel.ambit import EffectRecord, Protocol, EndpointCategory, ProtocolApplication\n",
+    "\n",
+    "effect_list: List[EffectRecord] = []\n",
+    "\n",
+    "effect_list.append(EffectRecord(endpoint=\"Endpoint 1\", unit=\"Unit 1\", loValue=5.0))\n",
+    "effect_list.append(EffectRecord(endpoint=\"Endpoint 2\", unit=\"Unit 2\", loValue=10.0))\n",
+    "\n",
+    "papp = ProtocolApplication(protocol=Protocol(topcategory=\"P-CHEM\",category=EndpointCategory(code=\"XYZ\")),effects=effect_list)\n",
+    "papp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "papp = ProtocolApplication(effects=effect_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "papp.protocol,papp.effects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pynanomapper.datamodel.ambit import  SubstanceRecord,Sample,SampleLink\n",
+    "SubstanceRecord(name=\"xky\")\n",
+    "substance=Sample(uuid=\"xxx\")\n",
+    "SampleLink(substance=substance)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ramanchada2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/ramanchada2/io/HSDS.py b/src/ramanchada2/io/HSDS.py
@@ -13,6 +13,63 @@
 logger = logging.getLogger()
 
 
+# https://manual.nexusformat.org/examples/napi/python.html
+# https://manual.nexusformat.org/examples/python/simple_example_basic/index.html
+@pydantic.validate_arguments(config=dict(arbitrary_types_allowed=True))
+def write_nexus(filename: str,
+                dataset: str,
+                x: npt.NDArray, y: npt.NDArray, meta: Dict, h5module=None):
+    """Append a spectrum to an HDF5 file using a NeXus-style layout.
+
+    The file is opened in append mode and the following structure is ensured:
+
+    * root attribute ``default`` set to `dataset`;
+    * top-level groups ``sample`` and ``instrument``;
+    * group ``<dataset>`` tagged ``NX_class='NXentry'``;
+    * group ``<dataset>/data`` tagged ``NX_class='NXdata'`` with the datasets
+      ``raman_shift`` (from `x`, units ``cm-1``) and ``spectrum``
+      (from `y`, units ``au``).
+
+    Writing is best-effort: a failing step is reported through
+    ``logger.warning`` instead of being raised, e.g. when a dataset with the
+    same name is already present in the file.
+
+    Args:
+        filename: path of the HDF5 file to create or extend.
+        dataset: name of the entry group that will hold the spectrum.
+        x: values stored as ``<dataset>/data/raman_shift``.
+        y: values stored as ``<dataset>/data/spectrum``.
+        meta: metadata dictionary; currently only logged, not stored.
+        h5module: module providing an h5py-compatible ``File``; defaults
+            to ``h5py``.
+    """
+    _h5 = h5module or h5py
+    try:
+        with _h5.File(filename, 'a') as f:
+            f.attrs['default'] = dataset
+            try:
+                f.require_group('sample')
+            except Exception as e:
+                logger.warning(f'cannot create group `sample` in file `{filename}`: {e!r}')
+
+            f.require_group('instrument')
+            # NOTE: metadata is only reported at debug level; it is not written to the file.
+            for key, value in meta.items():
+                logger.debug('%s %s', key, value)
+
+            try:
+                nxentry = f.require_group(dataset)
+                nxentry.attrs["NX_class"] = 'NXentry'
+                nxentry.attrs['default'] = 'data'
+
+                nxdata = nxentry.require_group('data')
+                nxdata.attrs["NX_class"] = 'NXdata'
+                nxdata.attrs['signal'] = 'spectrum'
+                nxdata.attrs['axes'] = 'raman_shift'
+                nxdata.attrs['raman_shift_indices'] = [0,]
+            except Exception as e:
+                # Without the entry/data groups there is nowhere to put the arrays.
+                logger.warning(f'cannot create entry `{dataset}` in file `{filename}`: {e!r}')
+                return
+
+            # (dataset name, values, units, long_name)
+            fields = (
+                ('raman_shift', x, 'cm-1', 'Raman shift (cm-1)'),
+                ('spectrum', y, 'au', 'spectrum'),
+            )
+            for name, values, units, long_name in fields:
+                try:
+                    # The axis is a dataset, not a group: `require_group` does not accept `data`.
+                    ds = nxdata.create_dataset(name, data=values)
+                    ds.attrs['units'] = units
+                    ds.attrs['long_name'] = long_name
+                except Exception as e:
+                    logger.warning(
+                        f'cannot write `{name}` to `{dataset}/data` in file `{filename}`: {e!r}')
+
+    except ValueError as e:
+        logger.warning(repr(e))
+
+
+class DatasetExistsError(Exception):
+    """Raised when the requested dataset name is already present in the target file."""
+
+
 @pydantic.validate_arguments(config=dict(arbitrary_types_allowed=True))
 def write_cha(filename: str,
               dataset: str,
@@ -25,9 +82,9 @@ def write_cha(filename: str,
                 ds = h5.create_dataset(dataset, data=data)
                 ds.attrs.update(meta)
             else:
-                logger.warning(f'dataset `{dataset}` already exists in file `{filename}`')
+                raise DatasetExistsError(f'dataset `{dataset}` already exists in file `{filename}`')
     except ValueError as e:
-        logger.warning(repr(e))
+        raise e
 
 
 def read_cha(filename: str,

diff --git a/src/ramanchada2/misc/utils/__init__.py b/src/ramanchada2/misc/utils/__init__.py
@@ -15,4 +15,5 @@
                        find_closest_pairs,
                        find_closest_pairs_idx,
                        align, align_shift,
+                       match_peaks
                        )
diff --git a/src/ramanchada2/misc/utils/argmin2d.py b/src/ramanchada2/misc/utils/argmin2d.py
@@ -1,7 +1,10 @@
-import pydantic
 import numpy as np
 import numpy.typing as npt
+import pandas as pd
+import pydantic
 from scipy import linalg
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import euclidean_distances
 from typing import List, Union
 
 
@@ -77,3 +80,67 @@ def align_shift(x, y,
         if loss > loss_bak:
             return p_bak
     return p
+
+
+def _min_max_normalize(values_by_key):
+    """Return a dict with the same keys and values min-max scaled to [0, 1].
+
+    A dict with a single entry, or whose values are all equal, is returned
+    unchanged because it has no range to scale by (this avoids a division
+    by zero).
+    """
+    if not values_by_key:
+        raise ValueError('cannot match peaks: empty peak dictionary')
+    lowest = min(values_by_key.values())
+    highest = max(values_by_key.values())
+    if len(values_by_key) < 2 or highest == lowest:
+        return values_by_key
+    return {key: (value - lowest) / (highest - lowest) for key, value in values_by_key.items()}
+
+
+def match_peaks(spe_pos_dict, ref):
+    """Pair the keys of `spe_pos_dict` with the keys of `ref` via K-means clustering.
+
+    Both dictionaries are min-max normalised by value and stacked into one
+    table with columns ``Wavelength`` (the dict key), ``Intensity`` (the
+    normalised value) and ``Source`` (``'spe'`` or ``'reference'``). The rows
+    are clustered on (``Wavelength``, ``Intensity``) with as many clusters as
+    the larger of the two dictionaries. Within every cluster that contains
+    rows from both sources, each ``spe`` key is paired with the closest
+    ``reference`` key of that cluster.
+
+    NOTE(review): presumably keys are peak positions and values are peak
+    intensities -- confirm against callers. No ``random_state`` is passed to
+    ``KMeans``, so cluster assignment may differ between runs.
+
+    Args:
+        spe_pos_dict: dict with numeric keys and values for the spectrum.
+        ref: dict with numeric keys and values for the reference.
+
+    Returns:
+        Tuple ``(x_spe, x_reference, x_distance, df)``: three arrays of equal
+        length, sorted by ``x_spe``, holding the matched spectrum keys, the
+        matched reference keys and their distances, followed by the
+        DataFrame including the ``Cluster`` column.
+
+    Raises:
+        ValueError: if `ref` or `spe_pos_dict` is empty.
+    """
+    normalized_ref = _min_max_normalize(ref)
+    normalized_spe = _min_max_normalize(spe_pos_dict)
+    data_list = [
+        {'Wavelength': key, 'Intensity': value, 'Source': 'spe'} for key, value in normalized_spe.items()
+    ] + [
+        {'Wavelength': key, 'Intensity': value, 'Source': 'reference'} for key, value in normalized_ref.items()
+    ]
+
+    # Create a DataFrame from the list of dictionaries
+    df = pd.DataFrame(data_list)
+    feature_matrix = df[['Wavelength', 'Intensity']].to_numpy()
+
+    # One cluster per peak of the larger set, so every peak can get its own cluster.
+    kmeans = KMeans(n_clusters=max(len(ref), len(spe_pos_dict)))
+    kmeans.fit(feature_matrix)
+    df['Cluster'] = kmeans.labels_
+
+    matched_spe = []
+    matched_ref = []
+    matched_dist = []
+    for _, group in df.groupby('Cluster'):
+        spe_positions = group.loc[group["Source"] == "spe"]["Wavelength"].values
+        ref_positions = group.loc[group["Source"] == "reference"]["Wavelength"].values
+        # Only clusters holding both a spectrum peak and a reference peak yield matches.
+        if len(spe_positions) == 0 or len(ref_positions) == 0:
+            continue
+        for w_spe in spe_positions:
+            closest_ref = None
+            e_min = None
+            for w_ref in ref_positions:
+                e = euclidean_distances(w_spe.reshape(-1, 1), w_ref.reshape(-1, 1))[0][0]
+                if (e_min is None) or (e < e_min):
+                    closest_ref = w_ref
+                    e_min = e
+            matched_spe.append(w_spe)
+            matched_ref.append(closest_ref)
+            matched_dist.append(e_min)
+
+    x_spe = np.array(matched_spe, dtype=float)
+    x_reference = np.array(matched_ref, dtype=float)
+    x_distance = np.array(matched_dist, dtype=float)
+    sort_indices = np.argsort(x_spe)
+    return (x_spe[sort_indices], x_reference[sort_indices], x_distance[sort_indices], df)