From 97ac2baf8b1e57a1bc42623310e9c9555241cb09 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 15 Mar 2024 17:43:53 +0000
Subject: [PATCH 1/5] revMD17

---
 openqdc/__init__.py                    |   4 +-
 openqdc/datasets/potential/__init__.py |   2 +
 openqdc/datasets/potential/revmd17.py  | 100 +++++++++++++++++++++++++
 openqdc/raws/config_factory.py         |   4 +
 4 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 openqdc/datasets/potential/revmd17.py

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
index b2c953e..b36a191 100644
--- a/openqdc/__init__.py
+++ b/openqdc/__init__.py
@@ -23,7 +23,6 @@
     "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
     "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
     "QM7X": "openqdc.datasets.potential.qm7x",
-    "DES": "openqdc.datasets.interaction.des",
     "NablaDFT": "openqdc.datasets.potential.nabladft",
     "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
     "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
@@ -31,6 +30,7 @@
     "Dummy": "openqdc.datasets.potential.dummy",
     "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
     "PCQM_PM6": "openqdc.datasets.potential.pcqm",
+    "RevMD17": "openqdc.datasets.potential.revmd17",
     "Transition1X": "openqdc.datasets.potential.transition1x",
     "MultixcQM9": "openqdc.datasets.potential.multixcqm9",
     "AVAILABLE_DATASETS": "openqdc.datasets",
@@ -70,7 +70,6 @@ def __dir__():
     from ._version import __version__  # noqa
     from .datasets import AVAILABLE_DATASETS  # noqa
     from .datasets.base import BaseDataset  # noqa
-    from .datasets.interaction.des import DES  # noqa
     from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
     from .datasets.potential.comp6 import COMP6  # noqa
     from .datasets.potential.dummy import Dummy  # noqa
@@ -84,6 +83,7 @@ def __dir__():
     from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
     from .datasets.potential.qm7x import QM7X  # noqa
     from .datasets.potential.qmugs import QMugs  # noqa
+    from .datasets.potential.revmd17 import RevMD17  # noqa
     from .datasets.potential.sn2_rxn import SN2RXN  # noqa
     from .datasets.potential.solvated_peptides import SolvatedPeptides  # noqa
     from .datasets.potential.spice import Spice  # noqa
diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index 1704555..07b8b16 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -11,6 +11,7 @@
 from .pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
 from .qm7x import QM7X  # noqa
 from .qmugs import QMugs  # noqa
+from .revmd17 import RevMD17  # noqa
 from .sn2_rxn import SN2RXN  # noqa
 from .solvated_peptides import SolvatedPeptides  # noqa
 from .spice import Spice  # noqa
@@ -40,4 +41,5 @@
     "transition1x": Transition1X,
     "watercluster": WaterClusters,
     "multixcqm9": MultixcQM9,
+    "revmd17": RevMD17,
 }
diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
new file mode 100644
index 0000000..d45b2f4
--- /dev/null
+++ b/openqdc/datasets/potential/revmd17.py
@@ -0,0 +1,100 @@
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+
+trajectories = {
+    "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+    "rmd17_benzene": "c1ccccc1",
+    "rmd17_malonaldehyde": "C(C=O)C=O",
+    "rmd17_paracetamol": "CC(=O)Nc1ccc(cc1)O",
+    "rmd17_toluene": "Cc1ccccc1",
+    "rmd17_azobenzene": "C1=CC=C(C=C1)N=NC2=CC=CC=C2",
+    "rmd17_ethanol": "CCO",
+    "rmd17_naphthalene": "C1=CC=C2C=CC=CC2=C1",
+    "rmd17_salicylic": "C1=CC=C(C(=C1)C(=O)O)O",
+    "rmd17_uracil": "C1=CNC(=O)NC1=O",
+}  # maps each .npz file stem to the string stored under "name" for every frame of that file
+
+
+def shape_atom_inputs(coords, atom_species):
+    """Flatten (frames, atoms, 3) coords into float32 rows of [species, 0, x, y, z]."""
+    n_frames, _, _ = coords.shape  # unpacking also rejects arrays that are not 3-D
+    species = np.tile(atom_species, n_frames)
+    leading_cols = np.stack((species, np.zeros_like(species)), axis=-1)
+    return np.concatenate((leading_cols, coords.reshape(-1, 3)), axis=-1, dtype=np.float32)
+
+
+def read_npz_entry(filename, root):
+    data = np.load(create_path(filename, root))  # file is <root>/rmd17/npz_data/<filename>.npz
+    nuclear_charges, coords, energies, forces = (
+        data["nuclear_charges"],
+        data["coords"],
+        data["energies"],
+        data["forces"],
+    )
+    frames = coords.shape[0]  # first axis of coords is taken as the number of frames
+    res = dict(
+        name=np.array([trajectories[filename]] * frames),
+        subset=np.array([filename] * frames),
+        energies=energies[:, None].astype(np.float32),
+        forces=forces.reshape(-1, 3, 1),
+        atomic_inputs=shape_atom_inputs(coords, nuclear_charges),
+        n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32),
+    )
+    return res
+
+
+def create_path(filename, root):
+    return p_join(p_join(root, "rmd17", "npz_data"), filename + ".npz")  # <root>/rmd17/npz_data/<filename>.npz
+
+
+class RevMD17(BaseDataset):
+    """RevMD17 dataset: one subset per ``rmd17_*.npz`` trajectory file under ``<root>/rmd17/npz_data``.
+
+    - Benzene: 627000 samples
+    - Uracil: 133000 samples
+    - Naphthalene: 326000 samples
+    - Aspirin: 211000 samples
+    - Salicylic Acid: 320000 samples
+    - Malonaldehyde: 993000 samples
+    - Ethanol: 555000 samples
+    - Toluene: 100000 samples
+
+    Usage
+    ```python
+    from openqdc.datasets import RevMD17
+    dataset = RevMD17()
+    ```
+
+    References: https://arxiv.org/abs/2007.09593
+    """
+    # NOTE(review): docstring lists 8 molecules; `trajectories` has 10 (adds paracetamol, azobenzene) - confirm counts.
+    __name__ = "revmd17"
+
+    __energy_methods__ = [
+        "pbe/vdw-ts",  # MD17
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    __force_methods__ = [
+        "pbe/vdw-ts",  # MD17
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    __energy_unit__ = "kcal/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
+
+    def read_raw_entries(self):
+        entries_list = []
+        for trajectory in trajectories:
+            entries_list.append(read_npz_entry(trajectory, self.root))
+        return entries_list
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index 6205a3e..e9ca176 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -171,6 +171,10 @@ class DataConfigFactory:
         dataset_name="des_s66x8",
         links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
     )
+    revmd17 = dict(
+        dataset_name="revmd17",
+        links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"},
+    )
 
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 

From c659c18e743acab5dd64855880d192a88f14f3d6 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 15 Mar 2024 18:10:14 +0000
Subject: [PATCH 2/5] Atom energies FIX

---
 openqdc/utils/atomization_energies.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/openqdc/utils/atomization_energies.py b/openqdc/utils/atomization_energies.py
index 95464f6..d9ff2b3 100644
--- a/openqdc/utils/atomization_energies.py
+++ b/openqdc/utils/atomization_energies.py
@@ -2313,7 +2313,19 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
 }
 
 
-ISOLATED_ATOM_ENERGIES = {
+def merge(a: dict, b: dict, path=()):
+    """Recursively merge ``b`` into ``a`` in place and return ``a``; conflicting leaves raise ValueError."""
+    for key, incoming in b.items():
+        if key not in a:
+            a[key] = incoming
+        elif isinstance(a[key], dict) and isinstance(incoming, dict):
+            merge(a[key], incoming, [*path, str(key)])  # descend, extending the dotted key path
+        elif a[key] != incoming:
+            raise ValueError("Conflict at " + ".".join([*path, str(key)]))
+    return a
+
+
+ISOLATED_ATOM_ENERGIES_ORIGINAL = {
     # DFT
     "wb97x": {
         "6-31g*": COMP6_1,
@@ -2386,7 +2398,6 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
     "pm6": PM6,
     # FF
     "ttm2.1-f": TTM2,
-    **ISOLATED_ATOM_ENERGIES_ADDON,
 }
-
-# TODO: Talk with ivan about cbs extrapolation from from av[TQ]z. For now this should be ok
+# Recursively merge the add-on energies so nested dicts are extended, not replaced (merge() mutates its first argument).
+ISOLATED_ATOM_ENERGIES = merge(ISOLATED_ATOM_ENERGIES_ORIGINAL, ISOLATED_ATOM_ENERGIES_ADDON)

From 677ce47282a604413ae2d34389b5901175433e09 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 15 Mar 2024 18:17:20 +0000
Subject: [PATCH 3/5] correct type arrays

---
 openqdc/datasets/potential/revmd17.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
index d45b2f4..870ef7f 100644
--- a/openqdc/datasets/potential/revmd17.py
+++ b/openqdc/datasets/potential/revmd17.py
@@ -39,7 +39,7 @@ def read_npz_entry(filename, root):
         name=np.array([trajectories[filename]] * frames),
         subset=np.array([filename] * frames),
         energies=energies[:, None].astype(np.float32),
-        forces=forces.reshape(-1, 3, 1),
+        forces=forces.reshape(-1, 3, 1).astype(np.float32),
         atomic_inputs=shape_atom_inputs(coords, nuclear_charges),
         n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32),
     )

From 8ec7dcebac721a003a761b7f104f8037809a83dc Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Mon, 18 Mar 2024 14:13:42 +0000
Subject: [PATCH 4/5] Fetch RevMD17

---
 openqdc/datasets/potential/revmd17.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
index 870ef7f..85702ed 100644
--- a/openqdc/datasets/potential/revmd17.py
+++ b/openqdc/datasets/potential/revmd17.py
@@ -3,6 +3,7 @@
 import numpy as np
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.raws.fetch import decompress_tar_gz
 
 trajectories = {
     "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
@@ -74,7 +75,7 @@ class RevMD17(BaseDataset):
     __name__ = "revmd17"
 
     __energy_methods__ = [
-        "pbe/vdw-ts",  # MD17
+        "pbe/vdw-ts",
     ]
 
     energy_target_names = [
@@ -82,7 +83,7 @@ class RevMD17(BaseDataset):
     ]
 
     __force_methods__ = [
-        "pbe/vdw-ts",  # MD17
+        "pbe/vdw-ts",
     ]
 
     force_target_names = [
@@ -95,6 +96,7 @@ class RevMD17(BaseDataset):
 
     def read_raw_entries(self):
         entries_list = []
+        decompress_tar_gz(p_join(self.root, "rmd17.tar.bz2"))
         for trajectory in trajectories:
             entries_list.append(read_npz_entry(trajectory, self.root))
         return entries_list

From 33c8d25e5a33f9d1a2ed4bbc72e91f4e9fbcfe05 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Mon, 18 Mar 2024 14:25:27 +0000
Subject: [PATCH 5/5] Compatibility issue fsspec

---
 env.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/env.yml b/env.yml
index d7a19db..16ccc3c 100644
--- a/env.yml
+++ b/env.yml
@@ -7,7 +7,7 @@ dependencies:
   - pip
   - tqdm
   - loguru
-  - fsspec
+  - fsspec <=2023.12.2 # pinned: 2024.3.0 is currently not compatible
   - gcsfs
   - typer
   - prettytable