valence-labs · FNTwin · Mar 20, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 15, 2024
diff --git a/env.yml b/env.yml
@@ -7,7 +7,7 @@ dependencies:
   - pip
   - tqdm
   - loguru
-  - fsspec
+  - fsspec <=2023.12.2 # 2024.3.0 is not compatible right now
   - gcsfs
   - typer
   - prettytable

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
@@ -23,14 +23,14 @@
     "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
     "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
     "QM7X": "openqdc.datasets.potential.qm7x",
-    "DES": "openqdc.datasets.interaction.des",
     "NablaDFT": "openqdc.datasets.potential.nabladft",
     "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
     "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
     "TMQM": "openqdc.datasets.potential.tmqm",
     "Dummy": "openqdc.datasets.potential.dummy",
     "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
     "PCQM_PM6": "openqdc.datasets.potential.pcqm",
+    "RevMD17": "openqdc.datasets.potential.revmd17",
     "Transition1X": "openqdc.datasets.potential.transition1x",
     "MultixcQM9": "openqdc.datasets.potential.multixcqm9",
     "AVAILABLE_DATASETS": "openqdc.datasets",
@@ -70,7 +70,6 @@ def __dir__():
     from ._version import __version__  # noqa
     from .datasets import AVAILABLE_DATASETS  # noqa
     from .datasets.base import BaseDataset  # noqa
-    from .datasets.interaction.des import DES  # noqa
     from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
     from .datasets.potential.comp6 import COMP6  # noqa
     from .datasets.potential.dummy import Dummy  # noqa
@@ -84,6 +83,7 @@ def __dir__():
     from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
     from .datasets.potential.qm7x import QM7X  # noqa
     from .datasets.potential.qmugs import QMugs  # noqa
+    from .datasets.potential.revmd17 import RevMD17  # noqa
     from .datasets.potential.sn2_rxn import SN2RXN  # noqa
     from .datasets.potential.solvated_peptides import SolvatedPeptides  # noqa
     from .datasets.potential.spice import Spice  # noqa

diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
@@ -11,6 +11,7 @@
 from .pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
 from .qm7x import QM7X  # noqa
 from .qmugs import QMugs  # noqa
+from .revmd17 import RevMD17  # noqa
 from .sn2_rxn import SN2RXN  # noqa
 from .solvated_peptides import SolvatedPeptides  # noqa
 from .spice import Spice  # noqa
@@ -40,4 +41,5 @@
     "transition1x": Transition1X,
     "watercluster": WaterClusters,
     "multixcqm9": MultixcQM9,
+    "revmd17": RevMD17,
 }
diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
@@ -0,0 +1,102 @@
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.raws.fetch import decompress_tar_gz
+
+# Maps each trajectory key (the stem of its ``.npz`` file) to the SMILES string
+# of the corresponding molecule. The key is stored as the entry's ``subset`` and
+# the SMILES string as its ``name``.
+trajectories = {
+    "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
+    "rmd17_benzene": "c1ccccc1",
+    "rmd17_malonaldehyde": "C(C=O)C=O",
+    "rmd17_paracetamol": "CC(=O)Nc1ccc(cc1)O",
+    "rmd17_toluene": "Cc1ccccc1",
+    "rmd17_azobenzene": "C1=CC=C(C=C1)N=NC2=CC=CC=C2",
+    "rmd17_ethanol": "CCO",
+    "rmd17_naphthalene": "C1=CC=C2C=CC=CC2=C1",
+    "rmd17_salicylic": "C1=CC=C(C(=C1)C(=O)O)O",
+    "rmd17_uracil": "C1=CNC(=O)NC1=O",
+}
+
+
+def shape_atom_inputs(coords, atom_species):
+    """Flatten per-frame coordinates into one float32 row per atom.
+
+    Args:
+        coords: Array with exactly three dimensions. The first dimension is
+            used as the frame count and the whole array is flattened into
+            rows of three values.
+        atom_species: Per-atom values for a single frame (the caller passes
+            the nuclear charges); the sequence is repeated once per frame.
+
+    Returns:
+        A float32 array whose rows are ``[species, 0, c0, c1, c2]``. The
+        constant zero column presumably stands for the atomic charge --
+        TODO confirm against the dataset schema.
+    """
+    # The three-way unpack doubles as a check that coords is three-dimensional.
+    n_frames, _n_atoms, _xyz = coords.shape
+    flat_positions = coords.reshape(-1, 3)
+    species_column = np.tile(atom_species, n_frames)
+    zero_column = np.zeros_like(species_column)
+    species_and_zero = np.stack((species_column, zero_column), axis=-1)
+    return np.concatenate((species_and_zero, flat_positions), axis=-1, dtype=np.float32)
+
+
+def read_npz_entry(filename, root):
+    """Load one trajectory archive and convert it into a dictionary of arrays.
+
+    Args:
+        filename: Trajectory key; must be a key of ``trajectories`` and the
+            stem of the ``.npz`` file resolved by ``create_path``.
+        root: Directory under which the extracted archive lives.
+
+    Returns:
+        A dict with the keys ``name``, ``subset``, ``energies``, ``forces``,
+        ``atomic_inputs`` and ``n_atoms``. ``name``, ``subset`` and
+        ``n_atoms`` hold one value per frame.
+
+    Raises:
+        KeyError: If ``filename`` is not in ``trajectories`` or the archive
+            lacks one of the expected arrays.
+    """
+    # np.load returns an NpzFile that keeps the archive open; the context
+    # manager closes it once the arrays have been read into memory.
+    with np.load(create_path(filename, root)) as data:
+        nuclear_charges = data["nuclear_charges"]
+        coords = data["coords"]
+        energies = data["energies"]
+        forces = data["forces"]
+
+    frames = coords.shape[0]
+    res = dict(
+        name=np.array([trajectories[filename]] * frames),
+        subset=np.array([filename] * frames),
+        # Adds a trailing axis; assumes energies is one value per frame -- TODO confirm.
+        energies=energies[:, None].astype(np.float32),
+        # One (3, 1) force block per atom, flattened over all frames.
+        forces=forces.reshape(-1, 3, 1).astype(np.float32),
+        atomic_inputs=shape_atom_inputs(coords, nuclear_charges),
+        n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32),
+    )
+    return res
+
+
+def create_path(filename, root):
+    """Return the path of the ``.npz`` file for ``filename`` under ``root``.
+
+    The file is looked up in the ``rmd17/npz_data`` sub-directory of ``root``.
+    """
+    npz_name = filename + ".npz"
+    data_dir = p_join(root, "rmd17", "npz_data")
+    return p_join(data_dir, npz_name)
+
+
+class RevMD17(BaseDataset):
+    """
+    Revised MD17 (rMD17) dataset: one trajectory per molecule, each frame
+    carrying an energy and per-atom forces at the PBE/vdW-TS level.
+
+    Samples per trajectory:
+    - Benzene: 627000 samples
+    - Uracil: 133000 samples
+    - Naphthalene: 326000 samples
+    - Aspirin: 211000 samples
+    - Salicylic Acid: 320000 samples
+    - Malonaldehyde: 993000 samples
+    - Ethanol: 555000 samples
+    - Toluene: 100000 samples
+
+    Paracetamol and azobenzene are also read (see ``trajectories``), but no
+    sample count is listed for them here.
+
+    Usage
+    ```python
+    from openqdc.datasets import RevMD17
+    dataset = RevMD17()
+    ```
+
+    References:
+    - https://arxiv.org/abs/2007.09593
+    """
+
+    __name__ = "revmd17"
+
+    # Level of theory and display name for the single energy target.
+    __energy_methods__ = [
+        "pbe/vdw-ts",
+    ]
+
+    energy_target_names = [
+        "PBE-TS Energy",
+    ]
+
+    # Level of theory and display name for the single force target.
+    __force_methods__ = [
+        "pbe/vdw-ts",
+    ]
+
+    force_target_names = [
+        "PBE-TS Gradient",
+    ]
+
+    # Units declared for the raw data.
+    __energy_unit__ = "kcal/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
+
+    def read_raw_entries(self):
+        """Extract the raw archive and return one entry dict per trajectory."""
+        # NOTE(review): the archive is a .tar.bz2 but is passed to a helper
+        # named decompress_tar_gz -- confirm the helper handles bz2.
+        archive_path = p_join(self.root, "rmd17.tar.bz2")
+        decompress_tar_gz(archive_path)
+        return [read_npz_entry(trajectory, self.root) for trajectory in trajectories]
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
@@ -171,6 +171,10 @@ class DataConfigFactory:
         dataset_name="des_s66x8",
         links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
     )
+    revmd17 = dict(
+        dataset_name="revmd17",
+        links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"},
+    )
 
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 

diff --git a/openqdc/utils/atomization_energies.py b/openqdc/utils/atomization_energies.py
@@ -2313,7 +2313,19 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
 }
 
 
-ISOLATED_ATOM_ENERGIES = {
+def merge(a: dict, b: dict, path=None) -> dict:
+    """Recursively merge ``b`` into ``a`` in place and return ``a``.
+
+    Keys missing from ``a`` are copied over from ``b``. When both sides hold
+    a dict under the same key, the two dicts are merged recursively instead
+    of one replacing the other. Equal values are left untouched.
+
+    Args:
+        a: Destination dictionary; it is modified in place.
+        b: Dictionary whose entries are merged into ``a``. Values are
+            inserted by reference, not copied.
+        path: Keys leading to the current nesting level, used only to build
+            the error message. Callers normally leave it unset.
+
+    Returns:
+        The same object as ``a``.
+
+    Raises:
+        ValueError: If a key exists on both sides with different values that
+            are not both dicts.
+    """
+    # None instead of a mutable [] default, so no list is shared across calls.
+    path = [] if path is None else path
+    for key, b_value in b.items():
+        if key not in a:
+            a[key] = b_value
+            continue
+        a_value = a[key]
+        if isinstance(a_value, dict) and isinstance(b_value, dict):
+            merge(a_value, b_value, path + [str(key)])
+        elif a_value != b_value:
+            raise ValueError("Conflict at " + ".".join(path + [str(key)]))
+    return a
+
+
+ISOLATED_ATOM_ENERGIES_ORIGINAL = {
     # DFT
     "wb97x": {
         "6-31g*": COMP6_1,
@@ -2386,7 +2398,6 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
     "pm6": PM6,
     # FF
     "ttm2.1-f": TTM2,
-    **ISOLATED_ATOM_ENERGIES_ADDON,
 }
-
-# TODO: Talk with ivan about cbs extrapolation from from av[TQ]z. For now this should be ok
+# Merge the add-on energies into the main table key by key: nested dictionaries
+# are combined recursively instead of being replaced, and conflicting values raise.
+# NOTE: merge() updates and returns its first argument, so ISOLATED_ATOM_ENERGIES
+# and ISOLATED_ATOM_ENERGIES_ORIGINAL refer to the same dictionary object.
+ISOLATED_ATOM_ENERGIES = merge(ISOLATED_ATOM_ENERGIES_ORIGINAL, ISOLATED_ATOM_ENERGIES_ADDON)
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,7 +7,7 @@ dependencies: @@
       - pip
       - tqdm
       - loguru
-      - fsspec
+      - fsspec <=2023.12.2 # 2024.3.0 is not compatible right now
       - gcsfs
       - typer
       - prettytable
@@ Expand Down @@