From 97ac2baf8b1e57a1bc42623310e9c9555241cb09 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 15 Mar 2024 17:43:53 +0000 Subject: [PATCH 1/5] revMD17 --- openqdc/__init__.py | 4 +- openqdc/datasets/potential/__init__.py | 2 + openqdc/datasets/potential/revmd17.py | 100 +++++++++++++++++++++++++ openqdc/raws/config_factory.py | 4 + 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 openqdc/datasets/potential/revmd17.py diff --git a/openqdc/__init__.py b/openqdc/__init__.py index b2c953e..b36a191 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -23,7 +23,6 @@ "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali", "SN2RXN": "openqdc.datasets.potential.sn2_rxn", "QM7X": "openqdc.datasets.potential.qm7x", - "DES": "openqdc.datasets.interaction.des", "NablaDFT": "openqdc.datasets.potential.nabladft", "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides", "WaterClusters": "openqdc.datasets.potential.waterclusters3_30", @@ -31,6 +30,7 @@ "Dummy": "openqdc.datasets.potential.dummy", "PCQM_B3LYP": "openqdc.datasets.potential.pcqm", "PCQM_PM6": "openqdc.datasets.potential.pcqm", + "RevMD17": "openqdc.datasets.potential.revmd17", "Transition1X": "openqdc.datasets.potential.transition1x", "MultixcQM9": "openqdc.datasets.potential.multixcqm9", "AVAILABLE_DATASETS": "openqdc.datasets", @@ -70,7 +70,6 @@ def __dir__(): from ._version import __version__ # noqa from .datasets import AVAILABLE_DATASETS # noqa from .datasets.base import BaseDataset # noqa - from .datasets.interaction.des import DES # noqa from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X # noqa from .datasets.potential.comp6 import COMP6 # noqa from .datasets.potential.dummy import Dummy # noqa @@ -84,6 +83,7 @@ def __dir__(): from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa from .datasets.potential.qm7x import QM7X # noqa from .datasets.potential.qmugs import QMugs # noqa + from .datasets.potential.revmd17 import RevMD17 # noqa from .datasets.potential.sn2_rxn import SN2RXN # noqa from .datasets.potential.solvated_peptides import SolvatedPeptides # noqa from .datasets.potential.spice import Spice # noqa diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index 1704555..07b8b16 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -11,6 +11,7 @@ from .pcqm import PCQM_B3LYP, PCQM_PM6 # noqa from .qm7x import QM7X # noqa from .qmugs import QMugs # noqa +from .revmd17 import RevMD17 # noqa from .sn2_rxn import SN2RXN # noqa from .solvated_peptides import SolvatedPeptides # noqa from .spice import Spice # noqa @@ -40,4 +41,5 @@ "transition1x": Transition1X, "watercluster": WaterClusters, "multixcqm9": MultixcQM9, + "revmd17": RevMD17, } diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py new file mode 100644 index 0000000..d45b2f4 --- /dev/null +++ b/openqdc/datasets/potential/revmd17.py @@ -0,0 +1,100 @@ +from os.path import join as p_join + +import numpy as np + +from openqdc.datasets.base import BaseDataset + +trajectories = { + "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O", + "rmd17_benzene": "c1ccccc1", + "rmd17_malonaldehyde": "C(C=O)C=O", + "rmd17_paracetamol": "CC(=O)Nc1ccc(cc1)O", + "rmd17_toluene": "Cc1ccccc1", + "rmd17_azobenzene": "C1=CC=C(C=C1)N=NC2=CC=CC=C2", + "rmd17_ethanol": "CCO", + "rmd17_naphthalene": "C1=CC=C2C=CC=CC2=C1", + "rmd17_salicylic": "C1=CC=C(C(=C1)C(=O)O)O", + "rmd17_uracil": "C1=CNC(=O)NC1=O", +} + + +def shape_atom_inputs(coords, atom_species): + reshaped_coords = coords.reshape(-1, 3) + frame, atoms, _ = coords.shape + z = np.tile(atom_species, frame) + xs = np.stack((z, np.zeros_like(z)), axis=-1) + return np.concatenate((xs, reshaped_coords), axis=-1, dtype=np.float32) + + +def read_npz_entry(filename, root): + data = np.load(create_path(filename, root)) + nuclear_charges, coords, energies, forces = ( + data["nuclear_charges"], + data["coords"], + data["energies"], + data["forces"], + ) + frames = coords.shape[0] + res = dict( + name=np.array([trajectories[filename]] * frames), + subset=np.array([filename] * frames), + energies=energies[:, None].astype(np.float32), + forces=forces.reshape(-1, 3, 1), + atomic_inputs=shape_atom_inputs(coords, nuclear_charges), + n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32), + ) + return res + + +def create_path(filename, root): + return p_join(root, "rmd17", "npz_data", filename + ".npz") + + +class RevMD17(BaseDataset): + """ + - Benzene: 627000 samples + - Uracil: 133000 samples + - Naptalene: 326000 samples + - Aspirin: 211000 samples + - Salicylic Acid: 320000 samples + - Malonaldehyde: 993000 samples + - Ethanol: 555000 samples + - Toluene: 100000 samples + + Usage + ```python + from openqdc.datasets import RevMD17 + dataset = RevMD17() + ``` + + References: + - https://arxiv.org/abs/2007.09593 + """ + + __name__ = "revmd17" + + __energy_methods__ = [ + "pbe/vdw-ts", # MD17 + ] + + energy_target_names = [ + "PBE-TS Energy", + ] + + __force_methods__ = [ + "pbe/vdw-ts", # MD17 + ] + + force_target_names = [ + "PBE-TS Gradient", + ] + + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + + def read_raw_entries(self): + entries_list = [] + for trajectory in trajectories: + entries_list.append(read_npz_entry(trajectory, self.root)) + return entries_list diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index 6205a3e..e9ca176 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -171,6 +171,10 @@ class DataConfigFactory: dataset_name="des_s66x8", links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"}, ) + revmd17 = dict( + dataset_name="revmd17", + links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"}, + ) available_datasets = [k for k in locals().keys() if not k.startswith("__")] From c659c18e743acab5dd64855880d192a88f14f3d6 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 15 Mar 2024 18:10:14 +0000 Subject: [PATCH 2/5] Atom energies FIX --- openqdc/utils/atomization_energies.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/openqdc/utils/atomization_energies.py b/openqdc/utils/atomization_energies.py index 95464f6..d9ff2b3 100644 --- a/openqdc/utils/atomization_energies.py +++ b/openqdc/utils/atomization_energies.py @@ -2313,7 +2313,19 @@ def get_matrix(level_of_theory: str) -> np.ndarray: } -ISOLATED_ATOM_ENERGIES = { +def merge(a: dict, b: dict, path=[]): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge(a[key], b[key], path + [str(key)]) + elif a[key] != b[key]: + raise Exception("Conflict at " + ".".join(path + [str(key)])) + else: + a[key] = b[key] + return a + + +ISOLATED_ATOM_ENERGIES_ORIGINAL = { # DFT "wb97x": { "6-31g*": COMP6_1, @@ -2386,7 +2398,6 @@ def get_matrix(level_of_theory: str) -> np.ndarray: "pm6": PM6, # FF "ttm2.1-f": TTM2, - **ISOLATED_ATOM_ENERGIES_ADDON, } - -# TODO: Talk with ivan about cbs extrapolation from from av[TQ]z. For now this should be ok +# update dictionary without overriding the dictionary inside +ISOLATED_ATOM_ENERGIES = merge(ISOLATED_ATOM_ENERGIES_ORIGINAL, ISOLATED_ATOM_ENERGIES_ADDON) From 677ce47282a604413ae2d34389b5901175433e09 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 15 Mar 2024 18:17:20 +0000 Subject: [PATCH 3/5] correct type arrays --- openqdc/datasets/potential/revmd17.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py index d45b2f4..870ef7f 100644 --- a/openqdc/datasets/potential/revmd17.py +++ b/openqdc/datasets/potential/revmd17.py @@ -39,7 +39,7 @@ def read_npz_entry(filename, root): name=np.array([trajectories[filename]] * frames), subset=np.array([filename] * frames), energies=energies[:, None].astype(np.float32), - forces=forces.reshape(-1, 3, 1), + forces=forces.reshape(-1, 3, 1).astype(np.float32), atomic_inputs=shape_atom_inputs(coords, nuclear_charges), n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32), ) From 8ec7dcebac721a003a761b7f104f8037809a83dc Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 18 Mar 2024 14:13:42 +0000 Subject: [PATCH 4/5] Fetch RevMD17 --- openqdc/datasets/potential/revmd17.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py index 870ef7f..85702ed 100644 --- a/openqdc/datasets/potential/revmd17.py +++ b/openqdc/datasets/potential/revmd17.py @@ -3,6 +3,7 @@ import numpy as np from openqdc.datasets.base import BaseDataset +from openqdc.raws.fetch import decompress_tar_gz trajectories = { "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O", @@ -74,7 +75,7 @@ class RevMD17(BaseDataset): __name__ = "revmd17" __energy_methods__ = [ - "pbe/vdw-ts", # MD17 + "pbe/vdw-ts", ] energy_target_names = [ @@ -82,7 +83,7 @@ class RevMD17(BaseDataset): ] __force_methods__ = [ - "pbe/vdw-ts", # MD17 + "pbe/vdw-ts", ] force_target_names = [ @@ -95,6 +96,7 @@ class RevMD17(BaseDataset): def read_raw_entries(self): entries_list = [] + decompress_tar_gz(p_join(self.root, "rmd17.tar.bz2")) for trajectory in trajectories: entries_list.append(read_npz_entry(trajectory, self.root)) return entries_list From 33c8d25e5a33f9d1a2ed4bbc72e91f4e9fbcfe05 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 18 Mar 2024 14:25:27 +0000 Subject: [PATCH 5/5] Compatibility issue fsspec --- env.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env.yml b/env.yml index d7a19db..16ccc3c 100644 --- a/env.yml +++ b/env.yml @@ -7,7 +7,7 @@ dependencies: - pip - tqdm - loguru - - fsspec + - fsspec <=2023.12.2 # 2024.3.0 not compatible rn - gcsfs - typer - prettytable