Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revmd17 #48

Merged
merged 5 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dependencies:
- pip
- tqdm
- loguru
- fsspec
- fsspec <=2023.12.2 # 2024.3.0 is not compatible at the moment
- gcsfs
- typer
- prettytable
Expand Down
4 changes: 2 additions & 2 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
"DES": "openqdc.datasets.interaction.des",
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
"TMQM": "openqdc.datasets.potential.tmqm",
"Dummy": "openqdc.datasets.potential.dummy",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
"RevMD17": "openqdc.datasets.potential.revmd17",
"Transition1X": "openqdc.datasets.potential.transition1x",
"MultixcQM9": "openqdc.datasets.potential.multixcqm9",
"AVAILABLE_DATASETS": "openqdc.datasets",
Expand Down Expand Up @@ -70,7 +70,6 @@ def __dir__():
from ._version import __version__ # noqa
from .datasets import AVAILABLE_DATASETS # noqa
from .datasets.base import BaseDataset # noqa
from .datasets.interaction.des import DES # noqa
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.potential.comp6 import COMP6 # noqa
from .datasets.potential.dummy import Dummy # noqa
Expand All @@ -84,6 +83,7 @@ def __dir__():
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.potential.qm7x import QM7X # noqa
from .datasets.potential.qmugs import QMugs # noqa
from .datasets.potential.revmd17 import RevMD17 # noqa
from .datasets.potential.sn2_rxn import SN2RXN # noqa
from .datasets.potential.solvated_peptides import SolvatedPeptides # noqa
from .datasets.potential.spice import Spice # noqa
Expand Down
2 changes: 2 additions & 0 deletions openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .qm7x import QM7X # noqa
from .qmugs import QMugs # noqa
from .revmd17 import RevMD17 # noqa
from .sn2_rxn import SN2RXN # noqa
from .solvated_peptides import SolvatedPeptides # noqa
from .spice import Spice # noqa
Expand Down Expand Up @@ -40,4 +41,5 @@
"transition1x": Transition1X,
"watercluster": WaterClusters,
"multixcqm9": MultixcQM9,
"revmd17": RevMD17,
}
102 changes: 102 additions & 0 deletions openqdc/datasets/potential/revmd17.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from os.path import join as p_join

import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.raws.fetch import decompress_tar_gz

# Maps each rMD17 trajectory name to a SMILES string for its molecule.
# The key is the stem of the trajectory's ``.npz`` file (see ``create_path``)
# and is stored as the ``subset`` of every frame; the value is stored as the
# frame's ``name`` (see ``read_npz_entry``).
trajectories = {
    "rmd17_aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
    "rmd17_benzene": "c1ccccc1",
    "rmd17_malonaldehyde": "C(C=O)C=O",
    "rmd17_paracetamol": "CC(=O)Nc1ccc(cc1)O",
    "rmd17_toluene": "Cc1ccccc1",
    "rmd17_azobenzene": "C1=CC=C(C=C1)N=NC2=CC=CC=C2",
    "rmd17_ethanol": "CCO",
    "rmd17_naphthalene": "C1=CC=C2C=CC=CC2=C1",
    "rmd17_salicylic": "C1=CC=C(C(=C1)C(=O)O)O",
    "rmd17_uracil": "C1=CNC(=O)NC1=O",
}


def shape_atom_inputs(coords, atom_species):
    """Flatten per-frame coordinates into one row per atom.

    Every output row has five columns: the atom's species value, a
    zero-filled column, and the atom's three coordinates.

    Args:
        coords: array of shape ``(frames, atoms, 3)``.
        atom_species: 1-D sequence with one value per atom; the same
            sequence is repeated for every frame.

    Returns:
        ``float32`` array of shape ``(frames * atoms, 5)``.

    Raises:
        ValueError: if ``coords`` is not ``(frames, atoms, 3)`` or the number
            of species does not match the number of atoms per frame.
    """
    if coords.ndim != 3 or coords.shape[-1] != 3:
        raise ValueError(f"coords must have shape (frames, atoms, 3), got {coords.shape}")
    n_frames, n_atoms, _ = coords.shape
    species = np.asarray(atom_species)
    if species.shape != (n_atoms,):
        raise ValueError(
            f"atom_species must have shape ({n_atoms},) to match coords, got {species.shape}"
        )
    # Species repeat identically for each frame of the trajectory.
    z = np.tile(species, n_frames)
    # Second column is filled with zeros, matching the dtype of the species.
    xs = np.stack((z, np.zeros_like(z)), axis=-1)
    return np.concatenate((xs, coords.reshape(-1, 3)), axis=-1, dtype=np.float32)


def read_npz_entry(filename, root):
    """Load one trajectory ``.npz`` file and convert it into an entry dict.

    Args:
        filename: trajectory name; must be a key of ``trajectories`` and the
            stem of the ``.npz`` file resolved by ``create_path``.
        root: directory passed to ``create_path`` to locate the file.

    Returns:
        dict with one value per key:
            - ``name``: the ``trajectories`` value for ``filename``, repeated
              once per frame.
            - ``subset``: ``filename`` repeated once per frame.
            - ``energies``: ``float32`` array with a trailing axis of size 1.
            - ``forces``: ``float32`` array reshaped to ``(-1, 3, 1)``.
            - ``atomic_inputs``: output of ``shape_atom_inputs``.
            - ``n_atoms``: ``int32`` array holding the atom count per frame.

    Raises:
        KeyError: if ``filename`` is not a key of ``trajectories`` or the
            archive lacks one of the expected arrays.
    """
    # ``np.load`` on an .npz returns an NpzFile that holds the file open until
    # it is closed; read every array inside the context manager so the handle
    # is released as soon as the data is in memory.
    with np.load(create_path(filename, root)) as data:
        nuclear_charges = data["nuclear_charges"]
        coords = data["coords"]
        energies = data["energies"]
        forces = data["forces"]

    frames = coords.shape[0]
    return dict(
        name=np.array([trajectories[filename]] * frames),
        subset=np.array([filename] * frames),
        energies=energies[:, None].astype(np.float32),
        forces=forces.reshape(-1, 3, 1).astype(np.float32),
        atomic_inputs=shape_atom_inputs(coords, nuclear_charges),
        n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32),
    )


def create_path(filename, root):
    """Return the path of a trajectory's ``.npz`` file below ``root``.

    The file is expected at ``<root>/rmd17/npz_data/<filename>.npz``.
    """
    npz_name = filename + ".npz"
    return p_join(root, "rmd17", "npz_data", npz_name)


class RevMD17(BaseDataset):
    """
    Revised MD17 (rMD17) dataset with energies and forces labelled with the
    ``pbe/vdw-ts`` method.

    Sample counts recorded for the trajectories:

    - Benzene: 627000 samples
    - Uracil: 133000 samples
    - Naphthalene: 326000 samples
    - Aspirin: 211000 samples
    - Salicylic Acid: 320000 samples
    - Malonaldehyde: 993000 samples
    - Ethanol: 555000 samples
    - Toluene: 100000 samples

    NOTE(review): the module-level ``trajectories`` mapping also loads
    paracetamol and azobenzene, for which no count is listed; the counts above
    are carried over unchanged and should be confirmed against the release.

    Usage
    ```python
    from openqdc.datasets import RevMD17
    dataset = RevMD17()
    ```

    References:
    - https://arxiv.org/abs/2007.09593
    """

    # Dataset identifier.
    __name__ = "revmd17"

    # One energy label and one force label per frame, from the same method.
    __energy_methods__ = [
        "pbe/vdw-ts",
    ]

    energy_target_names = [
        "PBE-TS Energy",
    ]

    __force_methods__ = [
        "pbe/vdw-ts",
    ]

    force_target_names = [
        "PBE-TS Gradient",
    ]

    # Units declared for the raw values.
    __energy_unit__ = "kcal/mol"
    __distance_unit__ = "ang"
    __forces_unit__ = "kcal/mol/ang"

    def read_raw_entries(self):
        """Unpack the raw archive and return one entry dict per trajectory.

        The archive ``rmd17.tar.bz2`` under ``self.root`` is decompressed
        first; each key of ``trajectories`` is then read with
        ``read_npz_entry``, in the mapping's order.
        """
        archive = p_join(self.root, "rmd17.tar.bz2")
        decompress_tar_gz(archive)
        return [read_npz_entry(name, self.root) for name in trajectories]
4 changes: 4 additions & 0 deletions openqdc/raws/config_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ class DataConfigFactory:
dataset_name="des_s66x8",
links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
)
# Download configuration for the RevMD17 dataset.
# NOTE(review): presumably each ``links`` key is the local file name and the
# value is the URL to fetch; confirm against the fetch code. The dataset
# reader looks for ``rmd17.tar.bz2`` — verify this zip provides it.
revmd17 = dict(
    dataset_name="revmd17",
    links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"},
)

available_datasets = [k for k in locals().keys() if not k.startswith("__")]

Expand Down
19 changes: 15 additions & 4 deletions openqdc/utils/atomization_energies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2313,7 +2313,19 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
}


ISOLATED_ATOM_ENERGIES = {
def merge(a: dict, b: dict, path=None):
    """Recursively merge ``b`` into ``a`` in place and return ``a``.

    Keys found only in ``b`` are added to ``a``. When both dictionaries hold a
    dict under the same key, those are merged recursively. Equal values are
    left untouched; any other clash is reported as a conflict.

    Args:
        a: destination dictionary; it is mutated in place.
        b: dictionary whose entries are merged into ``a``. Values are
            inserted by reference, not copied.
        path: keys traversed so far, used only to build the conflict message.
            Callers normally leave it unset.

    Returns:
        The same object as ``a``.

    Raises:
        ValueError: if a key maps to different values in ``a`` and ``b`` and
            they are not both dictionaries. ``ValueError`` subclasses
            ``Exception``, so handlers catching ``Exception`` still work.
    """
    # A ``None`` sentinel avoids sharing one list object across calls.
    path = [] if path is None else path
    for key, value in b.items():
        if key not in a:
            a[key] = value
            continue
        current = a[key]
        if isinstance(current, dict) and isinstance(value, dict):
            merge(current, value, path + [str(key)])
        elif current != value:
            raise ValueError("Conflict at " + ".".join(path + [str(key)]))
    return a


ISOLATED_ATOM_ENERGIES_ORIGINAL = {
# DFT
"wb97x": {
"6-31g*": COMP6_1,
Expand Down Expand Up @@ -2386,7 +2398,6 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
"pm6": PM6,
# FF
"ttm2.1-f": TTM2,
**ISOLATED_ATOM_ENERGIES_ADDON,
}

# TODO: Talk with Ivan about CBS extrapolation from av[TQ]z. For now this should be ok
# Merge the add-on energies recursively so that nested dictionaries are extended rather than replaced
ISOLATED_ATOM_ENERGIES = merge(ISOLATED_ATOM_ENERGIES_ORIGINAL, ISOLATED_ATOM_ENERGIES_ADDON)
Loading