Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to Dummy Dataset #34

Merged
merged 6 commits on Mar 8, 2024 (source and target branch names are missing from this capture)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 41 additions & 41 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,28 @@
# Maps each public name to the dotted path of the module that defines it
# (presumably consumed by a lazy-import hook that is not visible in this hunk).
# NOTE(review): this is a diff hunk whose +/- markers were stripped, so every
# dataset name appears twice: once on the removed side, once on the added side.
_lazy_imports_obj = {
"__version__": "openqdc._version",
"BaseDataset": "openqdc.datasets.base",
# Removed side: dataset modules located directly under "openqdc.datasets".
"ANI1": "openqdc.datasets.ani",
"ANI1CCX": "openqdc.datasets.ani",
"ANI1X": "openqdc.datasets.ani",
"Spice": "openqdc.datasets.spice",
"GEOM": "openqdc.datasets.geom",
"QMugs": "openqdc.datasets.qmugs",
"ISO17": "openqdc.datasets.iso_17",
"COMP6": "openqdc.datasets.comp6",
"GDML": "openqdc.datasets.gdml",
"Molecule3D": "openqdc.datasets.molecule3d",
"OrbnetDenali": "openqdc.datasets.orbnet_denali",
"SN2RXN": "openqdc.datasets.sn2_rxn",
"QM7X": "openqdc.datasets.qm7x",
"DES": "openqdc.datasets.des",
"NablaDFT": "openqdc.datasets.nabladft",
"SolvatedPeptides": "openqdc.datasets.solvated_peptides",
"WaterClusters": "openqdc.datasets.waterclusters3_30",
"TMQM": "openqdc.datasets.tmqm",
"Dummy": "openqdc.datasets.dummy",
"PCQM_B3LYP": "openqdc.datasets.pcqm",
"PCQM_PM6": "openqdc.datasets.pcqm",
"Transition1X": "openqdc.datasets.transition1x",
# Added side: the same names, now under "openqdc.datasets.potential";
# DES is the only one that moves to "openqdc.datasets.interaction".
"ANI1": "openqdc.datasets.potential.ani",
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"GEOM": "openqdc.datasets.potential.geom",
"QMugs": "openqdc.datasets.potential.qmugs",
"ISO17": "openqdc.datasets.potential.iso_17",
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
"DES": "openqdc.datasets.interaction.des",
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
"TMQM": "openqdc.datasets.potential.tmqm",
"Dummy": "openqdc.datasets.potential.dummy",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
"Transition1X": "openqdc.datasets.potential.transition1x",
"AVAILABLE_DATASETS": "openqdc.datasets",
}

Expand Down Expand Up @@ -66,23 +66,23 @@ def __dir__():
# checkers what they are.
from ._version import __version__ # noqa
from .datasets import AVAILABLE_DATASETS # noqa
from .datasets.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.base import BaseDataset # noqa
from .datasets.comp6 import COMP6 # noqa
from .datasets.des import DES # noqa
from .datasets.dummy import Dummy # noqa
from .datasets.gdml import GDML # noqa
from .datasets.geom import GEOM # noqa
from .datasets.iso_17 import ISO17 # noqa
from .datasets.molecule3d import Molecule3D # noqa
from .datasets.nabladft import NablaDFT # noqa
from .datasets.orbnet_denali import OrbnetDenali # noqa
from .datasets.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.qm7x import QM7X # noqa
from .datasets.qmugs import QMugs # noqa
from .datasets.sn2_rxn import SN2RXN # noqa
from .datasets.solvated_peptides import SolvatedPeptides # noqa
from .datasets.spice import Spice # noqa
from .datasets.tmqm import TMQM # noqa
from .datasets.transition1x import Transition1X # noqa
from .datasets.waterclusters3_30 import WaterClusters # noqa
from .datasets.interaction.des import DES # noqa
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.potential.comp6 import COMP6 # noqa
from .datasets.potential.dummy import Dummy # noqa
from .datasets.potential.gdml import GDML # noqa
from .datasets.potential.geom import GEOM # noqa
from .datasets.potential.iso_17 import ISO17 # noqa
from .datasets.potential.molecule3d import Molecule3D # noqa
from .datasets.potential.nabladft import NablaDFT # noqa
from .datasets.potential.orbnet_denali import OrbnetDenali # noqa
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.potential.qm7x import QM7X # noqa
from .datasets.potential.qmugs import QMugs # noqa
from .datasets.potential.sn2_rxn import SN2RXN # noqa
from .datasets.potential.solvated_peptides import SolvatedPeptides # noqa
from .datasets.potential.spice import Spice # noqa
from .datasets.potential.tmqm import TMQM # noqa
from .datasets.potential.transition1x import Transition1X # noqa
from .datasets.potential.waterclusters3_30 import WaterClusters # noqa
8 changes: 3 additions & 5 deletions openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,16 +443,14 @@ def preprocess(self, overwrite=False):
res = self.collate_list(entries)
self.save_preprocess(res)

# NOTE(review): diff hunk with the +/- markers stripped. The first signature
# below is the removed one; the second is its replacement.
def save_xyz(self, idx: int, path: Optional[str] = None, name=None):
# Replacement signature: `name` is dropped and `ext` takes its position, so
# callers that passed `name=` by keyword will fail and a third positional
# argument is now read as `ext` — confirm no such callers remain.
def save_xyz(self, idx: int, path: Optional[str] = None, ext=True):
"""
Save the entry at index idx as an extxyz file.
"""
# Fall back to the current working directory when no output directory is given.
if path is None:
path = os.getcwd()
# Removed lines: `ext` was hard-coded to True, and the guard looks inverted —
# a caller-supplied `name` was overwritten with at.info["name"], while the
# default None was kept and produced the file name "None.xyz".
at = self.get_ase_atoms(idx, ext=True)
if name is not None:
name = at.info["name"]
write_extxyz(p_join(path, f"{name}.xyz"), at)
# Added lines: forward `ext` to get_ase_atoms and name the file after the index.
at = self.get_ase_atoms(idx, ext=ext)
write_extxyz(p_join(path, f"mol_{idx}.xyz"), at)

def get_ase_atoms(self, idx: int, ext=True):
"""
Expand Down
71 changes: 43 additions & 28 deletions openqdc/datasets/potential/dummy.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import numpy as np # noqa
from numpy import array
from sklearn.utils import Bunch
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
from openqdc.utils.constants import NOT_DEFINED


class Dummy(BaseDataset):
"""
Dummy dataset
Dummy dataset for testing.
"""

__name__ = "dummy"
Expand All @@ -30,21 +27,26 @@ def _stats(self):
return {
"formation": {
"energy": {
"mean": array([[-12.94348027, -9.83037297]]),
"std": array([[4.39971409, 3.3574188]]),
"mean": np.array([[-12.94348027, -9.83037297]]),
"std": np.array([[4.39971409, 3.3574188]]),
},
"forces": NOT_DEFINED,
},
"total": {
"energy": {
"mean": array([[-89.44242, -1740.5336]]),
"std": array([[29.599571, 791.48663]]),
"mean": np.array([[-89.44242, -1740.5336]]),
"std": np.array([[29.599571, 791.48663]]),
},
"forces": NOT_DEFINED,
},
}

def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None:
def __init__(
self,
energy_unit=None,
distance_unit=None,
cache_dir=None,
) -> None:
try:
super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)

Expand All @@ -54,8 +56,37 @@ def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None
self.setup_dummy()

# Fills the dataset with randomly generated content.
# NOTE(review): diff hunk with the +/- markers stripped; the first two body
# lines are the removed implementation, everything after them is the added one.
def setup_dummy(self):
# Removed lines: only the per-entry atom counts were generated up front.
self._n_atoms = np.array([np.random.randint(1, 100) for _ in range(self.__len__())])
self.__average_nb_atoms__ = self._n_atoms.mean()
# Added lines: every array is generated once here and stored in self.data.
# One atom count per entry, each in 1..99 (randint's upper bound is exclusive).
n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))])
# Row k holds the [start, end) offsets of entry k in the concatenated per-atom
# arrays: the cumulative offsets [0, c1, c2, ...] are each duplicated, the two
# outermost values are trimmed, and the remainder is paired up row by row.
position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2)
# One 5-column row per atom, entries stacked in order along axis 0. The integer
# columns are upcast to float when concatenated with the randn coordinates.
atomic_inputs = np.concatenate(
[
np.concatenate(
[
# z, c, x, y, z
# First column: integers in 1..99; second column: integers in {-1, 0, 1};
# last three columns: standard-normal samples.
np.random.randint(1, 100, size=(size, 1)),
np.random.randint(-1, 2, size=(size, 1)),
np.random.randn(size, 3),
],
axis=1,
)
for size in n_atoms
],
axis=0,
) # (sum(n_atoms), 5)
# Per-entry labels: a unique name and a constant subset tag.
name = [f"dummy_{i}" for i in range(len(self))]
subset = ["dummy" for i in range(len(self))]
# Uniform [0, 1) samples, one column per entry in self.__energy_methods__.
energies = np.random.rand(len(self), len(self.__energy_methods__))
# Per-atom normal samples scaled by 100, concatenated along axis 0 to give
# (sum(n_atoms), 3, len(self.__force_methods__)).
forces = np.concatenate([np.random.randn(size, 3, len(self.__force_methods__)) * 100 for size in n_atoms])
self.data = dict(
n_atoms=n_atoms,
position_idx_range=position_idx_range,
name=name,
atomic_inputs=atomic_inputs,
subset=subset,
energies=energies,
forces=forces,
)
# Mean atom count over the generated entries.
self.__average_nb_atoms__ = self.data["n_atoms"].mean()

# Always reports True (presumably so that no preprocessing step is triggered
# for the synthetic data — confirm against the base class).
def is_preprocessed(self):
return True
Expand All @@ -65,19 +96,3 @@ def read_raw_entries(self):

# Fixed number of synthetic entries exposed by the dataset.
def __len__(self):
return 9999

# NOTE(review): this method is the removed side of the hunk (the hunk header
# shrinks the region from 19 lines to 3).
# Every call drew new random values, so indexing the same idx twice returned
# different positions, atomic numbers, charges, energies and forces; only the
# atom count taken from self._n_atoms[idx] was stable between calls.
def __getitem__(self, idx: int):
# Offset added to the charge when indexing the isolated-atom energy table
# (presumably so that negative charges map to valid indices — confirm).
shift = IsolatedAtomEnergyFactory.max_charge
size = self._n_atoms[idx]
# Integers in 1..99 and in {-1, 0, 1}; randint's upper bound is exclusive.
z = np.random.randint(1, 100, size)
c = np.random.randint(-1, 2, size)
return Bunch(
positions=np.random.rand(size, 3) * 10,
atomic_numbers=z,
charges=c,
e0=self.__isolated_atom_energies__[..., z, c + shift].T,
energies=np.random.randn(len(self.__energy_methods__)),
name="dummy_{}".format(idx),
subset="dummy",
forces=(np.random.randn(size, 3, len(self.__force_methods__)) * 100),
)
4 changes: 2 additions & 2 deletions openqdc/datasets/potential/transition1x.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ class Transition1X(BaseDataset):
"wB97x_6-31G(d).forces",
]

__energy_unit__ = "hartree"
__energy_unit__ = "ev"
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__forces_unit__ = "ev/ang"

def read_raw_entries(self):
raw_path = p_join(self.root, "Transition1x.h5")
Expand Down
Loading