From 1c1077d513bbfb1426cf8382a12e2edcc4cf208e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 24 Jul 2024 09:55:49 -0400 Subject: [PATCH] Improved docs --- docs/API/e0_dispatcher.md | 1 + docs/API/properties.md | 3 ++ docs/API/statistics.md | 1 + docs/index.md | 8 +-- docs/usage.md | 8 +++ mkdocs.yml | 6 ++- openqdc/datasets/base.py | 8 ++- openqdc/datasets/energies.py | 92 ++++++++++++++++++--------------- openqdc/datasets/properties.py | 39 ++++++++++++-- openqdc/datasets/statistics.py | 93 ++++++++++++++++++++++------------ 10 files changed, 175 insertions(+), 84 deletions(-) create mode 100644 docs/API/e0_dispatcher.md create mode 100644 docs/API/properties.md create mode 100644 docs/API/statistics.md diff --git a/docs/API/e0_dispatcher.md b/docs/API/e0_dispatcher.md new file mode 100644 index 00000000..dfc29fdf --- /dev/null +++ b/docs/API/e0_dispatcher.md @@ -0,0 +1 @@ +::: openqdc.datasets.energies \ No newline at end of file diff --git a/docs/API/properties.md b/docs/API/properties.md new file mode 100644 index 00000000..846406a7 --- /dev/null +++ b/docs/API/properties.md @@ -0,0 +1,3 @@ +# Defined properties for datasets + +:::openqdc.datasets.properties \ No newline at end of file diff --git a/docs/API/statistics.md b/docs/API/statistics.md new file mode 100644 index 00000000..3f7d32bd --- /dev/null +++ b/docs/API/statistics.md @@ -0,0 +1 @@ +::: openqdc.datasets.statistics \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index db497b10..65b1ea94 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,17 +9,17 @@ OpenQDC is a python library to work with quantum datasets. It's a package aimed - 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc). - 📈 Data: have access to 1.5+ billion datapoints -Visit our website at TOFILL . +Visit our website at https://openqdc.io . 
## Installation Use mamba: ```bash -mamba install -c conda-forge openqdc +conda install -c conda-forge openqdc ``` -_**Tips:** You can replace `mamba` by `conda`._ +_**Tips:** You can replace `conda` by `mamba`._ _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._ @@ -58,7 +58,7 @@ dataset.calculate_descriptors( ## How to cite -Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link). +Please cite OpenQDC if you use it in your research: [![Pending Publication](Pending Publication)](Pending Publication). ## Compatibilities diff --git a/docs/usage.md b/docs/usage.md index af62f453..02874c41 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -37,6 +37,14 @@ for data in dataset.as_iter(atoms=True): break ``` +or if you want to just iterate over the data: + +```python +for data in dataset: + print(data) # dict of arrays + break +``` + ## Lazy loading OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`. 
diff --git a/mkdocs.yml b/mkdocs.yml index 4c649c00..cdeb8cbb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,7 +25,11 @@ nav: - API: - QM methods: API/methods.md - Normalization regressor: API/regressor.md - - Main class: API/basedataset.md + - Main classes: + - BaseDataset: API/basedataset.md + - Available Properties: API/properties.md + - e0 Dispatcher: API/e0_dispatcher.md + - Statistics: API/statistics.md - Format loading: API/formats.md - Datasets: - Potential Energy: diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 8a480125..1e0c7165 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -237,7 +237,13 @@ def force_methods(self): return list(compress(self.energy_methods, self.force_mask)) @property - def e0s_dispatcher(self): + def e0s_dispatcher(self) -> AtomEnergies: + """ + Property to get the object that dispatched the isolated atom energies of the QM methods. + + Returns: + Object wrapping the isolated atom energies of the QM methods. + """ if not hasattr(self, "_e0s_dispatcher"): # Automatically fetch/compute formation or regression energies self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs) diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py index 676f65c6..4ca61c82 100644 --- a/openqdc/datasets/energies.py +++ b/openqdc/datasets/energies.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from os.path import join as p_join -from typing import Dict, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from loguru import logger @@ -14,22 +14,24 @@ POSSIBLE_ENERGIES = ["formation", "regression", "null"] -def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface": +def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface": """ Factory function that select the correct energy class for the fetching/calculation of isolated atom energies. 
- Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. + Parameters: + data : openqdc.datasets.Dataset + Dataset object that contains the information + about the isolated atom energies. Info will be passed + by references + kwargs : dict + Additional arguments that will be passed to the + selected energy class. Mostly used for regression + to pass the regressor_kwargs. + + Returns: + Initialized IsolatedEnergyInterface-like object """ if data.energy_type == "formation": return PhysicalEnergy(data, **kwargs) @@ -100,26 +102,16 @@ class AtomEnergies: """ def __init__(self, data, **kwargs) -> None: - """ - Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. - """ - self.atom_energies = data.energy_type self.factory = dispatch_factory(data, **kwargs) @property def e0s_matrix(self) -> np.ndarray: """ - Returns the isolated atom energies matrixes + Return the isolated atom energies matrix + + Returns: + Matrix Array with the isolated atom energies """ return self.factory.e0_matrix @@ -127,6 +119,9 @@ def e0s_matrix(self) -> np.ndarray: def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]: """ Return the isolated atom energies dictionary + + Returns: + Dictionary with the isolated atom energies """ return self.factory.e0_dict @@ -142,10 +137,18 @@ def __getitem__(self, item: AtomSpecies) -> AtomEnergy: Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). 
If no charge is passed, it will be automatically set to 0. + Examples: - AtomEnergies[6], AtomEnergies[6,1], - AtomEnergies["C",1], AtomEnergies[(6,1)] + AtomEnergies[6], AtomEnergies[6,1], \n + AtomEnergies["C",1], AtomEnergies[(6,1)], \n AtomEnergies[("C,1)] + + Parameters: + item: + AtomSpecies object or tuple with the atom symbol and charge + + Returns: + AtomEnergy object with the isolated atom energy """ try: atom, charge = item[0], item[1] @@ -168,16 +171,15 @@ class IsolatedEnergyInterface(ABC): def __init__(self, data, **kwargs): """ - Parameters - ---------- - data : openqdc.datasets.Dataset - Dataset object that contains the information - about the isolated atom energies. Info will be passed - by references - kwargs : dict - Additional arguments that will be passed to the - selected energy class. Mostly used for regression - to pass the regressor_kwargs. + Parameters: + data : openqdc.datasets.Dataset + Dataset object that contains the information + about the isolated atom energies. Info will be passed + by references + kwargs : dict + Additional arguments that will be passed to the + selected energy class. Mostly used for regression + to pass the regressor_kwargs. """ self._e0_matrixs = [] self._e0_dict = None @@ -204,6 +206,9 @@ def __len__(self): def e0_matrix(self) -> np.ndarray: """ Return the isolated atom energies matrixes + + Returns: + Matrix Array with the isolated atom energies """ return np.array(self._e0_matrixs) @@ -211,6 +216,9 @@ def e0_matrix(self) -> np.ndarray: def e0_dict(self) -> Dict: """ Return the isolated atom energies dict + + Returns: + Dictionary with the isolated atom energies """ return self._e0s_dict @@ -276,11 +284,15 @@ def _post_init(self): self._set_lin_atom_species_dict(E0s, cov) self._set_linear_e0s() - def _compute_regression_e0s(self): + def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Try to compute the regressed isolated atom energies. raise an error if the regression fails. 
return the regressed isolated atom energies and the uncertainty values. + + Returns: + Tuple with the regressed isolated atom energies and the uncertainty values of the regression + if available. """ try: E0s, cov = self.regressor.solve() @@ -305,7 +317,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None: def _set_linear_e0s(self) -> None: """ Transform the e0s dictionary into the correct e0s - matrix format + matrix format. """ new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))] for z, e0 in self._e0s_dict.items(): diff --git a/openqdc/datasets/properties.py b/openqdc/datasets/properties.py index 81bc568b..9874330a 100644 --- a/openqdc/datasets/properties.py +++ b/openqdc/datasets/properties.py @@ -1,3 +1,5 @@ +from typing import Tuple + import numpy as np import pandas as pd @@ -29,35 +31,62 @@ def _compute_average_nb_atoms(self): self.__average_nb_atoms__ = np.mean(self.data["n_atoms"]) @property - def average_n_atoms(self): + def average_n_atoms(self) -> int: """ Average number of atoms in a molecule in the dataset. + + Returns: + Average number of atoms in a molecule in the dataset. 
""" if self.__average_nb_atoms__ is None: raise StatisticsNotAvailableError(self.__name__) return self.__average_nb_atoms__ @property - def numbers(self): + def numbers(self) -> np.ndarray: + """ + Unique atomic numbers in the dataset + + Returns: + Array of the unique atomic numbers in the dataset + """ if hasattr(self, "_numbers"): return self._numbers self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) return self._numbers @property - def charges(self): + def charges(self) -> np.ndarray: + """ + Unique charges in the dataset + + Returns: + Array of the unique charges in the dataset + """ if hasattr(self, "_charges"): return self._charges self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32) return self._charges @property - def min_max_charges(self): + def min_max_charges(self) -> Tuple[int, int]: + """ + Minimum and maximum charges in the dataset + + Returns: + (min_charge, max_charge) + """ if hasattr(self, "_min_max_charges"): return self._min_max_charges self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1]) return self._min_max_charges @property - def chemical_species(self): + def chemical_species(self) -> np.ndarray: + """ + Chemical symbols in the dataset + + Returns: + Array of the chemical symbols in the dataset + """ return np.array(ATOM_SYMBOLS)[self.numbers] diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py index 6b1adeb5..8b9ae135 100644 --- a/openqdc/datasets/statistics.py +++ b/openqdc/datasets/statistics.py @@ -2,7 +2,7 @@ from copy import deepcopy from dataclasses import asdict, dataclass from os.path import join as p_join -from typing import Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional import numpy as np from loguru import logger @@ -16,15 +16,22 @@ class StatisticsResults: to provide general methods. 
""" - def to_dict(self): + def to_dict(self) -> Dict: """ Convert the class to a dictionary + + Returns: + Dictionary representation of the class """ return asdict(self) def transform(self, func: Callable): """ Apply a function to all the attributes of the class + + Parameters: + func: + Function to apply to the attributes """ for k, v in self.to_dict().items(): if v is not None: @@ -56,18 +63,19 @@ class ForceStatistics(StatisticsResults): class StatisticManager: """ - Manager class to share the state between all + Manager class that automatically handle the shared state between the statistic calculators """ - def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"): + def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"): """ - dataset : openqdc.datasets.base.BaseDataset - The dataset object to compute the statistics - recompute : bool, default = False - Flag to recompute the statistics - *statistic_calculators : AbstractStatsCalculator - statistic calculators to run + Parameters: + dataset : openqdc.datasets.base.BaseDataset + The dataset object to compute the statistics + recompute: + Flag to recompute the statistics + *statistic_calculators: + List of statistic calculators to run """ self._state = {} self._results = {} @@ -80,6 +88,9 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab def state(self) -> Dict: """ Return the dictionary state of the manager + + Returns: + State of the StatisticManager """ return self._state @@ -95,25 +106,40 @@ def reset_results(self): """ self._results = {} - def get_state(self, key: Optional[str] = None): + def get_state(self, key: Optional[str] = None) -> Optional[Any]: """ - key : str, default = None Return the value of the key in the state dictionary + + Parameters: + key: str, default = None + Returns: + the value of the key in the state dictionary or the whole state dictionary if key is 
None """ if key is None: return self._state return self._state.get(key, None) - def has_state(self, key: str): + def has_state(self, key: str) -> bool: """ Check is state has key + + Parameters: + key: + Key to check in the state dictionary + + Returns: + True if the key is in the state dictionary """ return key in self._state def get_results(self, as_dict: bool = False): """ Aggregate results from all the calculators + + Parameters: + as_dict: + Flag to return the results as a dictionary """ results = deepcopy(self._results) if as_dict: @@ -155,26 +181,27 @@ def __init__( forces: Optional[np.ndarray] = None, ): """ - name : str - Name of the dataset for saving and loading. - energy_type : str, default = None - Type of the energy for the computation of the statistics. Used for loading and saving. - force_recompute : bool, default = False - Flag to force the recomputation of the statistics - energies : np.ndarray, default = None - Energies of the dataset - n_atoms : np.ndarray, default = None - Number of atoms in the dataset - atom_species : np.ndarray, default = None - Atomic species of the dataset - position_idx_range : np.ndarray, default = None - Position index range of the dataset - e0_matrix : np.ndarray, default = None - Isolated atom energies matrix of the dataset - atom_charges : np.ndarray, default = None - Atomic charges of the dataset - forces : np.ndarray, default = None - Forces of the dataset + Parameters: + name : + Name of the dataset for saving and loading. + energy_type : + Type of the energy for the computation of the statistics. Used for loading and saving. 
+ force_recompute : + Flag to force the recomputation of the statistics + energies : + Energies of the dataset + n_atoms : + Number of atoms in the dataset + atom_species : + Atomic species of the dataset + position_idx_range : + Position index range of the dataset + e0_matrix : + Isolated atom energies matrix of the dataset + atom_charges : + Atomic charges of the dataset + forces : + Forces of the dataset """ self.name = name self.energy_type = energy_type