From 1c1077d513bbfb1426cf8382a12e2edcc4cf208e Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Wed, 24 Jul 2024 09:55:49 -0400
Subject: [PATCH] Improved docs

---
 docs/API/e0_dispatcher.md      |  1 +
 docs/API/properties.md         |  3 ++
 docs/API/statistics.md         |  1 +
 docs/index.md                  |  8 +--
 docs/usage.md                  |  8 +++
 mkdocs.yml                     |  6 ++-
 openqdc/datasets/base.py       |  8 ++-
 openqdc/datasets/energies.py   | 92 ++++++++++++++++++---------------
 openqdc/datasets/properties.py | 39 ++++++++++++--
 openqdc/datasets/statistics.py | 93 ++++++++++++++++++++++------------
 10 files changed, 175 insertions(+), 84 deletions(-)
 create mode 100644 docs/API/e0_dispatcher.md
 create mode 100644 docs/API/properties.md
 create mode 100644 docs/API/statistics.md
diff --git a/docs/API/e0_dispatcher.md b/docs/API/e0_dispatcher.md
new file mode 100644
index 00000000..dfc29fdf
--- /dev/null
+++ b/docs/API/e0_dispatcher.md
@@ -0,0 +1 @@
+::: openqdc.datasets.energies
\ No newline at end of file
diff --git a/docs/API/properties.md b/docs/API/properties.md
new file mode 100644
index 00000000..846406a7
--- /dev/null
+++ b/docs/API/properties.md
@@ -0,0 +1,3 @@
+# Defined properties for datasets
+
+::: openqdc.datasets.properties
\ No newline at end of file
diff --git a/docs/API/statistics.md b/docs/API/statistics.md
new file mode 100644
index 00000000..3f7d32bd
--- /dev/null
+++ b/docs/API/statistics.md
@@ -0,0 +1 @@
+::: openqdc.datasets.statistics
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index db497b10..65b1ea94 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -9,17 +9,17 @@ OpenQDC is a python library to work with quantum datasets. It's a package aimed
 - 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
 - 📈 Data: have access to 1.5+ billion datapoints
 
-Visit our website at TOFILL <IDK>.
+Visit our website at <https://openqdc.io>.
 
 ## Installation
 
 Use mamba:
 
 ```bash
-mamba install -c conda-forge openqdc
+conda install -c conda-forge openqdc
 ```
 
-_**Tips:** You can replace `mamba` by `conda`._
+_**Tips:** You can replace `conda` by `mamba`._
 
 _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._
 
@@ -58,7 +58,7 @@ dataset.calculate_descriptors(
 
 ## How to cite
 
-Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
+Please cite OpenQDC if you use it in your research (publication pending).
 
 ## Compatibilities
 
diff --git a/docs/usage.md b/docs/usage.md
index af62f453..02874c41 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -37,6 +37,14 @@ for data in dataset.as_iter(atoms=True):
     break
 ```
 
+Or, if you just want to iterate over the data:
+
+```python
+for data in dataset:
+    print(data) # dict of arrays
+    break
+```
+
 ## Lazy loading
 
 OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
diff --git a/mkdocs.yml b/mkdocs.yml
index 4c649c00..cdeb8cbb 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -25,7 +25,11 @@ nav:
   - API:
     - QM methods: API/methods.md
     - Normalization regressor: API/regressor.md
-    - Main class: API/basedataset.md
+    - Main classes:
+      - BaseDataset: API/basedataset.md
+      - Available Properties: API/properties.md
+      - e0 Dispatcher: API/e0_dispatcher.md
+      - Statistics: API/statistics.md
     - Format loading: API/formats.md
     - Datasets:
       - Potential Energy:
diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 8a480125..1e0c7165 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -237,7 +237,13 @@ def force_methods(self):
         return list(compress(self.energy_methods, self.force_mask))
 
     @property
-    def e0s_dispatcher(self):
+    def e0s_dispatcher(self) -> AtomEnergies:
+        """
+        Property to get the object that dispatches the isolated atom energies of the QM methods.
+
+        Returns:
+            Object wrapping the isolated atom energies of the QM methods.
+        """
         if not hasattr(self, "_e0s_dispatcher"):
             # Automatically fetch/compute formation or regression energies
             self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)
diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py
index 676f65c6..4ca61c82 100644
--- a/openqdc/datasets/energies.py
+++ b/openqdc/datasets/energies.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from os.path import join as p_join
-from typing import Dict, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 from loguru import logger
@@ -14,22 +14,24 @@
 POSSIBLE_ENERGIES = ["formation", "regression", "null"]
 
 
-def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface":
+def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface":
     """
     Factory function that select the correct
     energy class for the fetching/calculation
     of isolated atom energies.
 
-    Parameters
-    ----------
-    data : openqdc.datasets.Dataset
-        Dataset object that contains the information
-        about the isolated atom energies. Info will be passed
-        by references
-    kwargs : dict
-        Additional arguments that will be passed to the
-        selected energy class. Mostly used for regression
-        to pass the regressor_kwargs.
+    Parameters:
+        data : openqdc.datasets.Dataset
+            Dataset object that contains the information
+            about the isolated atom energies. Info will be passed
+            by reference
+        kwargs : dict
+            Additional arguments that will be passed to the
+            selected energy class. Mostly used for regression
+            to pass the regressor_kwargs.
+
+    Returns:
+        Initialized IsolatedEnergyInterface-like object
     """
     if data.energy_type == "formation":
         return PhysicalEnergy(data, **kwargs)
@@ -100,26 +102,16 @@ class AtomEnergies:
     """
 
     def __init__(self, data, **kwargs) -> None:
-        """
-        Parameters
-        ----------
-        data : openqdc.datasets.Dataset
-            Dataset object that contains the information
-            about the isolated atom energies. Info will be passed
-            by references
-        kwargs : dict
-            Additional arguments that will be passed to the
-            selected energy class. Mostly used for regression
-            to pass the regressor_kwargs.
-        """
-
         self.atom_energies = data.energy_type
         self.factory = dispatch_factory(data, **kwargs)
 
     @property
     def e0s_matrix(self) -> np.ndarray:
         """
-        Returns the isolated atom energies matrixes
+        Return the isolated atom energies matrices
+
+        Returns:
+            Matrix Array with the isolated atom energies
         """
         return self.factory.e0_matrix
 
@@ -127,6 +119,9 @@ def e0s_matrix(self) -> np.ndarray:
     def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:
         """
         Return the isolated atom energies dictionary
+
+        Returns:
+            Dictionary with the isolated atom energies
         """
         return self.factory.e0_dict
 
@@ -142,10 +137,18 @@ def __getitem__(self, item: AtomSpecies) -> AtomEnergy:
         Item can be written as tuple(Symbol, charge),
         tuple(Chemical number, charge). If no charge is passed,
         it will be automatically set to 0.
+
         Examples:
-            AtomEnergies[6], AtomEnergies[6,1],
-            AtomEnergies["C",1], AtomEnergies[(6,1)]
+            AtomEnergies[6], AtomEnergies[6,1], \n
+            AtomEnergies["C",1], AtomEnergies[(6,1)], \n
             AtomEnergies[("C,1)]
+
+        Parameters:
+            item:
+                AtomSpecies object or tuple with the atom symbol and charge
+
+        Returns:
+            AtomEnergy object with the isolated atom energy
         """
         try:
             atom, charge = item[0], item[1]
@@ -168,16 +171,15 @@ class IsolatedEnergyInterface(ABC):
 
     def __init__(self, data, **kwargs):
         """
-        Parameters
-        ----------
-        data : openqdc.datasets.Dataset
-            Dataset object that contains the information
-            about the isolated atom energies. Info will be passed
-            by references
-        kwargs : dict
-            Additional arguments that will be passed to the
-            selected energy class. Mostly used for regression
-            to pass the regressor_kwargs.
+        Parameters:
+            data : openqdc.datasets.Dataset
+                Dataset object that contains the information
+                about the isolated atom energies. Info will be passed
+                by reference
+            kwargs : dict
+                Additional arguments that will be passed to the
+                selected energy class. Mostly used for regression
+                to pass the regressor_kwargs.
         """
         self._e0_matrixs = []
         self._e0_dict = None
@@ -204,6 +206,9 @@ def __len__(self):
     def e0_matrix(self) -> np.ndarray:
         """
         Return the isolated atom energies matrixes
+
+        Returns:
+            Matrix Array with the isolated atom energies
         """
         return np.array(self._e0_matrixs)
 
@@ -211,6 +216,9 @@ def e0_matrix(self) -> np.ndarray:
     def e0_dict(self) -> Dict:
         """
         Return the isolated atom energies dict
+
+        Returns:
+            Dictionary with the isolated atom energies
         """
 
         return self._e0s_dict
@@ -276,11 +284,15 @@ def _post_init(self):
             self._set_lin_atom_species_dict(E0s, cov)
         self._set_linear_e0s()
 
-    def _compute_regression_e0s(self):
+    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:
         """
         Try to compute the regressed isolated atom energies.
         raise an error if the regression fails.
         return the regressed isolated atom energies and the uncertainty values.
+
+        Returns:
+            Tuple with the regressed isolated atom energies and the uncertainty values of the regression
+            if available.
         """
         try:
             E0s, cov = self.regressor.solve()
@@ -305,7 +317,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None:
     def _set_linear_e0s(self) -> None:
         """
         Transform the e0s dictionary into the correct e0s
-        matrix format
+        matrix format.
         """
         new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]
         for z, e0 in self._e0s_dict.items():
diff --git a/openqdc/datasets/properties.py b/openqdc/datasets/properties.py
index 81bc568b..9874330a 100644
--- a/openqdc/datasets/properties.py
+++ b/openqdc/datasets/properties.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 import numpy as np
 import pandas as pd
 
@@ -29,35 +31,62 @@ def _compute_average_nb_atoms(self):
         self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
 
     @property
-    def average_n_atoms(self):
+    def average_n_atoms(self) -> int:
         """
         Average number of atoms in a molecule in the dataset.
+
+        Returns:
+            Average number of atoms in a molecule in the dataset.
         """
         if self.__average_nb_atoms__ is None:
             raise StatisticsNotAvailableError(self.__name__)
         return self.__average_nb_atoms__
 
     @property
-    def numbers(self):
+    def numbers(self) -> np.ndarray:
+        """
+        Unique atomic numbers in the dataset
+
+        Returns:
+            Array of the unique atomic numbers in the dataset
+        """
         if hasattr(self, "_numbers"):
             return self._numbers
         self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
         return self._numbers
 
     @property
-    def charges(self):
+    def charges(self) -> np.ndarray:
+        """
+        Unique charges in the dataset
+
+        Returns:
+            Array of the unique charges in the dataset
+        """
         if hasattr(self, "_charges"):
             return self._charges
         self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32)
         return self._charges
 
     @property
-    def min_max_charges(self):
+    def min_max_charges(self) -> Tuple[int, int]:
+        """
+        Minimum and maximum charges in the dataset
+
+        Returns:
+            (min_charge, max_charge)
+        """
         if hasattr(self, "_min_max_charges"):
             return self._min_max_charges
         self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])
         return self._min_max_charges
 
     @property
-    def chemical_species(self):
+    def chemical_species(self) -> np.ndarray:
+        """
+        Chemical symbols in the dataset
+
+        Returns:
+            Array of the chemical symbols in the dataset
+        """
         return np.array(ATOM_SYMBOLS)[self.numbers]
diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py
index 6b1adeb5..8b9ae135 100644
--- a/openqdc/datasets/statistics.py
+++ b/openqdc/datasets/statistics.py
@@ -2,7 +2,7 @@
 from copy import deepcopy
 from dataclasses import asdict, dataclass
 from os.path import join as p_join
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 import numpy as np
 from loguru import logger
@@ -16,15 +16,22 @@ class StatisticsResults:
     to provide general methods.
     """
 
-    def to_dict(self):
+    def to_dict(self) -> Dict:
         """
         Convert the class to a dictionary
+
+        Returns:
+            Dictionary representation of the class
         """
         return asdict(self)
 
     def transform(self, func: Callable):
         """
         Apply a function to all the attributes of the class
+
+        Parameters:
+            func:
+                Function to apply to the attributes
         """
         for k, v in self.to_dict().items():
             if v is not None:
@@ -56,18 +63,19 @@ class ForceStatistics(StatisticsResults):
 
 class StatisticManager:
     """
-    Manager class to share the state between all
+    Manager class that automatically handles the shared state between
     the statistic calculators
     """
 
-    def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
+    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
         """
-        dataset : openqdc.datasets.base.BaseDataset
-            The dataset object to compute the statistics
-        recompute : bool, default = False
-            Flag to recompute the statistics
-        *statistic_calculators :  AbstractStatsCalculator
-            statistic calculators to run
+        Parameters:
+            dataset : openqdc.datasets.base.BaseDataset
+                The dataset object to compute the statistics
+            recompute:
+                Flag to recompute the statistics
+            *statistic_calculators:
+                List of statistic calculators to run
         """
         self._state = {}
         self._results = {}
@@ -80,6 +88,9 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab
     def state(self) -> Dict:
         """
         Return the dictionary state of the manager
+
+        Returns:
+            State of the StatisticManager
         """
         return self._state
 
@@ -95,25 +106,40 @@ def reset_results(self):
         """
         self._results = {}
 
-    def get_state(self, key: Optional[str] = None):
+    def get_state(self, key: Optional[str] = None) -> Optional[Any]:
         """
-        key : str, default = None
         Return the value of the key in the state dictionary
+
+        Parameters:
+            key: Key to look up in the state dictionary. If None, the whole state is returned.
+        Returns:
+            the value of the key in the state dictionary
             or the whole state dictionary if key is None
         """
         if key is None:
             return self._state
         return self._state.get(key, None)
 
-    def has_state(self, key: str):
+    def has_state(self, key: str) -> bool:
         """
         Check is state has key
+
+        Parameters:
+            key:
+                Key to check in the state dictionary
+
+        Returns:
+            True if the key is in the state dictionary
         """
         return key in self._state
 
     def get_results(self, as_dict: bool = False):
         """
         Aggregate results from all the calculators
+
+        Parameters:
+            as_dict:
+                Flag to return the results as a dictionary
         """
         results = deepcopy(self._results)
         if as_dict:
@@ -155,26 +181,27 @@ def __init__(
         forces: Optional[np.ndarray] = None,
     ):
         """
-        name : str
-            Name of the dataset for saving and loading.
-        energy_type : str, default = None
-            Type of the energy for the computation of the statistics. Used for loading and saving.
-        force_recompute : bool, default = False
-            Flag to force the recomputation of the statistics
-        energies : np.ndarray, default = None
-            Energies of the dataset
-        n_atoms : np.ndarray, default = None
-            Number of atoms in the dataset
-        atom_species : np.ndarray, default = None
-            Atomic species of the dataset
-        position_idx_range : np.ndarray, default = None
-            Position index range of the dataset
-        e0_matrix : np.ndarray, default = None
-            Isolated atom energies matrix of the dataset
-        atom_charges : np.ndarray, default = None
-            Atomic charges of the dataset
-        forces : np.ndarray, default = None
-            Forces of the dataset
+        Parameters:
+            name :
+                Name of the dataset for saving and loading.
+            energy_type :
+                Type of the energy for the computation of the statistics. Used for loading and saving.
+            force_recompute :
+                Flag to force the recomputation of the statistics
+            energies :
+                Energies of the dataset
+            n_atoms :
+                Number of atoms in the dataset
+            atom_species :
+                Atomic species of the dataset
+            position_idx_range :
+                Position index range of the dataset
+            e0_matrix :
+                Isolated atom energies matrix of the dataset
+            atom_charges :
+                Atomic charges of the dataset
+            forces :
+                Forces of the dataset
         """
         self.name = name
         self.energy_type = energy_type