Merge pull request #198 from y0z/feature/smac_output

Add `output_directory` option to `SMACSampler` and update `README.md` to describe APIs.
optuna · Dec 11, 2024 · 684ccc9 · 684ccc9
2 parents 035036e + 7e5e8b1
commit 684ccc9
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 3 deletions.
diff --git a/package/samplers/smac_sampler/README.md b/package/samplers/smac_sampler/README.md
@@ -7,9 +7,30 @@ optuna_versions: [3.6.1]
 license: MIT License
 ---
 
-## Class or Function Names
+## APIs
 
-- SAMCSampler
+A sampler that uses SMAC3 v2.2.0.
+
+Please check the API reference for more details:
+
+- https://automl.github.io/SMAC3/main/5_api.html
+
+### `SMACSampler(search_space: dict[str, BaseDistribution], n_trials: int = 100, seed: int | None = None, *, surrogate_model_type: str = "rf", acq_func_type: str = "ei_log", init_design_type: str = "sobol", surrogate_model_rf_num_trees: int = 10, surrogate_model_rf_ratio_features: float = 1.0, surrogate_model_rf_min_samples_split: int = 2, surrogate_model_rf_min_samples_leaf: int = 1, init_design_n_configs: int | None = None, init_design_n_configs_per_hyperparameter: int = 10, init_design_max_ratio: float = 0.25, output_directory: str = "smac3_output")`
+
+- `search_space`: A dictionary of Optuna distributions.
+- `n_trials`: Number of trials to be evaluated in a study. This argument is used to determine the number of initial configurations by SMAC3. Use at most `n_trials * init_design_max_ratio` number of configurations in the initial design. This argument does not have to be precise, but it is better to be exact for better performance.
+- `seed`: Seed for random number generator. If `None` is given, seed is generated randomly.
+- `surrogate_model_type`: What model to use for the probabilistic model. Either `"gp"` (Gaussian process), `"gp_mcmc"` (Gaussian process with MCMC), or `"rf"` (random forest). Defaults to `"rf"` (random forest).
+- `acq_func_type`: What acquisition function to use. Either `"ei"` (expected improvement), `"ei_log"` (expected improvement with log-scaled function), `"pi"` (probability of improvement), or `"lcb"` (lower confidence bound). Defaults to `"ei_log"`.
+- `init_design_type`: What initialization sampler to use. Either `"sobol"` (Sobol sequence), `"lhd"` (Latin hypercube), or `"random"`. Defaults to `"sobol"`.
+- `surrogate_model_rf_num_trees`: The number of trees used for random forest. Equivalent to `n_estimators` in `RandomForestRegressor` in sklearn.
+- `surrogate_model_rf_ratio_features`: The ratio of features to use for each tree training in random forest. Equivalent to `max_features` in `RandomForestRegressor` in sklearn.
+- `surrogate_model_rf_min_samples_split`: The minimum number of samples required to split an internal node. Equivalent to `min_samples_split` in `RandomForestRegressor` in sklearn.
+- `surrogate_model_rf_min_samples_leaf`: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least `min_samples_leaf` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. Equivalent to `min_samples_leaf` in `RandomForestRegressor` in sklearn.
+- `init_design_n_configs`: Number of initial configurations.
+- `init_design_n_configs_per_hyperparameter`: Number of initial configurations per hyperparameter. For example, if the configuration space covers five hyperparameters and `init_design_n_configs_per_hyperparameter` is set to 10, then 50 initial configurations will be sampled.
+- `init_design_max_ratio`: Use at most `n_trials * init_design_max_ratio` number of configurations in the initial design. Additional configurations are not affected by this parameter.
+- `output_directory`: Path of the directory in which to save the output. Defaults to `"smac3_output"`. The files are saved in `./output_directory/name/seed`.
 
 ## Installation
 
@@ -41,6 +62,7 @@ sampler = SMACSampler(
         "y": optuna.distributions.IntDistribution(-10, 10),
     },
     n_trials=n_trials,
+    output_directory="smac3_output",
 )
 study = optuna.create_study(sampler=sampler)
 study.optimize(objective, n_trials=n_trials)

diff --git a/package/samplers/smac_sampler/sampler.py b/package/samplers/smac_sampler/sampler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Sequence
+from pathlib import Path
 
 from ConfigSpace import Categorical
 from ConfigSpace import Configuration
@@ -96,6 +97,10 @@ class SMACSampler(optunahub.samplers.SimpleBaseSampler):
         init_design_max_ratio:
             Use at most ``n_trials * init_design_max_ratio`` number of configurations in the
             initial design. Additional configurations are not affected by this parameter.
+        output_directory:
+            Output directory path, defaults to "smac3_output".
+            The directory in which to save the output.
+            The files are saved in `./output_directory/name/seed`.
     """
 
     def __init__(
@@ -114,11 +119,16 @@ def __init__(
         init_design_n_configs: int | None = None,
         init_design_n_configs_per_hyperparameter: int = 10,
         init_design_max_ratio: float = 0.25,
+        output_directory: str = "smac3_output",
     ) -> None:
         super().__init__(search_space)
         self._cs, self._hp_scale_value = self._convert_to_config_space_design_space(search_space)
         scenario = Scenario(
-            configspace=self._cs, deterministic=True, n_trials=n_trials, seed=seed or -1
+            configspace=self._cs,
+            deterministic=True,
+            n_trials=n_trials,
+            seed=seed or -1,
+            output_directory=Path(output_directory),
         )
         surrogate_model = self._get_surrogate_model(
             scenario,