diff --git a/README.md b/README.md index 4d8f0281..92c1a086 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Optuna-Integration API reference is [here](https://optuna-integration.readthedoc * [Keras](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#keras) ([example](https://github.com/optuna/optuna-examples/tree/main/keras)) * [MXNet](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#mxnet) ([example](https://github.com/optuna/optuna-examples/tree/main/mxnet)) * [SHAP](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#shap) +* [sklearn](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#sklearn) ([example](https://github.com/optuna/optuna-examples/tree/main/sklearn/sklearn_optuna_search_cv_simple.py)) * [skorch](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#skorch) ([example](https://github.com/optuna/optuna-examples/tree/main/pytorch/skorch_simple.py)) * [TensorBoard](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#tensorboard) ([example](https://github.com/optuna/optuna-examples/tree/main/tensorboard/tensorboard_simple.py)) * [tf.keras](https://optuna-integration.readthedocs.io/en/stable/reference/index.html#tensorflow) ([example](https://github.com/optuna/optuna-examples/tree/main/tfkeras/tfkeras_integration.py)) diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 128d5bd9..1cdd6629 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -10,11 +10,11 @@ The former is provided for backward compatibility. For most of the ML frameworks supported by Optuna, the corresponding Optuna integration class serves only to implement a callback object and functions, compliant with the framework's specific callback API, to be called with each intermediate step in the model training. The functionality implemented in these callbacks across the different ML frameworks includes: -(1) Reporting intermediate model scores back to the Optuna trial using :func:`optuna.trial.Trial.report`, -(2) According to the results of :func:`optuna.trial.Trial.should_prune`, pruning the current model by raising :func:`optuna.TrialPruned`, and -(3) Reporting intermediate Optuna data such as the current trial number back to the framework, as done in :class:`~optuna.integration.MLflowCallback`. +(1) Reporting intermediate model scores back to the Optuna trial using `optuna.trial.Trial.report `_, +(2) According to the results of `optuna.trial.Trial.should_prune `_, pruning the current model by raising `optuna.TrialPruned `_, and +(3) Reporting intermediate Optuna data such as the current trial number back to the framework, as done in :class:`~optuna_integration.MLflowCallback`. -For scikit-learn, an integrated :class:`~optuna.integration.OptunaSearchCV` estimator is available that combines scikit-learn BaseEstimator functionality with access to a class-level ``Study`` object. +For scikit-learn, an integrated :class:`~optuna_integration.OptunaSearchCV` estimator is available that combines scikit-learn BaseEstimator functionality with access to a class-level ``Study`` object. 
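As a concrete reference for the three-part callback contract described in the paragraph above, here is a minimal sketch of the pattern these integration classes follow. It is a non-authoritative illustration: `FrameworkPruningCallback` and its `on_epoch_end` hook are hypothetical stand-ins for each framework's own callback API.

```python
import optuna


class FrameworkPruningCallback:
    """Minimal sketch of the pruning-callback pattern (hypothetical hook names)."""

    def __init__(self, trial: optuna.trial.Trial) -> None:
        self._trial = trial

    def on_epoch_end(self, step: int, score: float) -> None:
        # (1) Report the intermediate model score back to the Optuna trial.
        self._trial.report(score, step=step)
        # (2) Prune the current model if the trial is unpromising.
        if self._trial.should_prune():
            raise optuna.TrialPruned(f"Trial was pruned at step {step}.")
```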
AllenNLP -------- @@ -23,9 +23,9 @@ AllenNLP :toctree: generated/ :nosignatures: - optuna.integration.AllenNLPExecutor - optuna.integration.allennlp.dump_best_config - optuna.integration.AllenNLPPruningCallback + optuna_integration.AllenNLPExecutor + optuna_integration.allennlp.dump_best_config + optuna_integration.AllenNLPPruningCallback Catalyst -------- @@ -34,7 +34,7 @@ Catalyst :toctree: generated/ :nosignatures: - optuna.integration.CatalystPruningCallback + optuna_integration.CatalystPruningCallback CatBoost -------- @@ -43,7 +43,7 @@ CatBoost :toctree: generated/ :nosignatures: - optuna.integration.CatBoostPruningCallback + optuna_integration.CatBoostPruningCallback Chainer ------- @@ -52,8 +52,8 @@ Chainer :toctree: generated/ :nosignatures: - optuna.integration.ChainerPruningExtension - optuna.integration.ChainerMNStudy + optuna_integration.ChainerPruningExtension + optuna_integration.ChainerMNStudy Dask ---- @@ -71,9 +71,9 @@ fast.ai :toctree: generated/ :nosignatures: - optuna.integration.FastAIV1PruningCallback - optuna.integration.FastAIV2PruningCallback - optuna.integration.FastAIPruningCallback + optuna_integration.FastAIV1PruningCallback + optuna_integration.FastAIV2PruningCallback + optuna_integration.FastAIPruningCallback Keras ----- @@ -82,7 +82,7 @@ Keras :toctree: generated/ :nosignatures: - optuna.integration.KerasPruningCallback + optuna_integration.KerasPruningCallback MXNet ----- @@ -91,7 +91,7 @@ MXNet :toctree: generated/ :nosignatures: - optuna.integration.MXNetPruningCallback + optuna_integration.MXNetPruningCallback SHAP ---- @@ -100,7 +100,16 @@ SHAP :toctree: generated/ :nosignatures: - optuna.integration.ShapleyImportanceEvaluator + optuna_integration.ShapleyImportanceEvaluator + +sklearn +------- + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + optuna_integration.OptunaSearchCV skorch ------ @@ -109,7 +118,7 @@ skorch :toctree: generated/ :nosignatures: - optuna.integration.SkorchPruningCallback + optuna_integration.SkorchPruningCallback TensorBoard ----------- @@ -118,7 +127,7 @@ TensorBoard :toctree: generated/ :nosignatures: - optuna.integration.TensorBoardCallback + optuna_integration.TensorBoardCallback TensorFlow ---------- @@ -127,4 +136,4 @@ TensorFlow :toctree: generated/ :nosignatures: - optuna.integration.TFKerasPruningCallback + optuna_integration.TFKerasPruningCallback diff --git a/optuna_integration/__init__.py b/optuna_integration/__init__.py index e69de29b..10d47ef6 100644 --- a/optuna_integration/__init__.py +++ b/optuna_integration/__init__.py @@ -0,0 +1,110 @@ +import os +import sys +from types import ModuleType +from typing import Any +from typing import TYPE_CHECKING + + +_import_structure = { + "allennlp": ["AllenNLPExecutor", "AllenNLPPruningCallback"], + "catalyst": ["CatalystPruningCallback"], + "catboost": ["CatBoostPruningCallback"], + "chainer": ["ChainerPruningExtension"], + "chainermn": ["ChainerMNStudy"], + "fastaiv1": ["FastAIV1PruningCallback"], + "fastaiv2": ["FastAIV2PruningCallback", "FastAIPruningCallback"], + "keras": ["KerasPruningCallback"], + "mxnet": ["MXNetPruningCallback"], + "shap": ["ShapleyImportanceEvaluator"], + "sklearn": ["OptunaSearchCV"], + "skorch": ["SkorchPruningCallback"], + "tensorboard": ["TensorBoardCallback"], + "tensorflow": ["TensorFlowPruningHook"], + "tfkeras": ["TFKerasPruningCallback"], +} + + +if TYPE_CHECKING: + from optuna_integration.allennlp import AllenNLPExecutor + from optuna_integration.allennlp import AllenNLPPruningCallback + from optuna_integration.catalyst 
import CatalystPruningCallback + from optuna_integration.catboost import CatBoostPruningCallback + from optuna_integration.chainer import ChainerPruningExtension + from optuna_integration.chainermn import ChainerMNStudy + from optuna_integration.fastaiv1 import FastAIV1PruningCallback + from optuna_integration.fastaiv2 import FastAIPruningCallback + from optuna_integration.fastaiv2 import FastAIV2PruningCallback + from optuna_integration.keras import KerasPruningCallback + from optuna_integration.mxnet import MXNetPruningCallback + from optuna_integration.shap import ShapleyImportanceEvaluator + from optuna_integration.sklearn import OptunaSearchCV + from optuna_integration.skorch import SkorchPruningCallback + from optuna_integration.tensorboard import TensorBoardCallback + from optuna_integration.tensorflow import TensorFlowPruningHook + from optuna_integration.tfkeras import TFKerasPruningCallback +else: + + class _IntegrationModule(ModuleType): + """Module class that implements `optuna_integration` package. + + This class applies lazy import under `optuna_integration`, where submodules are imported + when they are actually accessed. Otherwise, `import optuna_integration` becomes much slower because it + imports all submodules and their dependencies (e.g., chainer, keras, lightgbm) all at once. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + _modules = set(_import_structure.keys()) + _class_to_module = {} + for key, values in _import_structure.items(): + for value in values: + _class_to_module[value] = key + + def __getattr__(self, name: str) -> Any: + if name in self._modules: + value = self._get_module(name) + elif name in self._class_to_module.keys(): + module = self._get_module(self._class_to_module[name]) + value = getattr(module, name) + else: + raise AttributeError("module {} has no attribute {}".format(self.__name__, name)) + + setattr(self, name, value) + return value + + def _get_module(self, module_name: str) -> ModuleType: + import importlib + + try: + return importlib.import_module("." + module_name, self.__name__) + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Optuna's integration modules for third-party libraries have started " + "migrating from Optuna itself to a package called `optuna-integration`. " + "The module you are trying to use has already been migrated to " + "`optuna-integration`. Please install the package by running " + "`pip install optuna-integration`."
+ ) + + sys.modules[__name__] = _IntegrationModule(__name__) + +__all__ = [ + "AllenNLPExecutor", + "AllenNLPPruningCallback", + "CatalystPruningCallback", + "CatBoostPruningCallback", + "ChainerMNStudy", + "ChainerPruningExtension", + "FastAIPruningCallback", + "FastAIV1PruningCallback", + "FastAIV2PruningCallback", + "KerasPruningCallback", + "MXNetPruningCallback", + "OptunaSearchCV", + "ShapleyImportanceEvaluator", + "SkorchPruningCallback", + "TensorBoardCallback", + "TensorFlowPruningHook", + "TFKerasPruningCallback", +] diff --git a/optuna_integration/shap.py b/optuna_integration/shap.py index 327c97c6..d20d5d13 100644 --- a/optuna_integration/shap.py +++ b/optuna_integration/shap.py @@ -18,9 +18,8 @@ with try_import() as _imports: - from sklearn.ensemble import RandomForestRegressor - from shap import TreeExplainer + from sklearn.ensemble import RandomForestRegressor @experimental_class("3.0.0") diff --git a/optuna_integration/sklearn.py b/optuna_integration/sklearn.py new file mode 100644 index 00000000..86ab2270 --- /dev/null +++ b/optuna_integration/sklearn.py @@ -0,0 +1,967 @@ +from __future__ import annotations + +from collections.abc import Callable +from collections.abc import Iterable +from collections.abc import Mapping +from logging import DEBUG +from logging import INFO +from logging import WARNING +from numbers import Integral +from numbers import Number +from time import time +from typing import Any +from typing import List +from typing import Union + +import numpy as np +from optuna import distributions +from optuna import logging +from optuna import samplers +from optuna import study as study_module +from optuna import TrialPruned +from optuna._experimental import experimental_class +from optuna._imports import try_import +from optuna.distributions import _convert_old_distribution_to_new_distribution +from optuna.study import StudyDirection +from optuna.terminator import report_cross_validation_scores +from optuna.trial import FrozenTrial +from optuna.trial import Trial + + +with try_import() as _imports: + import pandas as pd + import scipy as sp + from scipy.sparse import spmatrix + + import sklearn + from sklearn.base import BaseEstimator + from sklearn.base import clone + from sklearn.base import is_classifier + from sklearn.metrics import check_scoring + from sklearn.model_selection import BaseCrossValidator + from sklearn.model_selection import check_cv + from sklearn.model_selection import cross_validate + from sklearn.utils import _safe_indexing as sklearn_safe_indexing + from sklearn.utils import check_random_state + from sklearn.utils.metaestimators import _safe_split + from sklearn.utils.validation import check_is_fitted + + +if not _imports.is_successful(): + BaseEstimator = object # NOQA + +ArrayLikeType = Union[List, np.ndarray, "pd.Series", "spmatrix"] +OneDimArrayLikeType = Union[List[float], np.ndarray, "pd.Series"] +TwoDimArrayLikeType = Union[List[List[float]], np.ndarray, "pd.DataFrame", "spmatrix"] +IterableType = Union[List, "pd.DataFrame", np.ndarray, "pd.Series", "spmatrix", None] +IndexableType = Union[Iterable, None] + +_logger = logging.get_logger(__name__) + + +def _check_fit_params( + X: TwoDimArrayLikeType, fit_params: dict, indices: OneDimArrayLikeType +) -> dict: + fit_params_validated = {} + for key, value in fit_params.items(): + # NOTE Original implementation: + # https://github.com/scikit-learn/scikit-learn/blob/ \ + # 2467e1b84aeb493a22533fa15ff92e0d7c05ed1c/sklearn/utils/validation.py#L1324-L1328 + # Scikit-learn does not accept 
non-iterable inputs. + # This line is for keeping backward compatibility. + # (See: https://github.com/scikit-learn/scikit-learn/issues/15805) + if not _is_arraylike(value) or _num_samples(value) != _num_samples(X): + fit_params_validated[key] = value + else: + fit_params_validated[key] = _make_indexable(value) + fit_params_validated[key] = _safe_indexing(fit_params_validated[key], indices) + return fit_params_validated + + +# NOTE Original implementation: +# https://github.com/scikit-learn/scikit-learn/blob/ \ +# 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L131-L135 +def _is_arraylike(x: Any) -> bool: + return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__") + + +# NOTE Original implementation: +# https://github.com/scikit-learn/scikit-learn/blob/ \ +# 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L217-L234 +def _make_indexable(iterable: IterableType) -> IndexableType: + tocsr_func = getattr(iterable, "tocsr", None) + if tocsr_func is not None and sp.sparse.issparse(iterable): + return tocsr_func(iterable) + elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"): + return iterable + elif iterable is None: + return iterable + return np.array(iterable) + + +def _num_samples(x: ArrayLikeType) -> int: + # NOTE For dask dataframes + # https://github.com/scikit-learn/scikit-learn/blob/ \ + # 8caa93889f85254fc3ca84caa0a24a1640eebdd1/sklearn/utils/validation.py#L155-L158 + x_shape = getattr(x, "shape", None) + if x_shape is not None: + if isinstance(x_shape[0], Integral): + return int(x_shape[0]) + + try: + return len(x) + except TypeError: + raise TypeError("Expected sequence or array-like, got %s." % type(x)) from None + + +def _safe_indexing( + X: OneDimArrayLikeType | TwoDimArrayLikeType, indices: OneDimArrayLikeType +) -> OneDimArrayLikeType | TwoDimArrayLikeType: + if X is None: + return X + + return sklearn_safe_indexing(X, indices) + + +class _Objective: + """Callable that implements objective function. + + Args: + estimator: + Object to use to fit the data. This is assumed to implement the + scikit-learn estimator interface. Either this needs to provide + ``score``, or ``scoring`` must be passed. + + param_distributions: + Dictionary where keys are parameters and values are distributions. + Distributions are assumed to implement the optuna distribution + interface. + + X: + Training data. + + y: + Target variable. + + cv: + Cross-validation strategy. + + enable_pruning: + If :obj:`True`, pruning is performed in the case where the + underlying estimator supports ``partial_fit``. + + error_score: + Value to assign to the score if an error occurs in fitting. If + 'raise', the error is raised. If numeric, + ``sklearn.exceptions.FitFailedWarning`` is raised. This does not + affect the refit step, which will always raise the error. + + fit_params: + Parameters passed to ``fit`` on the estimator. + + groups: + Group labels for the samples used while splitting the dataset into + train/validation set. + + max_iter: + Maximum number of epochs. This is only used if the underlying + estimator supports ``partial_fit``. + + return_train_score: + If :obj:`True`, training scores will be included. Computing + training scores is used to get insights on how different + hyperparameter settings impact the overfitting/underfitting + trade-off. However, computing training scores can be + computationally expensive and is not strictly required to select + the hyperparameters that yield the best generalization + performance.
+ + scoring: + Scorer function. + """ + + def __init__( + self, + estimator: "sklearn.base.BaseEstimator", + param_distributions: Mapping[str, distributions.BaseDistribution], + X: TwoDimArrayLikeType, + y: OneDimArrayLikeType | TwoDimArrayLikeType | None, + cv: "BaseCrossValidator", + enable_pruning: bool, + error_score: Number | float | str, + fit_params: dict[str, Any], + groups: OneDimArrayLikeType | None, + max_iter: int, + return_train_score: bool, + scoring: Callable[..., Number], + ) -> None: + self.cv = cv + self.enable_pruning = enable_pruning + self.error_score = error_score + self.estimator = estimator + self.fit_params = fit_params + self.groups = groups + self.max_iter = max_iter + self.param_distributions = param_distributions + self.return_train_score = return_train_score + self.scoring = scoring + self.X = X + self.y = y + + def __call__(self, trial: Trial) -> float: + estimator = clone(self.estimator) + params = self._get_params(trial) + + estimator.set_params(**params) + + if self.enable_pruning: + scores = self._cross_validate_with_pruning(trial, estimator) + else: + sklearn_version = sklearn.__version__.split(".") + sklearn_major_version = int(sklearn_version[0]) + sklearn_minor_version = int(sklearn_version[1]) + try: + if sklearn_major_version == 1 and sklearn_minor_version >= 4: + scores = cross_validate( + estimator, + self.X, + self.y, + cv=self.cv, + error_score=self.error_score, + params=self.fit_params, + groups=self.groups, + return_train_score=self.return_train_score, + scoring=self.scoring, + ) + else: + scores = cross_validate( + estimator, + self.X, + self.y, + cv=self.cv, + error_score=self.error_score, + fit_params=self.fit_params, + groups=self.groups, + return_train_score=self.return_train_score, + scoring=self.scoring, + ) + except ValueError: + n_splits = self.cv.get_n_splits(self.X, self.y, self.groups) + fit_time = np.array([np.nan] * n_splits) + score_time = np.array([np.nan] * n_splits) + test_score = np.array( + [self.error_score if self.error_score is not None else np.nan] * n_splits + ) + + scores = { + "fit_time": fit_time, + "score_time": score_time, + "test_score": test_score, + } + + self._store_scores(trial, scores) + + test_scores = scores["test_score"] + scores_list = test_scores if isinstance(test_scores, list) else test_scores.tolist() + report_cross_validation_scores(trial, scores_list) + + return trial.user_attrs["mean_test_score"] + + def _cross_validate_with_pruning( + self, trial: Trial, estimator: "sklearn.base.BaseEstimator" + ) -> Mapping[str, OneDimArrayLikeType]: + if is_classifier(estimator): + partial_fit_params = self.fit_params.copy() + y = self.y.values if isinstance(self.y, pd.Series) else self.y + classes = np.unique(y) + + partial_fit_params.setdefault("classes", classes) + + else: + partial_fit_params = self.fit_params + + n_splits = self.cv.get_n_splits(self.X, self.y, groups=self.groups) + estimators = [clone(estimator) for _ in range(n_splits)] + scores = { + "fit_time": np.zeros(n_splits), + "score_time": np.zeros(n_splits), + "test_score": np.empty(n_splits), + } + + if self.return_train_score: + scores["train_score"] = np.empty(n_splits) + + for step in range(self.max_iter): + for i, (train, test) in enumerate(self.cv.split(self.X, self.y, groups=self.groups)): + out = self._partial_fit_and_score(estimators[i], train, test, partial_fit_params) + + if self.return_train_score: + scores["train_score"][i] = out.pop(0) + + scores["test_score"][i] = out[0] + scores["fit_time"][i] += out[1] + scores["score_time"][i] 
+= out[2] + + intermediate_value = np.nanmean(scores["test_score"]) + + trial.report(intermediate_value, step=step) + + if trial.should_prune(): + self._store_scores(trial, scores) + + raise TrialPruned("trial was pruned at iteration {}.".format(step)) + + return scores + + def _get_params(self, trial: Trial) -> dict[str, Any]: + return { + name: trial._suggest(name, distribution) + for name, distribution in self.param_distributions.items() + } + + def _partial_fit_and_score( + self, + estimator: "sklearn.base.BaseEstimator", + train: list[int], + test: list[int], + partial_fit_params: dict[str, Any], + ) -> list[Number]: + X_train, y_train = _safe_split(estimator, self.X, self.y, train) + X_test, y_test = _safe_split(estimator, self.X, self.y, test, train_indices=train) + + start_time = time() + + try: + estimator.partial_fit(X_train, y_train, **partial_fit_params) + + except Exception as e: + if self.error_score == "raise": + raise e + + elif isinstance(self.error_score, Number): + fit_time = time() - start_time + test_score = self.error_score + score_time = 0.0 + + if self.return_train_score: + train_score = self.error_score + + else: + raise ValueError("error_score must be 'raise' or numeric.") from e + + else: + fit_time = time() - start_time + test_score = self.scoring(estimator, X_test, y_test) + score_time = time() - fit_time - start_time + + if self.return_train_score: + train_score = self.scoring(estimator, X_train, y_train) + + # Required for type checking but is never expected to fail. + assert isinstance(fit_time, Number) + assert isinstance(score_time, Number) + + ret = [test_score, fit_time, score_time] + + if self.return_train_score: + ret.insert(0, train_score) + + return ret + + def _store_scores(self, trial: Trial, scores: Mapping[str, OneDimArrayLikeType]) -> None: + for name, array in scores.items(): + if name in ["test_score", "train_score"]: + for i, score in enumerate(array): + trial.set_user_attr("split{}_{}".format(i, name), score) + + trial.set_user_attr("mean_{}".format(name), np.nanmean(array)) + trial.set_user_attr("std_{}".format(name), np.nanstd(array)) + + +@experimental_class("0.17.0") +class OptunaSearchCV(BaseEstimator): + """Hyperparameter search with cross-validation. + + Args: + estimator: + Object to use to fit the data. This is assumed to implement the + scikit-learn estimator interface. Either this needs to provide + ``score``, or ``scoring`` must be passed. + + param_distributions: + Dictionary where keys are parameters and values are distributions. + Distributions are assumed to implement the optuna distribution + interface. + + cv: + Cross-validation strategy. Possible inputs for cv are: + + - :obj:`None`, to use the default 5-fold cross validation, + - integer to specify the number of folds in a CV splitter, + - `CV splitter `_, + - an iterable yielding (train, validation) splits as arrays of indices. + + For integer, if ``estimator`` is a classifier and ``y`` is + either binary or multiclass, + ``sklearn.model_selection.StratifiedKFold`` is used. Otherwise, + ``sklearn.model_selection.KFold`` is used. + + enable_pruning: + If :obj:`True`, pruning is performed in the case where the + underlying estimator supports ``partial_fit``. + + error_score: + Value to assign to the score if an error occurs in fitting. If + 'raise', the error is raised. If numeric, + ``sklearn.exceptions.FitFailedWarning`` is raised. This does not + affect the refit step, which will always raise the error. + + max_iter: + Maximum number of epochs.
This is only used if the underlying + estimator supports ``partial_fit``. + + n_jobs: + Number of :obj:`threading` based parallel jobs. :obj:`None` means ``1``. + ``-1`` means using the CPU count. + + .. note:: + ``n_jobs`` allows parallelization using :obj:`threading` and may suffer from + `Python's GIL `_. + It is recommended to use `process-based optimization `_ + if the objective is CPU bound. + + n_trials: + Number of trials. If :obj:`None`, there is no limitation on the + number of trials. If ``timeout`` is also set to :obj:`None`, + the study continues to create trials until it receives a + termination signal such as Ctrl+C or SIGTERM. This trades off + runtime vs quality of the solution. + + random_state: + Seed of the pseudo random number generator. If int, this is the + seed used by the random number generator. If + ``numpy.random.RandomState`` object, this is the random number + generator. If :obj:`None`, the global random state from + ``numpy.random`` is used. + + refit: + If :obj:`True`, refit the estimator with the best found + hyperparameters. The refitted estimator is made available at the + ``best_estimator_`` attribute and permits using ``predict`` + directly. + + return_train_score: + If :obj:`True`, training scores will be included. Computing + training scores is used to get insights on how different + hyperparameter settings impact the overfitting/underfitting + trade-off. However, computing training scores can be + computationally expensive and is not strictly required to select + the hyperparameters that yield the best generalization + performance. + + scoring: + String or callable to evaluate the predictions on the validation data. + If :obj:`None`, ``score`` on the estimator is used. + + study: + Study corresponding to the optimization task. If :obj:`None`, a new + study is created. + + subsample: + Proportion of samples that are used during hyperparameter search. + + - If int, then draw ``subsample`` samples. + - If float, then draw ``subsample`` * ``X.shape[0]`` samples. + + timeout: + Time limit in seconds for the search of appropriate models. If + :obj:`None`, the study is executed without time limitation. If + ``n_trials`` is also set to :obj:`None`, the study continues to + create trials until it receives a termination signal such as + Ctrl+C or SIGTERM. This trades off runtime vs quality of the + solution. + + verbose: + Verbosity level. The higher, the more messages. + + callbacks: + List of callback functions that are invoked at the end of each trial. Each function + must accept two parameters with the following types in this order: + :class:`~optuna.study.Study` and :class:`~optuna.trial.FrozenTrial`. + + .. seealso:: + + See the tutorial of `Callback for Study.optimize `_ + for how to use and implement callback functions. + + Attributes: + best_estimator_: + Estimator that was chosen by the search. This is present only if + ``refit`` is set to :obj:`True`. + + n_splits_: + Number of cross-validation splits. + + refit_time_: + Time for refitting the best estimator. This is present only if + ``refit`` is set to :obj:`True`. + + sample_indices_: + Indices of samples that are used during hyperparameter search. + + scorer_: + Scorer function. + + study_: + Actual study. + + Examples: + + ..
testcode:: + + import optuna + import optuna_integration + + from sklearn.datasets import load_iris + from sklearn.svm import SVC + + clf = SVC(gamma="auto") + param_distributions = { + "C": optuna.distributions.FloatDistribution(1e-10, 1e10, log=True) + } + optuna_search = optuna_integration.OptunaSearchCV(clf, param_distributions) + X, y = load_iris(return_X_y=True) + optuna_search.fit(X, y) + y_pred = optuna_search.predict(X) + + .. note:: + By following the scikit-learn convention for scorers, the direction of optimization is + ``maximize``. See https://scikit-learn.org/stable/modules/model_evaluation.html. + For a minimization problem, please multiply the score by ``-1``. + """ # NOQA: E501 + + _required_parameters = ["estimator", "param_distributions"] + + @property + def _estimator_type(self) -> str: + return self.estimator._estimator_type + + @property + def best_index_(self) -> int: + """Trial number which corresponds to the best candidate parameter setting. + + Returned value is equivalent to ``optuna_search.best_trial_.number``. + """ + + return self.best_trial_.number + + @property + def best_params_(self) -> dict[str, Any]: + """Parameters of the best trial in the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.best_params + + @property + def best_score_(self) -> float: + """Mean cross-validated score of the best estimator.""" + + self._check_is_fitted() + + return self.study_.best_value + + @property + def best_trial_(self) -> FrozenTrial: + """Best trial in the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.best_trial + + @property + def classes_(self) -> OneDimArrayLikeType: + """Class labels.""" + + self._check_is_fitted() + + return self.best_estimator_.classes_ + + @property + def cv_results_(self) -> dict[str, Any]: + """A dictionary mapping a metric name to a list of cross-validation results of all trials.""" # NOQA: E501 + + cv_results_dict_in_list = [trial_.user_attrs for trial_ in self.trials_] + if len(cv_results_dict_in_list) == 0: + cv_results_list_in_dict = {} + else: + cv_results_list_in_dict = { + key: [dict_[key] for dict_ in cv_results_dict_in_list] + for key in cv_results_dict_in_list[0] + } + return cv_results_list_in_dict + + @property + def n_trials_(self) -> int: + """Actual number of trials.""" + + return len(self.trials_) + + @property + def trials_(self) -> list[FrozenTrial]: + """All trials in the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.trials + + @property + def user_attrs_(self) -> dict[str, Any]: + """User attributes in the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.user_attrs + + @property + def decision_function(self) -> Callable[..., OneDimArrayLikeType | TwoDimArrayLikeType]: + """Call ``decision_function`` on the best estimator. + + This is available only if the underlying estimator supports + ``decision_function`` and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.decision_function + + @property + def inverse_transform(self) -> Callable[..., TwoDimArrayLikeType]: + """Call ``inverse_transform`` on the best estimator. + + This is available only if the underlying estimator supports + ``inverse_transform`` and ``refit`` is set to :obj:`True`.
+ """ + + self._check_is_fitted() + + return self.best_estimator_.inverse_transform + + @property + def predict(self) -> Callable[..., OneDimArrayLikeType | TwoDimArrayLikeType]: + """Call ``predict`` on the best estimator. + + This is available only if the underlying estimator supports ``predict`` + and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.predict + + @property + def predict_log_proba(self) -> Callable[..., TwoDimArrayLikeType]: + """Call ``predict_log_proba`` on the best estimator. + + This is available only if the underlying estimator supports + ``predict_log_proba`` and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.predict_log_proba + + @property + def predict_proba(self) -> Callable[..., TwoDimArrayLikeType]: + """Call ``predict_proba`` on the best estimator. + + This is available only if the underlying estimator supports + ``predict_proba`` and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.predict_proba + + @property + def score_samples(self) -> Callable[..., OneDimArrayLikeType]: + """Call ``score_samples`` on the best estimator. + + This is available only if the underlying estimator supports + ``score_samples`` and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.score_samples + + @property + def set_user_attr(self) -> Callable[..., None]: + """Call ``set_user_attr`` on the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.set_user_attr + + @property + def transform(self) -> Callable[..., TwoDimArrayLikeType]: + """Call ``transform`` on the best estimator. + + This is available only if the underlying estimator supports + ``transform`` and ``refit`` is set to :obj:`True`. + """ + + self._check_is_fitted() + + return self.best_estimator_.transform + + @property + def trials_dataframe(self) -> Callable[..., "pd.DataFrame"]: + """Call ``trials_dataframe`` on the :class:`~optuna.study.Study`.""" + + self._check_is_fitted() + + return self.study_.trials_dataframe + + def __init__( + self, + estimator: "sklearn.base.BaseEstimator", + param_distributions: Mapping[str, distributions.BaseDistribution], + *, + cv: int | "BaseCrossValidator" | Iterable | None = None, + enable_pruning: bool = False, + error_score: Number | float | str = np.nan, + max_iter: int = 1000, + n_jobs: int | None = None, + n_trials: int | None = 10, + random_state: int | np.random.RandomState | None = None, + refit: bool = True, + return_train_score: bool = False, + scoring: Callable[..., float] | str | None = None, + study: study_module.Study | None = None, + subsample: float | int = 1.0, + timeout: float | None = None, + verbose: int = 0, + callbacks: list[Callable[[study_module.Study, FrozenTrial], None]] | None = None, + ) -> None: + _imports.check() + + if not isinstance(param_distributions, dict): + raise TypeError("param_distributions must be a dictionary.") + + # Rejecting deprecated distributions as they may cause cryptic error + # when cloning OptunaSearchCV instance. + # https://github.com/optuna/optuna/issues/4084 + for key, dist in param_distributions.items(): + if dist != _convert_old_distribution_to_new_distribution(dist): + raise ValueError( + f"Deprecated distribution is specified in `{key}` of param_distributions. " + "Rejecting this because it may cause unexpected behavior. " + "Please use new distributions such as FloatDistribution etc." 
+ ) + + self.cv = cv + self.enable_pruning = enable_pruning + self.error_score = error_score + self.estimator = estimator + self.max_iter = max_iter + self.n_trials = n_trials + self.n_jobs = n_jobs if n_jobs else 1 + self.param_distributions = param_distributions + self.random_state = random_state + self.refit = refit + self.return_train_score = return_train_score + self.scoring = scoring + self.study = study + self.subsample = subsample + self.timeout = timeout + self.verbose = verbose + self.callbacks = callbacks + + def _check_is_fitted(self) -> None: + attributes = ["n_splits_", "sample_indices_", "scorer_", "study_"] + + if self.refit: + attributes += ["best_estimator_", "refit_time_"] + + check_is_fitted(self, attributes) + + def _check_params(self) -> None: + if not hasattr(self.estimator, "fit"): + raise ValueError("estimator must be a scikit-learn estimator.") + + for name, distribution in self.param_distributions.items(): + if not isinstance(distribution, distributions.BaseDistribution): + raise ValueError("Value of {} must be an optuna distribution.".format(name)) + + if self.enable_pruning and not hasattr(self.estimator, "partial_fit"): + raise ValueError("estimator must support partial_fit.") + + if self.max_iter <= 0: + raise ValueError("max_iter must be > 0, got {}.".format(self.max_iter)) + + if self.study is not None and self.study.direction != StudyDirection.MAXIMIZE: + raise ValueError("direction of study must be 'maximize'.") + + def _more_tags(self) -> dict[str, bool]: + return {"non_deterministic": True, "no_validation": True} + + def _refit( + self, + X: TwoDimArrayLikeType, + y: OneDimArrayLikeType | TwoDimArrayLikeType | None = None, + **fit_params: Any, + ) -> "OptunaSearchCV": + n_samples = _num_samples(X) + + self.best_estimator_ = clone(self.estimator) + + try: + self.best_estimator_.set_params(**self.study_.best_params) + except ValueError as e: + _logger.exception(e) + + _logger.info("Refitting the estimator using {} samples...".format(n_samples)) + + start_time = time() + + self.best_estimator_.fit(X, y, **fit_params) + + self.refit_time_ = time() - start_time + + _logger.info("Finished refitting! (elapsed time: {:.3f} sec.)".format(self.refit_time_)) + + return self + + def fit( + self, + X: TwoDimArrayLikeType, + y: OneDimArrayLikeType | TwoDimArrayLikeType | None = None, + groups: OneDimArrayLikeType | None = None, + **fit_params: Any, + ) -> "OptunaSearchCV": + """Run fit with all sets of parameters. + + Args: + X: + Training data. + + y: + Target variable. + + groups: + Group labels for the samples used while splitting the dataset + into train/validation set. + + **fit_params: + Parameters passed to ``fit`` on the estimator. + + Returns: + self.
+ """ + + self._check_params() + + random_state = check_random_state(self.random_state) + max_samples = self.subsample + n_samples = _num_samples(X) + old_level = _logger.getEffectiveLevel() + + if self.verbose > 1: + _logger.setLevel(DEBUG) + elif self.verbose > 0: + _logger.setLevel(INFO) + else: + _logger.setLevel(WARNING) + + self.sample_indices_ = np.arange(n_samples) + + if type(max_samples) is float: + max_samples = int(max_samples * n_samples) + + if max_samples < n_samples: + self.sample_indices_ = random_state.choice( + self.sample_indices_, max_samples, replace=False + ) + + self.sample_indices_.sort() + + X_res = _safe_indexing(X, self.sample_indices_) + y_res = _safe_indexing(y, self.sample_indices_) + groups_res = _safe_indexing(groups, self.sample_indices_) + fit_params_res = fit_params + + if fit_params_res is not None: + fit_params_res = _check_fit_params(X, fit_params, self.sample_indices_) + + classifier = is_classifier(self.estimator) + cv = check_cv(self.cv, y_res, classifier=classifier) + + self.n_splits_ = cv.get_n_splits(X_res, y_res, groups=groups_res) + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) + + if self.study is None: + seed = random_state.randint(0, np.iinfo("int32").max) + sampler = samplers.TPESampler(seed=seed) + + self.study_ = study_module.create_study(direction="maximize", sampler=sampler) + + else: + self.study_ = self.study + + objective = _Objective( + self.estimator, + self.param_distributions, + X_res, + y_res, + cv, + self.enable_pruning, + self.error_score, + fit_params_res, + groups_res, + self.max_iter, + self.return_train_score, + self.scorer_, + ) + + _logger.info( + "Searching the best hyperparameters using {} " + "samples...".format(_num_samples(self.sample_indices_)) + ) + + self.study_.optimize( + objective, + n_jobs=self.n_jobs, + n_trials=self.n_trials, + timeout=self.timeout, + callbacks=self.callbacks, + ) + + _logger.info("Finished hyperparameter search!") + + if self.refit: + self._refit(X, y, **fit_params) + + _logger.setLevel(old_level) + + return self + + def score( + self, + X: TwoDimArrayLikeType, + y: OneDimArrayLikeType | TwoDimArrayLikeType | None = None, + ) -> float: + """Return the score on the given data. + + Args: + X: + Data. + + y: + Target variable. + + Returns: + Scaler score. 
+ """ + + return self.scorer_(self.best_estimator_, X, y) diff --git a/pyproject.toml b/pyproject.toml index d22fed52..fc4403ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ checking = [ "typing_extensions>=3.10.0.0", ] document = [ + "pandas", + "scikit-learn>=0.24.2", + "scipy>=1.9.2; python_version>='3.8'", "sphinx", "sphinx_rtd_theme", ] @@ -58,7 +61,10 @@ all = [ "distributed", "fastai", "mxnet", + "pandas", "shap", + "scikit-learn>=0.24.2", + "scipy>=1.9.2; python_version>='3.8'", "skorch", "tensorboard", "tensorflow", diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py new file mode 100644 index 00000000..603467ab --- /dev/null +++ b/tests/test_sklearn.py @@ -0,0 +1,432 @@ +from __future__ import annotations + +from unittest.mock import MagicMock +from unittest.mock import patch +import warnings + +import numpy as np +from optuna import distributions +from optuna.samplers import BruteForceSampler +from optuna.study import create_study +from optuna.terminator.erroreval import _CROSS_VALIDATION_SCORES_KEY +import pytest +import scipy as sp + +import optuna_integration as integration +from sklearn.datasets import make_blobs +from sklearn.datasets import make_regression +from sklearn.decomposition import PCA +from sklearn.exceptions import ConvergenceWarning +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import SGDClassifier +from sklearn.neighbors import KernelDensity +from sklearn.tree import DecisionTreeRegressor + + +pytestmark = pytest.mark.integration + + +def test_is_arraylike() -> None: + assert integration.sklearn._is_arraylike([]) + assert integration.sklearn._is_arraylike(np.zeros(5)) + assert not integration.sklearn._is_arraylike(1) + + +def test_num_samples() -> None: + x1 = np.random.random((10, 10)) + x2 = [1, 2, 3] + assert integration.sklearn._num_samples(x1) == 10 + assert integration.sklearn._num_samples(x2) == 3 + + +def test_make_indexable() -> None: + x1 = np.random.random((10, 10)) + x2 = sp.sparse.coo_matrix(x1) + x3 = [1, 2, 3] + + assert hasattr(integration.sklearn._make_indexable(x1), "__getitem__") + assert hasattr(integration.sklearn._make_indexable(x2), "__getitem__") + assert hasattr(integration.sklearn._make_indexable(x3), "__getitem__") + assert integration.sklearn._make_indexable(None) is None + + +@pytest.mark.parametrize("enable_pruning", [True, False]) +@pytest.mark.parametrize("fit_params", ["", "coef_init"]) +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_optuna_search(enable_pruning: bool, fit_params: str) -> None: + X, y = make_blobs(n_samples=10) + est = SGDClassifier(max_iter=5, tol=1e-03) + param_dist = {"alpha": distributions.FloatDistribution(1e-04, 1e03, log=True)} + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=enable_pruning, + error_score="raise", + max_iter=5, + random_state=0, + return_train_score=True, + ) + + with pytest.raises(NotFittedError): + optuna_search._check_is_fitted() + + if fit_params == "coef_init" and not enable_pruning: + optuna_search.fit(X, y, coef_init=np.ones((3, 2), dtype=np.float64)) + else: + optuna_search.fit(X, y) + + optuna_search.trials_dataframe() + optuna_search.decision_function(X) + optuna_search.predict(X) + optuna_search.score(X, y) + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_optuna_search_properties() -> None: + X, y = make_blobs(n_samples=10) + est = LogisticRegression(tol=1e-03) + param_dist = {"C": 
distributions.FloatDistribution(1e-04, 1e03, log=True)} + + optuna_search = integration.OptunaSearchCV( + est, param_dist, cv=3, error_score="raise", random_state=0, return_train_score=True + ) + optuna_search.fit(X, y) + optuna_search.set_user_attr("dataset", "blobs") + + assert optuna_search._estimator_type == "classifier" + assert isinstance(optuna_search.best_index_, int) + assert isinstance(optuna_search.best_params_, dict) + assert isinstance(optuna_search.cv_results_, dict) + for cv_result_list_ in optuna_search.cv_results_.values(): + assert len(cv_result_list_) == optuna_search.n_trials_ + assert optuna_search.best_score_ is not None + assert optuna_search.best_trial_ is not None + assert np.allclose(optuna_search.classes_, np.array([0, 1, 2])) + assert optuna_search.n_trials_ == 10 + assert optuna_search.user_attrs_ == {"dataset": "blobs"} + assert type(optuna_search.predict_log_proba(X)) == np.ndarray + assert type(optuna_search.predict_proba(X)) == np.ndarray + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_optuna_search_score_samples() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + optuna_search = integration.OptunaSearchCV( + est, {}, cv=3, error_score="raise", random_state=0, return_train_score=True + ) + optuna_search.fit(X) + assert optuna_search.score_samples(X) is not None + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_optuna_search_transforms() -> None: + X, y = make_blobs(n_samples=10) + est = PCA() + optuna_search = integration.OptunaSearchCV( + est, {}, cv=3, error_score="raise", random_state=0, return_train_score=True + ) + optuna_search.fit(X) + assert type(optuna_search.transform(X)) == np.ndarray + assert type(optuna_search.inverse_transform(X)) == np.ndarray + + +def test_optuna_search_invalid_estimator() -> None: + X, y = make_blobs(n_samples=10) + est = "not an estimator" + optuna_search = integration.OptunaSearchCV( + est, {}, cv=3, error_score="raise", random_state=0, return_train_score=True + ) + + with pytest.raises(ValueError, match="estimator must be a scikit-learn estimator."): + optuna_search.fit(X) + + +def test_optuna_search_pruning_without_partial_fit() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=True, + error_score="raise", + random_state=0, + return_train_score=True, + ) + + with pytest.raises(ValueError, match="estimator must support partial_fit."): + optuna_search.fit(X) + + +def test_optuna_search_negative_max_iter() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + max_iter=-1, + error_score="raise", + random_state=0, + return_train_score=True, + ) + + with pytest.raises(ValueError, match="max_iter must be > 0"): + optuna_search.fit(X) + + +def test_optuna_search_tuple_instead_of_distribution() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + param_dist = {"kernel": ("gaussian", "linear")} + optuna_search = integration.OptunaSearchCV( + est, + param_dist, # type: ignore + cv=3, + error_score="raise", + random_state=0, + return_train_score=True, + ) + + with pytest.raises(ValueError, match="must be an optuna distribution."): + optuna_search.fit(X) + + +def test_optuna_search_study_with_minimize() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + study =
create_study(direction="minimize") + optuna_search = integration.OptunaSearchCV( + est, {}, cv=3, error_score="raise", random_state=0, return_train_score=True, study=study + ) + + with pytest.raises(ValueError, match="direction of study must be 'maximize'."): + optuna_search.fit(X) + + +@pytest.mark.parametrize("verbose", [1, 2]) +def test_optuna_search_verbosity(verbose: int) -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + error_score="raise", + random_state=0, + return_train_score=True, + verbose=verbose, + ) + optuna_search.fit(X) + + +def test_optuna_search_subsample() -> None: + X, y = make_blobs(n_samples=10) + est = KernelDensity() + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + error_score="raise", + random_state=0, + return_train_score=True, + subsample=5, + ) + optuna_search.fit(X) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_objective_y_None() -> None: + X, y = make_blobs(n_samples=10) + est = SGDClassifier(max_iter=5, tol=1e-03) + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=True, + error_score="raise", + random_state=0, + return_train_score=True, + ) + + with pytest.raises(ValueError): + optuna_search.fit(X) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_objective_error_score_nan() -> None: + X, y = make_blobs(n_samples=10) + est = SGDClassifier(max_iter=5, tol=1e-03) + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=True, + max_iter=5, + error_score=np.nan, + random_state=0, + return_train_score=True, + ) + + with pytest.raises( + ValueError, + match="This SGDClassifier estimator requires y to be passed, but the target y is None.", + ): + optuna_search.fit(X) + + for trial in optuna_search.study_.get_trials(): + assert np.all(np.isnan(list(trial.intermediate_values.values()))) + + # "_score" stores every score value for train and test validation holds. + for name, value in trial.user_attrs.items(): + if name.endswith("_score"): + assert np.isnan(value) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_objective_error_score_invalid() -> None: + X, y = make_blobs(n_samples=10) + est = SGDClassifier(max_iter=5, tol=1e-03) + param_dist = {} # type: ignore + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=True, + max_iter=5, + error_score="invalid error score", + random_state=0, + return_train_score=True, + ) + + with pytest.raises(ValueError, match="error_score must be 'raise' or numeric."): + optuna_search.fit(X) + + +# This test checks whether OptunaSearchCV completes the study without halting, even if some trials +# fails due to misconfiguration. 
+@pytest.mark.parametrize( + "param_dist,all_params", + [ + ({"max_depth": distributions.IntDistribution(0, 1)}, [0, 1]), + ({"max_depth": distributions.IntDistribution(0, 0)}, [0]), + ], +) +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_no_halt_with_error( + param_dist: dict[str, distributions.BaseDistribution], all_params: list[int] +) -> None: + X, y = make_regression(n_samples=100, n_features=10) + estimator = DecisionTreeRegressor() + study = create_study(sampler=BruteForceSampler(), direction="maximize") + + # DecisionTreeRegressor raises ValueError when max_depth==0. + optuna_search = integration.OptunaSearchCV( + estimator, + param_dist, + study=study, + ) + optuna_search.fit(X, y) + all_suggested_values = [t.params["max_depth"] for t in study.trials] + assert len(all_suggested_values) == len(all_params) + for a in all_params: + assert a in all_suggested_values + + +# TODO(himkt): Remove this test with the deletion of deprecated distributions. +# https://github.com/optuna/optuna/issues/2941 +@pytest.mark.filterwarnings("ignore::FutureWarning") +def test_optuna_search_convert_deprecated_distribution() -> None: + param_dist = { + "ud": distributions.UniformDistribution(low=0, high=10), + "dud": distributions.DiscreteUniformDistribution(low=0, high=10, q=2), + "lud": distributions.LogUniformDistribution(low=1, high=10), + "id": distributions.IntUniformDistribution(low=0, high=10), + "idd": distributions.IntUniformDistribution(low=0, high=10, step=2), + "ild": distributions.IntLogUniformDistribution(low=1, high=10), + } + + expected_param_dist = { + "ud": distributions.FloatDistribution(low=0, high=10, log=False, step=None), + "dud": distributions.FloatDistribution(low=0, high=10, log=False, step=2), + "lud": distributions.FloatDistribution(low=1, high=10, log=True, step=None), + "id": distributions.IntDistribution(low=0, high=10, log=False, step=1), + "idd": distributions.IntDistribution(low=0, high=10, log=False, step=2), + "ild": distributions.IntDistribution(low=1, high=10, log=True, step=1), + } + + with pytest.raises(ValueError): + optuna_search = integration.OptunaSearchCV( + KernelDensity(), + param_dist, + ) + + # It confirms that OptunaSearchCV doesn't convert non-deprecated distributions.
+ optuna_search = integration.OptunaSearchCV( + KernelDensity(), + expected_param_dist, + ) + + assert optuna_search.param_distributions == expected_param_dist + + +def test_callbacks() -> None: + callbacks = [] + + for _ in range(2): + callback = MagicMock() + callback.__call__ = MagicMock(return_value=None) # type: ignore + callbacks.append(callback) + + n_trials = 5 + X, y = make_blobs(n_samples=10) + est = SGDClassifier(max_iter=5, tol=1e-03) + param_dist = {"alpha": distributions.FloatDistribution(1e-04, 1e03, log=True)} + optuna_search = integration.OptunaSearchCV( + est, + param_dist, + cv=3, + enable_pruning=True, + max_iter=5, + n_trials=n_trials, + error_score=np.nan, + callbacks=callbacks, # type: ignore + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=ConvergenceWarning) + optuna_search.fit(X, y) + + for callback in callbacks: + for trial in optuna_search.trials_: + callback.assert_any_call(optuna_search.study_, trial) + assert callback.call_count == n_trials + + +@pytest.mark.filterwarnings("ignore::UserWarning") +@patch("optuna_integration.sklearn.cross_validate") +def test_terminator_cv_score_reporting(mock: MagicMock) -> None: + scores = { + "fit_time": np.array([2.01, 1.78, 3.22]), + "score_time": np.array([0.33, 0.35, 0.48]), + "test_score": np.array([0.04, 0.80, 0.70]), + } + mock.return_value = scores + + X, _ = make_blobs(n_samples=10) + est = PCA() + optuna_search = integration.OptunaSearchCV(est, {}, cv=3, error_score="raise", random_state=0) + optuna_search.fit(X) + + for trial in optuna_search.study_.trials: + assert (trial.system_attrs[_CROSS_VALIDATION_SCORES_KEY] == scores["test_score"]).all()
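To complement the tests above, here is a short end-to-end sketch of the migrated `OptunaSearchCV` with pruning enabled. It is a non-authoritative example mirroring the docstring's testcode and the `test_optuna_search` setup; the dataset size, distribution bounds, and `n_trials` are illustrative choices, not values prescribed by this PR:

```python
import optuna
import optuna_integration
from sklearn.datasets import make_blobs
from sklearn.linear_model import SGDClassifier

X, y = make_blobs(n_samples=100, centers=3, random_state=0)
clf = SGDClassifier(max_iter=5, tol=1e-3)
param_dist = {"alpha": optuna.distributions.FloatDistribution(1e-4, 1e3, log=True)}

# enable_pruning=True requires an estimator with ``partial_fit``; each epoch
# reports the mean CV score via trial.report() and may raise TrialPruned.
optuna_search = optuna_integration.OptunaSearchCV(
    clf,
    param_dist,
    cv=3,
    enable_pruning=True,
    max_iter=5,
    n_trials=20,
    random_state=0,
)
optuna_search.fit(X, y)
print(optuna_search.best_params_, optuna_search.best_score_)
y_pred = optuna_search.predict(X)
```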