diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R
index 9da2f9bd7167..9c928c1f71d1 100644
--- a/R-package/tests/testthat/helper.R
+++ b/R-package/tests/testthat/helper.R
@@ -29,3 +29,20 @@
 .LGB_VERBOSITY <- as.integer(
     Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1")
 )
+
+# [description]
+#     test that every element of 'x' is in 'y'
+#
+# testthat::expect_in() is not available in versions of {testthat}
+# built for R 3.6; this is here to support a similar interface on R 3.6
+.expect_in <- function(x, y) {
+    if (exists("expect_in")) {
+        expect_in(x, y)
+    } else {
+        missing_items <- x[!(x %in% y)]
+        if (length(missing_items) != 0L) {
+            error_msg <- paste0("Some expected items not found: ", toString(missing_items))
+            stop(error_msg)
+        }
+    }
+}
diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R
index 1ff038598db1..5f398f1c081d 100644
--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
         data = matrix(rnorm(500L), nrow = 100L)
         , label = rnorm(100L)
     )
-    nrounds <- 4L
     bst <- lgb.train(
         params = list(
-            objective = "regression"
-            , metric = "l2"
+            objective = "mape"
+            , metric = c("l2", "mae")
             , num_threads = .LGB_MAX_THREADS
+            , seed = 708L
+            , data_sample_strategy = "bagging"
+            , sub_row = 0.8234
         )
         , data = dtrain
-        , nrounds = nrounds
+        , nrounds = 3L
         , verbose = .LGB_VERBOSITY
     )
-    model_str <- bst$save_model_to_string()
-    params_in_file <- .params_from_model_string(model_str = model_str)
+    # entries whose values should reflect params passed to lgb.train()
+    non_default_param_entries <- c(
+        "[objective: mape]"
+        # 'l1' was passed in with alias 'mae'
+        , "[metric: l2,l1]"
+        , "[data_sample_strategy: bagging]"
+        , "[seed: 708]"
+        # this was passed in with alias 'sub_row'
+        , "[bagging_fraction: 0.8234]"
+        , "[num_iterations: 3]"
+    )
+
+    # entries with default values of params
+    default_param_entries <- c(
+        "[boosting: gbdt]"
+        , "[tree_learner: serial]"
+        , "[device_type: cpu]"
+        , "[data: ]"
+        , "[valid: ]"
+        , "[learning_rate: 0.1]"
+        , "[num_leaves: 31]"
+        , sprintf("[num_threads: %i]", .LGB_MAX_THREADS)
+        , "[deterministic: 0]"
+        , "[histogram_pool_size: -1]"
+        , "[max_depth: -1]"
+        , "[min_data_in_leaf: 20]"
+        , "[min_sum_hessian_in_leaf: 0.001]"
+        , "[pos_bagging_fraction: 1]"
+        , "[neg_bagging_fraction: 1]"
+        , "[bagging_freq: 0]"
+        , "[bagging_seed: 15415]"
+        , "[feature_fraction: 1]"
+        , "[feature_fraction_bynode: 1]"
+        , "[feature_fraction_seed: 32671]"
+        , "[extra_trees: 0]"
+        , "[extra_seed: 6642]"
+        , "[early_stopping_round: 0]"
+        , "[first_metric_only: 0]"
+        , "[max_delta_step: 0]"
+        , "[lambda_l1: 0]"
+        , "[lambda_l2: 0]"
+        , "[linear_lambda: 0]"
+        , "[min_gain_to_split: 0]"
+        , "[drop_rate: 0.1]"
+        , "[max_drop: 50]"
+        , "[skip_drop: 0.5]"
+        , "[xgboost_dart_mode: 0]"
+        , "[uniform_drop: 0]"
+        , "[drop_seed: 20623]"
+        , "[top_rate: 0.2]"
+        , "[other_rate: 0.1]"
+        , "[min_data_per_group: 100]"
+        , "[max_cat_threshold: 32]"
+        , "[cat_l2: 10]"
+        , "[cat_smooth: 10]"
+        , "[max_cat_to_onehot: 4]"
+        , "[top_k: 20]"
+        , "[monotone_constraints: ]"
+        , "[monotone_constraints_method: basic]"
+        , "[monotone_penalty: 0]"
+        , "[feature_contri: ]"
+        , "[forcedsplits_filename: ]"
+        , "[force_col_wise: 0]"
+        , "[force_row_wise: 0]"
+        , "[refit_decay_rate: 0.9]"
+        , "[cegb_tradeoff: 1]"
+        , "[cegb_penalty_split: 0]"
+        , "[cegb_penalty_feature_lazy: ]"
"[cegb_penalty_feature_coupled: ]" + , "[path_smooth: 0]" + , "[interaction_constraints: ]" + , sprintf("[verbosity: %i]", .LGB_VERBOSITY) + , "[saved_feature_importance_type: 0]" + , "[use_quantized_grad: 0]" + , "[num_grad_quant_bins: 4]" + , "[quant_train_renew_leaf: 0]" + , "[stochastic_rounding: 1]" + , "[linear_tree: 0]" + , "[max_bin: 255]" + , "[max_bin_by_feature: ]" + , "[min_data_in_bin: 3]" + , "[bin_construct_sample_cnt: 200000]" + , "[data_random_seed: 2350]" + , "[is_enable_sparse: 1]" + , "[enable_bundle: 1]" + , "[use_missing: 1]" + , "[zero_as_missing: 0]" + , "[feature_pre_filter: 1]" + , "[pre_partition: 0]" + , "[two_round: 0]" + , "[header: 0]" + , "[label_column: ]" + , "[weight_column: ]" + , "[group_column: ]" + , "[ignore_column: ]" + , "[categorical_feature: ]" + , "[forcedbins_filename: ]" + , "[precise_float_parser: 0]" + , "[parser_config_file: ]" + , "[objective_seed: 4309]" + , "[num_class: 1]" + , "[is_unbalance: 0]" + , "[scale_pos_weight: 1]" + , "[sigmoid: 1]" + , "[boost_from_average: 1]" + , "[reg_sqrt: 0]" + , "[alpha: 0.9]" + , "[fair_c: 1]" + , "[poisson_max_delta_step: 0.7]" + , "[tweedie_variance_power: 1.5]" + , "[lambdarank_truncation_level: 30]" + , "[lambdarank_norm: 1]" + , "[label_gain: ]" + , "[lambdarank_position_bias_regularization: 0]" + , "[eval_at: ]" + , "[multi_error_top_k: 1]" + , "[auc_mu_weights: ]" + , "[num_machines: 1]" + , "[local_listen_port: 12400]" + , "[time_out: 120]" + , "[machine_list_filename: ]" + , "[machines: ]" + , "[gpu_platform_id: -1]" + , "[gpu_device_id: -1]" + , "[gpu_use_dp: 0]" + , "[num_gpu: 1]" + ) + all_param_entries <- c(non_default_param_entries, default_param_entries) # parameters should match what was passed from the R package - expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) - expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) - expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) - expect_equal(sum(params_in_file == "[objective: regression]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) - expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L) + model_str <- bst$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) # early stopping should be off by default expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) + + # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all + # roundtrip saveRDS()/loadRDS() successfully + rds_file <- tempfile() + saveRDS(bst, rds_file) + bst_rds <- readRDS(rds_file) + model_str <- bst_rds$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) }) test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", { diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 407f2c73e1e3..a554ee60b6c9 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -330,7 +330,7 @@ def gen_parameter_code( str_to_write += ' std::string tmp_str = "";\n' for x in infos: for y in x: - if "[doc-only]" in y: + if "[no-automatically-extract]" in y: continue param_type = 
y["inner_type"][0] name = y["name"][0] @@ -345,7 +345,7 @@ def gen_parameter_code( str_to_write += " std::stringstream str_buf;\n" for x in infos: for y in x: - if "[doc-only]" in y or "[no-save]" in y: + if "[no-save]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 187043cc2053..6d61bc764924 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -5,8 +5,13 @@ * \note * - desc and descl2 fields must be written in reStructuredText format; * - nested sections can be placed only at the bottom of parent's section; - * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; - * - [no-save] tag indicates that this param should not be saved into a model text representation. + * - [no-automatically-extract] + * - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if: + * - specialized extraction logic for this param exists in Config::GetMembersFromString() + * - [no-save] + * - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if: + * - param is only used by the CLI (especially the "predict" and "convert_model" tasks) + * - param is related to LightGBM writing files (e.g. "output_model", "save_binary") */ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -97,15 +102,15 @@ struct Config { #pragma region Core Parameters #endif // __NVCC__ + // [no-automatically-extract] // [no-save] - // [doc-only] // alias = config_file // desc = path of config file // desc = **Note**: can be used only in CLI version std::string config = ""; + // [no-automatically-extract] // [no-save] - // [doc-only] // type = enum // default = train // options = train, predict, convert_model, refit @@ -118,7 +123,8 @@ struct Config { // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions TaskType task = TaskType::kTrain; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg // alias = objective_type, app, application, loss @@ -150,7 +156,8 @@ struct Config { // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 187043cc2053..6d61bc764924 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -5,8 +5,13 @@
  * \note
  * - desc and descl2 fields must be written in reStructuredText format;
  * - nested sections can be placed only at the bottom of parent's section;
- * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually;
- * - [no-save] tag indicates that this param should not be saved into a model text representation.
+ * - [no-automatically-extract]
+ *   - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if:
+ *     - specialized extraction logic for this param exists in Config::GetMembersFromString()
+ * - [no-save]
+ *   - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if:
+ *     - param is only used by the CLI (especially the "predict" and "convert_model" tasks)
+ *     - param is related to LightGBM writing files (e.g. "output_model", "save_binary")
  */
 #ifndef LIGHTGBM_CONFIG_H_
 #define LIGHTGBM_CONFIG_H_
@@ -97,15 +102,15 @@ struct Config {
 #pragma region Core Parameters
 #endif  // __NVCC__
 
+  // [no-automatically-extract]
   // [no-save]
-  // [doc-only]
   // alias = config_file
   // desc = path of config file
   // desc = **Note**: can be used only in CLI version
   std::string config = "";
 
+  // [no-automatically-extract]
   // [no-save]
-  // [doc-only]
   // type = enum
   // default = train
   // options = train, predict, convert_model, refit
@@ -118,7 +123,8 @@ struct Config {
   // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
   TaskType task = TaskType::kTrain;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg
   // alias = objective_type, app, application, loss
@@ -150,7 +156,8 @@ struct Config {
   // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
   std::string objective = "regression";
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // alias = boosting_type, boost
   // options = gbdt, rf, dart
@@ -160,7 +167,7 @@ struct Config {
   // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
   std::string boosting = "gbdt";
 
-  // [doc-only]
+  // [no-automatically-extract]
   // type = enum
   // options = bagging, goss
   // desc = ``bagging``, Randomly Bagging Sampling
@@ -200,7 +207,8 @@ struct Config {
   // desc = max number of leaves in one tree
   int num_leaves = kDefaultNumLeaves;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = serial, feature, data, voting
   // alias = tree, tree_type, tree_learner_type
@@ -222,7 +230,8 @@ struct Config {
   // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
   int num_threads = 0;
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // type = enum
   // options = cpu, gpu, cuda
   // alias = device
@@ -235,7 +244,7 @@ struct Config {
   // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
   std::string device_type = "cpu";
 
-  // [doc-only]
+  // [no-automatically-extract]
   // alias = random_seed, random_state
   // default = None
   // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
@@ -593,7 +602,6 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   int snapshot_freq = -1;
 
-  // [no-save]
   // desc = whether to use gradient quantization when training
   // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
   // desc = with quantized training, most arithmetics in the training process will be integer operations
@@ -602,21 +610,18 @@ struct Config {
   // desc = *New in version 4.0.0*
   bool use_quantized_grad = false;
 
-  // [no-save]
   // desc = number of bins to quantization gradients and hessians
   // desc = with more bins, the quantized training will be closer to full precision training
   // desc = **Note**: can be used only with ``device_type = cpu``
   // desc = *New in 4.0.0*
   int num_grad_quant_bins = 4;
 
-  // [no-save]
   // desc = whether to renew the leaf values with original gradients when quantized training
   // desc = renewing is very helpful for good quantized training accuracy for ranking objectives
   // desc = **Note**: can be used only with ``device_type = cpu``
   // desc = *New in 4.0.0*
   bool quant_train_renew_leaf = false;
 
-  // [no-save]
   // desc = whether to use stochastic rounding in gradient quantization
   // desc = *New in 4.0.0*
   bool stochastic_rounding = true;
@@ -976,7 +981,8 @@ struct Config {
 #pragma region Metric Parameters
 #endif  // __NVCC__
 
-  // [doc-only]
+  // [no-automatically-extract]
+  // [no-save]
   // alias = metrics, metric_types
   // default = ""
   // type = multi-enum
diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py
index 5815bc602bde..0dc5b75cfdf2 100644
--- a/python-package/lightgbm/__init__.py
+++ b/python-package/lightgbm/__init__.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 
 from .basic import Booster, Dataset, Sequence, register_logger
-from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter
+from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter
 from .engine import CVBooster, cv, train
 
 try:
@@ -32,5 +32,5 @@
            'train', 'cv',
            'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
            'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker',
-           'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
+           'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException',
            'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph']
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 182ec200d207..8256dc026973 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -54,6 +54,7 @@
 _LGBM_EvalFunctionResultType = Tuple[str, float, bool]
 _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]]
 _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool]
+_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float]
 _LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"]
 _LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"]
 _LGBM_GroupType = Union[
diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py
index ccf0059faf84..7db3d400ecd6 100644
--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -3,14 +3,16 @@
 from collections import OrderedDict
 from dataclasses import dataclass
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
-from .basic import Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, _log_info, _log_warning
+from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType,
+                    _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning)
 
 if TYPE_CHECKING:
     from .engine import CVBooster
 
 __all__ = [
+    'EarlyStopException',
     'early_stopping',
     'log_evaluation',
     'record_evaluation',
@@ -20,16 +22,20 @@
 _EvalResultDict = Dict[str, Dict[str, List[Any]]]
 _EvalResultTuple = Union[
     _LGBM_BoosterEvalMethodResultType,
-    Tuple[str, str, float, bool, float]
+    _LGBM_BoosterEvalMethodResultWithStandardDeviationType
 ]
 _ListOfEvalResultTuples = Union[
     List[_LGBM_BoosterEvalMethodResultType],
-    List[Tuple[str, str, float, bool, float]]
+    List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]
 ]
 
 
 class EarlyStopException(Exception):
-    """Exception of early stopping."""
+    """Exception of early stopping.
+
+    Raise this from a callback passed in via keyword argument ``callbacks``
+    in ``cv()`` or ``train()`` to trigger early stopping.
+    """
 
     def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None:
         """Create early stopping exception.
@@ -38,6 +44,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) ->
         ----------
         best_iteration : int
             The best iteration stopped.
+            0-based; for example, pass ``best_iteration=2`` to indicate that the third iteration was the best one.
         best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple
             Scores for each metric, on each validation set, as of the best iteration.
         """
@@ -54,7 +61,7 @@ class CallbackEnv:
     iteration: int
     begin_iteration: int
     end_iteration: int
-    evaluation_result_list: Optional[List[_LGBM_BoosterEvalMethodResultType]]
+    evaluation_result_list: Optional[_ListOfEvalResultTuples]
 
 
 def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str:
@@ -124,6 +131,11 @@ def __init__(self, eval_result: _EvalResultDict) -> None:
         self.eval_result = eval_result
 
     def _init(self, env: CallbackEnv) -> None:
+        if env.evaluation_result_list is None:
+            raise RuntimeError(
+                "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. "
+                "Please report it at https://github.com/microsoft/LightGBM/issues"
+            )
         self.eval_result.clear()
         for item in env.evaluation_result_list:
             if len(item) == 4:  # regular train
@@ -140,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None:
     def __call__(self, env: CallbackEnv) -> None:
         if env.iteration == env.begin_iteration:
             self._init(env)
+        if env.evaluation_result_list is None:
+            raise RuntimeError(
+                "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. "
+                "Please report it at https://github.com/microsoft/LightGBM/issues"
+            )
         for item in env.evaluation_result_list:
             if len(item) == 4:
                 data_name, eval_name, result = item[:3]
@@ -278,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool:
         return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name
 
     def _init(self, env: CallbackEnv) -> None:
+        if env.evaluation_result_list is None or env.evaluation_result_list == []:
+            raise ValueError(
+                "For early stopping, at least one dataset and eval metric is required for evaluation"
+            )
         is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting"))
         only_train_set = (
             len(env.evaluation_result_list) == 1
@@ -293,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None:
         elif only_train_set:
             _log_warning('Only training set found, disabling early stopping.')
             return
-        if not env.evaluation_result_list:
-            raise ValueError('For early stopping, '
-                             'at least one dataset and eval metric is required for evaluation')
         if self.stopping_rounds <= 0:
             raise ValueError("stopping_rounds should be greater than zero.")
 
@@ -357,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None:
             self._init(env)
         if not self.enabled:
             return
+        if env.evaluation_result_list is None:
+            raise RuntimeError(
+                "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. "
+                "Please report it at https://github.com/microsoft/LightGBM/issues"
+            )
         # self.best_score_list is initialized to an empty list
         first_time_updating_best_score_list = (self.best_score_list == [])
         for i in range(len(env.evaluation_result_list)):
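Because `EarlyStopException` is now exported from the package root (see the `__init__.py` change above) and its docstring explicitly allows raising it from user callbacks, a custom stopping rule can bypass the built-in `early_stopping()` callback entirely. A minimal sketch, using synthetic data and an arbitrary threshold:

```python
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(42)
X, y = rng.normal(size=(100, 5)), rng.normal(size=100)
train_set = lgb.Dataset(X, label=y)

def stop_when_l2_small(env) -> None:
    # env.evaluation_result_list holds (dataset_name, metric_name, value, is_higher_better) tuples
    for _, metric_name, value, _ in env.evaluation_result_list:
        if metric_name == "l2" and value < 0.5:
            # best_iteration is 0-based, matching the docstring added in this diff
            raise lgb.EarlyStopException(best_iteration=env.iteration,
                                         best_score=env.evaluation_result_list)

bst = lgb.train(
    params={"objective": "regression", "metric": "l2", "verbosity": -1},
    train_set=train_set,
    num_boost_round=100,
    valid_sets=[train_set],
    callbacks=[stop_when_l2_small],
)
```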
""" @@ -54,7 +61,7 @@ class CallbackEnv: iteration: int begin_iteration: int end_iteration: int - evaluation_result_list: Optional[List[_LGBM_BoosterEvalMethodResultType]] + evaluation_result_list: Optional[_ListOfEvalResultTuples] def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: @@ -124,6 +131,11 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is a probably bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) self.eval_result.clear() for item in env.evaluation_result_list: if len(item) == 4: # regular train @@ -140,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None: def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: self._init(env) + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is a probably bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) for item in env.evaluation_result_list: if len(item) == 4: data_name, eval_name, result = item[:3] @@ -278,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) only_train_set = ( len(env.evaluation_result_list) == 1 @@ -293,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None: elif only_train_set: _log_warning('Only training set found, disabling early stopping.') return - if not env.evaluation_result_list: - raise ValueError('For early stopping, ' - 'at least one dataset and eval metric is required for evaluation') if self.stopping_rounds <= 0: raise ValueError("stopping_rounds should be greater than zero.") @@ -357,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + if env.evaluation_result_list is None: + raise RuntimeError( + "early_stopping() callback enabled but no evaluation results found. This is a probably bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index daa6e16b6a9a..822aa3b35017 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -11,9 +11,9 @@ from . 
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8182c9b52b93..394614af3f33 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -664,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map
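The regenerated `Config::SaveMembersToString()` in config_auto.cpp is what writes the `parameters:` block that the R test above inspects. A rough Python analogue of that check, assuming the block is delimited by `parameters:` / `end of parameters` exactly as the R helper `.params_from_model_string()` expects:

```python
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(708)
train_set = lgb.Dataset(rng.normal(size=(100, 5)), label=rng.normal(size=100))
bst = lgb.train(
    params={"objective": "mape", "bagging_fraction": 0.8234, "verbosity": -1},
    train_set=train_set,
    num_boost_round=3,
)

model_str = bst.model_to_string()
# keep only the "[name: value]" lines between "parameters:" and "end of parameters"
params_block = model_str.split("parameters:")[1].split("end of parameters")[0]
params_in_file = [line for line in params_block.splitlines() if line.startswith("[")]

assert "[objective: mape]" in params_in_file
assert "[bagging_fraction: 0.8234]" in params_in_file
assert "[num_iterations: 3]" in params_in_file
```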