diff --git a/.appveyor.yml b/.appveyor.yml index 8733301fbfe9..4cff03d571a1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 4.1.0.{build} +version: 4.1.0.99.{build} image: Visual Studio 2015 platform: x64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5087d6a8fddb..6705ef130052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN) CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" ) + if(MINGW) + # ignore this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353 + set( + CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow" + ) + endif() if(USE_DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") else() diff --git a/R-package/configure b/R-package/configure index 5f441f942e63..39a18d669833 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0. +# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.1.0' -PACKAGE_STRING='lightgbm 4.1.0' +PACKAGE_VERSION='4.1.0.99' +PACKAGE_STRING='lightgbm 4.1.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.1.0 to adapt to many kinds of systems. +\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.1.0:";; + short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.1.0 +lightgbm configure 4.1.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.1.0, which was +It was created by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.1.0, which was +This file was extended by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.1.0 +lightgbm config.status 4.1.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index ca4a84a5d045..233a31f0ead9 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -14,7 +14,7 @@ repo: user: https://github.com/ development: - mode: release + mode: unreleased authors: Yu Shi: diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9da2f9bd7167..9c928c1f71d1 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -29,3 +29,20 @@ .LGB_VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) + +# [description] +# test that every element of 'x' is in 'y' +# +# testthat::expect_in() is not available in version of {testthat} +# built for R 3.6, this is here to support a similar interface on R 3.6 +.expect_in <- function(x, y) { + if (exists("expect_in")) { + expect_in(x, y) + } else { + missing_items <- x[!(x %in% y)] + if (length(missing_items) != 0L) { + error_msg <- paste0("Some expected items not found: ", toString(missing_items)) + stop(error_msg) + } + } +} diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 1ff038598db1..5f398f1c081d 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", { data = matrix(rnorm(500L), nrow = 100L) , label = rnorm(100L) ) - nrounds <- 4L bst <- lgb.train( params = list( - objective = "regression" - , metric = "l2" + objective = "mape" + , metric = c("l2", "mae") , num_threads = .LGB_MAX_THREADS + , seed = 708L + , data_sample_strategy = "bagging" + , sub_row = 0.8234 ) , data = dtrain - , nrounds = nrounds + , nrounds = 3L , verbose = .LGB_VERBOSITY ) - model_str <- bst$save_model_to_string() - params_in_file <- .params_from_model_string(model_str = model_str) + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries <- c( + "[objective: mape]" + # 'l1' was passed in with alias 'mae' + , "[metric: l2,l1]" + , "[data_sample_strategy: bagging]" + , "[seed: 708]" + # this was passed in with alias 'sub_row' + , "[bagging_fraction: 0.8234]" + , "[num_iterations: 3]" + ) + + # entries with default values of params + default_param_entries <- c( + "[boosting: gbdt]" + , "[tree_learner: serial]" + , "[device_type: cpu]" + , "[data: ]" + , "[valid: ]" + , "[learning_rate: 0.1]" + , "[num_leaves: 31]" + , sprintf("[num_threads: %i]", .LGB_MAX_THREADS) + , "[deterministic: 0]" + , "[histogram_pool_size: -1]" + , "[max_depth: -1]" + , "[min_data_in_leaf: 20]" + , "[min_sum_hessian_in_leaf: 0.001]" + , "[pos_bagging_fraction: 1]" + , "[neg_bagging_fraction: 1]" + , "[bagging_freq: 0]" + , "[bagging_seed: 15415]" + , "[feature_fraction: 1]" + , "[feature_fraction_bynode: 1]" + , "[feature_fraction_seed: 32671]" + , "[extra_trees: 0]" + , "[extra_seed: 6642]" + , "[early_stopping_round: 0]" + , "[first_metric_only: 0]" + , "[max_delta_step: 0]" + , "[lambda_l1: 0]" + , "[lambda_l2: 0]" + , "[linear_lambda: 0]" + , 
"[min_gain_to_split: 0]" + , "[drop_rate: 0.1]" + , "[max_drop: 50]" + , "[skip_drop: 0.5]" + , "[xgboost_dart_mode: 0]" + , "[uniform_drop: 0]" + , "[drop_seed: 20623]" + , "[top_rate: 0.2]" + , "[other_rate: 0.1]" + , "[min_data_per_group: 100]" + , "[max_cat_threshold: 32]" + , "[cat_l2: 10]" + , "[cat_smooth: 10]" + , "[max_cat_to_onehot: 4]" + , "[top_k: 20]" + , "[monotone_constraints: ]" + , "[monotone_constraints_method: basic]" + , "[monotone_penalty: 0]" + , "[feature_contri: ]" + , "[forcedsplits_filename: ]" + , "[force_col_wise: 0]" + , "[force_row_wise: 0]" + , "[refit_decay_rate: 0.9]" + , "[cegb_tradeoff: 1]" + , "[cegb_penalty_split: 0]" + , "[cegb_penalty_feature_lazy: ]" + , "[cegb_penalty_feature_coupled: ]" + , "[path_smooth: 0]" + , "[interaction_constraints: ]" + , sprintf("[verbosity: %i]", .LGB_VERBOSITY) + , "[saved_feature_importance_type: 0]" + , "[use_quantized_grad: 0]" + , "[num_grad_quant_bins: 4]" + , "[quant_train_renew_leaf: 0]" + , "[stochastic_rounding: 1]" + , "[linear_tree: 0]" + , "[max_bin: 255]" + , "[max_bin_by_feature: ]" + , "[min_data_in_bin: 3]" + , "[bin_construct_sample_cnt: 200000]" + , "[data_random_seed: 2350]" + , "[is_enable_sparse: 1]" + , "[enable_bundle: 1]" + , "[use_missing: 1]" + , "[zero_as_missing: 0]" + , "[feature_pre_filter: 1]" + , "[pre_partition: 0]" + , "[two_round: 0]" + , "[header: 0]" + , "[label_column: ]" + , "[weight_column: ]" + , "[group_column: ]" + , "[ignore_column: ]" + , "[categorical_feature: ]" + , "[forcedbins_filename: ]" + , "[precise_float_parser: 0]" + , "[parser_config_file: ]" + , "[objective_seed: 4309]" + , "[num_class: 1]" + , "[is_unbalance: 0]" + , "[scale_pos_weight: 1]" + , "[sigmoid: 1]" + , "[boost_from_average: 1]" + , "[reg_sqrt: 0]" + , "[alpha: 0.9]" + , "[fair_c: 1]" + , "[poisson_max_delta_step: 0.7]" + , "[tweedie_variance_power: 1.5]" + , "[lambdarank_truncation_level: 30]" + , "[lambdarank_norm: 1]" + , "[label_gain: ]" + , "[lambdarank_position_bias_regularization: 0]" + , "[eval_at: ]" + , "[multi_error_top_k: 1]" + , "[auc_mu_weights: ]" + , "[num_machines: 1]" + , "[local_listen_port: 12400]" + , "[time_out: 120]" + , "[machine_list_filename: ]" + , "[machines: ]" + , "[gpu_platform_id: -1]" + , "[gpu_device_id: -1]" + , "[gpu_use_dp: 0]" + , "[num_gpu: 1]" + ) + all_param_entries <- c(non_default_param_entries, default_param_entries) # parameters should match what was passed from the R package - expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) - expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) - expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) - expect_equal(sum(params_in_file == "[objective: regression]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) - expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L) + model_str <- bst$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) # early stopping should be off by default expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) + + # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all + # roundtrip saveRDS()/loadRDS() successfully + rds_file <- tempfile() + saveRDS(bst, rds_file) + 
bst_rds <- readRDS(rds_file) + model_str <- bst_rds$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) }) test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", { diff --git a/VERSION.txt b/VERSION.txt index ee74734aa225..1f06da0058c9 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -4.1.0 +4.1.0.99 diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser index ace60646c02d..efec03532ef6 160000 --- a/external_libs/fast_double_parser +++ b/external_libs/fast_double_parser @@ -1 +1 @@ -Subproject commit ace60646c02dc54c57f19d644e49a61e7e7758ec +Subproject commit efec03532ef65984786e5e32dbc81f6e6a55a115 diff --git a/external_libs/fmt b/external_libs/fmt index b6f4ceaed0a0..f5e54359df4c 160000 --- a/external_libs/fmt +++ b/external_libs/fmt @@ -1 +1 @@ -Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 407f2c73e1e3..a554ee60b6c9 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -330,7 +330,7 @@ def gen_parameter_code( str_to_write += ' std::string tmp_str = "";\n' for x in infos: for y in x: - if "[doc-only]" in y: + if "[no-automatically-extract]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] @@ -345,7 +345,7 @@ def gen_parameter_code( str_to_write += " std::stringstream str_buf;\n" for x in infos: for y in x: - if "[doc-only]" in y or "[no-save]" in y: + if "[no-save]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 187043cc2053..6d61bc764924 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -5,8 +5,13 @@ * \note * - desc and descl2 fields must be written in reStructuredText format; * - nested sections can be placed only at the bottom of parent's section; - * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; - * - [no-save] tag indicates that this param should not be saved into a model text representation. + * - [no-automatically-extract] + * - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if: + * - specialized extraction logic for this param exists in Config::GetMembersFromString() + * - [no-save] + * - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if: + * - param is only used by the CLI (especially the "predict" and "convert_model" tasks) + * - param is related to LightGBM writing files (e.g. 
"output_model", "save_binary") */ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -97,15 +102,15 @@ struct Config { #pragma region Core Parameters #endif // __NVCC__ + // [no-automatically-extract] // [no-save] - // [doc-only] // alias = config_file // desc = path of config file // desc = **Note**: can be used only in CLI version std::string config = ""; + // [no-automatically-extract] // [no-save] - // [doc-only] // type = enum // default = train // options = train, predict, convert_model, refit @@ -118,7 +123,8 @@ struct Config { // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions TaskType task = TaskType::kTrain; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg // alias = objective_type, app, application, loss @@ -150,7 +156,8 @@ struct Config { // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) std::string objective = "regression"; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // alias = boosting_type, boost // options = gbdt, rf, dart @@ -160,7 +167,7 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; - // [doc-only] + // [no-automatically-extract] // type = enum // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling @@ -200,7 +207,8 @@ struct Config { // desc = max number of leaves in one tree int num_leaves = kDefaultNumLeaves; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = serial, feature, data, voting // alias = tree, tree_type, tree_learner_type @@ -222,7 +230,8 @@ struct Config { // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors int num_threads = 0; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = cpu, gpu, cuda // alias = device @@ -235,7 +244,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support std::string device_type = "cpu"; - // [doc-only] + // [no-automatically-extract] // alias = random_seed, random_state // default = None // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. 
@@ -593,7 +602,6 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; - // [no-save] // desc = whether to use gradient quantization when training // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` // desc = with quantized training, most arithmetics in the training process will be integer operations @@ -602,21 +610,18 @@ struct Config { // desc = *New in version 4.0.0* bool use_quantized_grad = false; - // [no-save] // desc = number of bins to quantization gradients and hessians // desc = with more bins, the quantized training will be closer to full precision training // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* int num_grad_quant_bins = 4; - // [no-save] // desc = whether to renew the leaf values with original gradients when quantized training // desc = renewing is very helpful for good quantized training accuracy for ranking objectives // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* bool quant_train_renew_leaf = false; - // [no-save] // desc = whether to use stochastic rounding in gradient quantization // desc = *New in 4.0.0* bool stochastic_rounding = true; @@ -976,7 +981,8 @@ struct Config { #pragma region Metric Parameters #endif // __NVCC__ - // [doc-only] + // [no-automatically-extract] + // [no-save] // alias = metrics, metric_types // default = "" // type = multi-enum diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 5815bc602bde..0dc5b75cfdf2 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from .basic import Booster, Dataset, Sequence, register_logger -from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter +from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter from .engine import CVBooster, cv, train try: @@ -32,5 +32,5 @@ 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', - 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', + 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index cb27b4e1af39..84f5ec02bcb4 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -55,6 +55,7 @@ _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] +_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float] _LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] _LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] _LGBM_GroupType = Union[ diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index ccf0059faf84..7db3d400ecd6 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -3,14 +3,16 @@ from collections import OrderedDict from dataclasses import dataclass from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, 
Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from .basic import Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, _log_info, _log_warning +from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) if TYPE_CHECKING: from .engine import CVBooster __all__ = [ + 'EarlyStopException', 'early_stopping', 'log_evaluation', 'record_evaluation', @@ -20,16 +22,20 @@ _EvalResultDict = Dict[str, Dict[str, List[Any]]] _EvalResultTuple = Union[ _LGBM_BoosterEvalMethodResultType, - Tuple[str, str, float, bool, float] + _LGBM_BoosterEvalMethodResultWithStandardDeviationType ] _ListOfEvalResultTuples = Union[ List[_LGBM_BoosterEvalMethodResultType], - List[Tuple[str, str, float, bool, float]] + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] ] class EarlyStopException(Exception): - """Exception of early stopping.""" + """Exception of early stopping. + + Raise this from a callback passed in via keyword argument ``callbacks`` + in ``cv()`` or ``train()`` to trigger early stopping. + """ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: """Create early stopping exception. @@ -38,6 +44,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> ---------- best_iteration : int The best iteration stopped. + 0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one. best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple Scores for each metric, on each validation set, as of the best iteration. """ @@ -54,7 +61,7 @@ class CallbackEnv: iteration: int begin_iteration: int end_iteration: int - evaluation_result_list: Optional[List[_LGBM_BoosterEvalMethodResultType]] + evaluation_result_list: Optional[_ListOfEvalResultTuples] def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: @@ -124,6 +131,11 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) self.eval_result.clear() for item in env.evaluation_result_list: if len(item) == 4: # regular train @@ -140,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None: def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: self._init(env) + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) for item in env.evaluation_result_list: if len(item) == 4: data_name, eval_name, result = item[:3] @@ -278,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) only_train_set = ( len(env.evaluation_result_list) == 1 @@ -293,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None: elif only_train_set: _log_warning('Only training set found, disabling early stopping.') return - if not env.evaluation_result_list: - raise ValueError('For early stopping, ' - 'at least one dataset and eval metric is required for evaluation') if self.stopping_rounds <= 0: raise ValueError("stopping_rounds should be greater than zero.") @@ -357,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + if env.evaluation_result_list is None: + raise RuntimeError( + "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)):
diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index daa6e16b6a9a..822aa3b35017 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -11,9 +11,9 @@ from . 
import callback from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, - _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration, - _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, - _log_warning) + _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, + _LGBM_FeatureNameConfiguration, _log_warning) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ @@ -519,8 +519,8 @@ def _make_n_folds( def _agg_cv_result( - raw_results: List[List[Tuple[str, str, float, bool]]] -) -> List[Tuple[str, str, float, bool, float]]: + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] +) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" cvmap: Dict[str, List[float]] = OrderedDict() metric_type: Dict[str, bool] = {} @@ -530,7 +530,7 @@ def _agg_cv_result( metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) - return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] + return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] def cv( diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7e909342c01f..c71c233df908 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1103,6 +1103,8 @@ def fit( # type: ignore[override] self._classes = self._le.classes_ self._n_classes = len(self._classes) # type: ignore[arg-type] + if self.objective is None: + self._objective = None # adjust eval metrics to match whether binary or multiclass # classification is being performed diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index f05b6fc22ddd..83520c5248cd 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -30,7 +30,7 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.1.0" +version = "4.1.0.99" [project.optional-dependencies] arrow = [ diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8182c9b52b93..394614af3f33 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -664,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map 0) { - if (USE_INDICES) { - if (USE_HESSIAN) { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + if (USE_QUANT_GRAD) { + int16_t* ordered_gradients_and_hessians = reinterpret_cast(ordered_gradients); + const int16_t* gradients_and_hessians = reinterpret_cast(gradients); + if (USE_INDICES) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; + ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - } else { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + ptr_ordered_grad = reinterpret_cast(ordered_gradients); + ptr_ordered_hess = nullptr; + } + } else { + if (USE_INDICES) { + if (USE_HESSIAN) { + 
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = ordered_hessians; + } else { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; } - ptr_ordered_grad = ordered_gradients; } } OMP_INIT_EX(); diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 163bfc4df9ca..fdf55693a0e9 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -53,6 +53,25 @@ class LeafSplits { weight_ = weight; } + /*! + * \brief Init split on current leaf on partial data. + * \param leaf Index of current leaf + * \param data_partition current data partition + * \param sum_gradients + * \param sum_hessians + * \param sum_gradients_and_hessians + * \param weight + */ + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, int64_t sum_gradients_and_hessians, double weight) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = sum_gradients_and_hessians; + weight_ = weight; + } + /*! * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c322c1a796c2..37d9a2a50713 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -841,32 +841,65 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, #endif // init the leaves that used on next iteration - if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); - smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); - larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); + if (!config_->use_quantized_grad) { + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + } } else { - CHECK_GT(best_split_info.right_count, 0); - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); - 
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + } } if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(*right_leaf)); } + + #ifdef DEBUG + CheckSplit(best_split_info, *left_leaf, *right_leaf); + #endif + auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -1024,4 +1057,48 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } +#ifdef DEBUG +void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { + data_size_t num_data_in_left = 0; + data_size_t num_data_in_right = 0; + const data_size_t* data_indices_in_left = data_partition_->GetIndexOnLeaf(left_leaf_index, &num_data_in_left); + const data_size_t* data_indices_in_right = data_partition_->GetIndexOnLeaf(right_leaf_index, &num_data_in_right); + if (config_->use_quantized_grad) { + int32_t sum_left_gradient = 0; + int32_t sum_left_hessian = 0; + int32_t sum_right_gradient = 0; + int32_t sum_right_hessian = 0; + const int8_t* discretized_grad_and_hess = gradient_discretizer_->discretized_gradients_and_hessians(); + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_left_hessian += discretized_grad_and_hess[2 * index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_right_hessian += discretized_grad_and_hess[2 * index]; + } + Log::Warning("============================ start leaf split info ============================"); + Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); + Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, num_data_in_right); + Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, + static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_left_hessian = %d, 
best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, + static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, + static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, + static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_EQ(sum_left_gradient, static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)) + CHECK_EQ(sum_left_hessian, static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("============================ end leaf split info ============================"); + } +} +#endif + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index d815d265c0d2..93e0787a90cf 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,7 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + #endif /*! * \brief Get the number of data in a leaf diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index cb69440b3cde..9da50945385c 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1838,7 +1838,6 @@ def test_distributed_quantized_training(cluster): 'num_grad_quant_bins': 30, 'quant_train_renew_leaf': True, 'verbose': -1, - 'force_row_wise': True, } quant_dask_classifier = lgb.DaskLGBMRegressor( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 25413d7ea072..b46526bcfaf6 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1092,6 +1092,33 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): assert np.greater_equal(last_score, best_score - min_delta).any() +def test_early_stopping_can_be_triggered_via_custom_callback(): + X, y = make_synthetic_regression() + + def _early_stop_after_seventh_iteration(env): + if env.iteration == 6: + exc = lgb.EarlyStopException( + best_iteration=6, + best_score=[("some_validation_set", "some_metric", 0.708, True)] + ) + raise exc + + bst = lgb.train( + params={ + "objective": "regression", + "verbose": -1, + "num_leaves": 2 + }, + train_set=lgb.Dataset(X, label=y), + num_boost_round=23, + callbacks=[_early_stop_after_seventh_iteration] + ) + assert bst.num_trees() == 7 + assert bst.best_score["some_validation_set"]["some_metric"] == 0.708 + assert bst.best_iteration == 7 + assert bst.current_iteration() == 7 + + def test_continue_train(): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1507,6 +1534,203 @@ def train_and_predict(init_model=None, return_model=False): assert 
ret_origin == pytest.approx(ret) +def test_all_expected_params_are_written_out_to_model_text(tmp_path): + X, y = make_synthetic_regression() + params = { + 'objective': 'mape', + 'metric': ['l2', 'mae'], + 'seed': 708, + 'data_sample_strategy': 'bagging', + 'sub_row': 0.8234, + 'verbose': -1 + } + dtrain = lgb.Dataset(data=X, label=y) + gbm = lgb.train( + params=params, + train_set=dtrain, + num_boost_round=3 + ) + + model_txt_from_memory = gbm.model_to_string() + model_file = tmp_path / "out.model" + gbm.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + assert model_txt_from_memory == model_txt_from_file + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries = [ + "[objective: mape]", + # 'l1' was passed in with alias 'mae' + "[metric: l2,l1]", + "[data_sample_strategy: bagging]", + "[seed: 708]", + # NOTE: this was passed in with alias 'sub_row' + "[bagging_fraction: 0.8234]", + "[num_iterations: 3]", + ] + + # entries with default values of params + default_param_entries = [ + "[boosting: gbdt]", + "[tree_learner: serial]", + "[data: ]", + "[valid: ]", + "[learning_rate: 0.1]", + "[num_leaves: 31]", + "[num_threads: 0]", + "[deterministic: 0]", + "[histogram_pool_size: -1]", + "[max_depth: -1]", + "[min_data_in_leaf: 20]", + "[min_sum_hessian_in_leaf: 0.001]", + "[pos_bagging_fraction: 1]", + "[neg_bagging_fraction: 1]", + "[bagging_freq: 0]", + "[bagging_seed: 15415]", + "[feature_fraction: 1]", + "[feature_fraction_bynode: 1]", + "[feature_fraction_seed: 32671]", + "[extra_trees: 0]", + "[extra_seed: 6642]", + "[early_stopping_round: 0]", + "[first_metric_only: 0]", + "[max_delta_step: 0]", + "[lambda_l1: 0]", + "[lambda_l2: 0]", + "[linear_lambda: 0]", + "[min_gain_to_split: 0]", + "[drop_rate: 0.1]", + "[max_drop: 50]", + "[skip_drop: 0.5]", + "[xgboost_dart_mode: 0]", + "[uniform_drop: 0]", + "[drop_seed: 20623]", + "[top_rate: 0.2]", + "[other_rate: 0.1]", + "[min_data_per_group: 100]", + "[max_cat_threshold: 32]", + "[cat_l2: 10]", + "[cat_smooth: 10]", + "[max_cat_to_onehot: 4]", + "[top_k: 20]", + "[monotone_constraints: ]", + "[monotone_constraints_method: basic]", + "[monotone_penalty: 0]", + "[feature_contri: ]", + "[forcedsplits_filename: ]", + "[refit_decay_rate: 0.9]", + "[cegb_tradeoff: 1]", + "[cegb_penalty_split: 0]", + "[cegb_penalty_feature_lazy: ]", + "[cegb_penalty_feature_coupled: ]", + "[path_smooth: 0]", + "[interaction_constraints: ]", + "[verbosity: -1]", + "[saved_feature_importance_type: 0]", + "[use_quantized_grad: 0]", + "[num_grad_quant_bins: 4]", + "[quant_train_renew_leaf: 0]", + "[stochastic_rounding: 1]", + "[linear_tree: 0]", + "[max_bin: 255]", + "[max_bin_by_feature: ]", + "[min_data_in_bin: 3]", + "[bin_construct_sample_cnt: 200000]", + "[data_random_seed: 2350]", + "[is_enable_sparse: 1]", + "[enable_bundle: 1]", + "[use_missing: 1]", + "[zero_as_missing: 0]", + "[feature_pre_filter: 1]", + "[pre_partition: 0]", + "[two_round: 0]", + "[header: 0]", + "[label_column: ]", + "[weight_column: ]", + "[group_column: ]", + "[ignore_column: ]", + "[categorical_feature: ]", + "[forcedbins_filename: ]", + "[precise_float_parser: 0]", + "[parser_config_file: ]", + "[objective_seed: 4309]", + "[num_class: 1]", + "[is_unbalance: 0]", + "[scale_pos_weight: 1]", + "[sigmoid: 1]", + "[boost_from_average: 1]", + "[reg_sqrt: 0]", + "[alpha: 0.9]", + "[fair_c: 1]", + "[poisson_max_delta_step: 0.7]", + "[tweedie_variance_power: 1.5]", + "[lambdarank_truncation_level: 
30]", + "[lambdarank_norm: 1]", + "[label_gain: ]", + "[lambdarank_position_bias_regularization: 0]", + "[eval_at: ]", + "[multi_error_top_k: 1]", + "[auc_mu_weights: ]", + "[num_machines: 1]", + "[local_listen_port: 12400]", + "[time_out: 120]", + "[machine_list_filename: ]", + "[machines: ]", + "[gpu_platform_id: -1]", + "[gpu_device_id: -1]", + "[num_gpu: 1]", + ] + all_param_entries = non_default_param_entries + default_param_entries + + # add device-specific entries + # + # passed-in force_col_wise / force_row_wise parameters are ignored on CUDA and GPU builds... + # https://github.com/microsoft/LightGBM/blob/1d7ee63686272bceffd522284127573b511df6be/src/io/config.cpp#L375-L377 + if getenv('TASK', '') == 'cuda': + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 1]", + "[device_type: cuda]", + "[gpu_use_dp: 1]" + ] + elif getenv('TASK', '') == 'gpu': + device_entries = [ + "[force_col_wise: 1]", + "[force_row_wise: 0]", + "[device_type: gpu]", + "[gpu_use_dp: 0]" + ] + else: + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 0]", + "[device_type: cpu]", + "[gpu_use_dp: 0]" + ] + + all_param_entries += device_entries + + # check that model text has all expected param entries + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + # since Booster.model_to_string() is used when pickling, check that parameters all + # roundtrip pickling successfully too + gbm_pkl = pickle_and_unpickle_object(gbm, serializer="joblib") + model_txt_from_memory = gbm_pkl.model_to_string() + model_file = tmp_path / "out-pkl.model" + gbm_pkl.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + def test_pandas_categorical(): pd = pytest.importorskip("pandas") np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index e41719845c0a..2247c9a512d2 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1561,3 +1561,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ) preds = model.predict(X) assert spearmanr(preds, y).correlation >= 0.99 + + +def test_classifier_fit_detects_classes_every_time(): + rng = np.random.default_rng(seed=123) + nrows = 1000 + ncols = 20 + + X = rng.standard_normal(size=(nrows, ncols)) + y_bin = (rng.random(size=nrows) <= .3).astype(np.float64) + y_multi = rng.integers(4, size=nrows) + + model = lgb.LGBMClassifier(verbose=-1) + for _ in range(2): + model.fit(X, y_multi) + assert model.objective_ == "multiclass" + model.fit(X, y_bin) + assert model.objective_ == "binary"
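
Editor's note: the callback.py and __init__.py changes above newly export EarlyStopException so user-written callbacks can stop training early. Below is a minimal sketch of that usage, closely following the new test_early_stopping_can_be_triggered_via_custom_callback test; the synthetic data and parameter values are illustrative only and are not part of the patch.

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(seed=123)
X = rng.standard_normal(size=(100, 5))
y = rng.standard_normal(size=100)

def _early_stop_after_seventh_iteration(env):
    # env.iteration is 0-based, so iteration 6 is the seventh boosting round
    if env.iteration == 6:
        raise lgb.EarlyStopException(
            best_iteration=6,  # also 0-based, per the updated docstring
            best_score=[("some_validation_set", "some_metric", 0.708, True)]
        )

bst = lgb.train(
    params={"objective": "regression", "verbose": -1, "num_leaves": 2},
    train_set=lgb.Dataset(X, label=y),
    num_boost_round=23,
    callbacks=[_early_stop_after_seventh_iteration],
)
assert bst.num_trees() == 7      # training stopped after the seventh iteration
assert bst.best_iteration == 7   # Booster.best_iteration is reported 1-based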
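
Editor's note: for the engine.py typing change above, here is a standalone sketch (not the library function itself) of what _agg_cv_result() produces after this patch: per-fold (dataset_name, metric_name, value, is_higher_better) tuples are reduced to ('cv_agg', key, mean, is_higher_better, std) 5-tuples matching _LGBM_BoosterEvalMethodResultWithStandardDeviationType, with numpy scalars cast to plain Python floats. The key format and the fold values below are illustrative assumptions.

from collections import OrderedDict
from typing import Dict, List, Tuple
import numpy as np

def _agg_cv_result_sketch(
    raw_results: List[List[Tuple[str, str, float, bool]]]
) -> List[Tuple[str, str, float, bool, float]]:
    cvmap: Dict[str, List[float]] = OrderedDict()
    metric_type: Dict[str, bool] = {}
    for one_result in raw_results:
        for one_line in one_result:
            key = f"{one_line[0]} {one_line[1]}"  # assumed key format, e.g. "valid l2"
            metric_type[key] = one_line[3]
            cvmap.setdefault(key, []).append(one_line[2])
    # float(...) ensures plain Python floats rather than numpy scalars
    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]

# two folds, one metric each
folds = [
    [("valid", "l2", 10.2, False)],
    [("valid", "l2", 9.8, False)],
]
print(_agg_cv_result_sketch(folds))
# [('cv_agg', 'valid l2', 10.0, False, 0.2)]  (values approximate)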