Commit

Merge branch 'master' into ci/r-4.3

jameslamb authored Sep 13, 2023
2 parents 32269fc + ab1eaa8 commit ff7f6ca
Showing 10 changed files with 457 additions and 51 deletions.
17 changes: 17 additions & 0 deletions R-package/tests/testthat/helper.R
@@ -29,3 +29,20 @@
.LGB_VERBOSITY <- as.integer(
Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1")
)

# [description]
# test that every element of 'x' is in 'y'
#
# testthat::expect_in() is not available in the versions of {testthat}
# built for R 3.6; this shim provides a similar interface on R 3.6
.expect_in <- function(x, y) {
if (exists("expect_in")) {
expect_in(x, y)
} else {
missing_items <- x[!(x %in% y)]
if (length(missing_items) != 0L) {
error_msg <- paste0("Some expected items not found: ", toString(missing_items))
stop(error_msg)
}
}
}
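
For context, a minimal usage sketch of the shim above (the values are illustrative and not part of the diff):

# dispatches to testthat::expect_in() where available; on R 3.6 it
# falls back to the manual containment check
.expect_in(x = c("a", "b"), y = c("a", "b", "c"))  # all elements found: passes
# .expect_in(x = c("a", "z"), y = c("a", "b", "c"))
# on the fallback path, this stops with:
#   "Some expected items not found: z"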
163 changes: 146 additions & 17 deletions R-package/tests/testthat/test_lgb.Booster.R
@@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", {
data = matrix(rnorm(500L), nrow = 100L)
, label = rnorm(100L)
)
nrounds <- 4L
bst <- lgb.train(
params = list(
objective = "regression"
, metric = "l2"
objective = "mape"
, metric = c("l2", "mae")
, num_threads = .LGB_MAX_THREADS
, seed = 708L
, data_sample_strategy = "bagging"
, sub_row = 0.8234
)
, data = dtrain
, nrounds = nrounds
, nrounds = 3L
, verbose = .LGB_VERBOSITY
)

model_str <- bst$save_model_to_string()
params_in_file <- .params_from_model_string(model_str = model_str)
# entries whose values should reflect params passed to lgb.train()
non_default_param_entries <- c(
"[objective: mape]"
# 'l1' was passed in with alias 'mae'
, "[metric: l2,l1]"
, "[data_sample_strategy: bagging]"
, "[seed: 708]"
# this was passed in with alias 'sub_row'
, "[bagging_fraction: 0.8234]"
, "[num_iterations: 3]"
)

# entries with default values of params
default_param_entries <- c(
"[boosting: gbdt]"
, "[tree_learner: serial]"
, "[device_type: cpu]"
, "[data: ]"
, "[valid: ]"
, "[learning_rate: 0.1]"
, "[num_leaves: 31]"
, sprintf("[num_threads: %i]", .LGB_MAX_THREADS)
, "[deterministic: 0]"
, "[histogram_pool_size: -1]"
, "[max_depth: -1]"
, "[min_data_in_leaf: 20]"
, "[min_sum_hessian_in_leaf: 0.001]"
, "[pos_bagging_fraction: 1]"
, "[neg_bagging_fraction: 1]"
, "[bagging_freq: 0]"
, "[bagging_seed: 15415]"
, "[feature_fraction: 1]"
, "[feature_fraction_bynode: 1]"
, "[feature_fraction_seed: 32671]"
, "[extra_trees: 0]"
, "[extra_seed: 6642]"
, "[early_stopping_round: 0]"
, "[first_metric_only: 0]"
, "[max_delta_step: 0]"
, "[lambda_l1: 0]"
, "[lambda_l2: 0]"
, "[linear_lambda: 0]"
, "[min_gain_to_split: 0]"
, "[drop_rate: 0.1]"
, "[max_drop: 50]"
, "[skip_drop: 0.5]"
, "[xgboost_dart_mode: 0]"
, "[uniform_drop: 0]"
, "[drop_seed: 20623]"
, "[top_rate: 0.2]"
, "[other_rate: 0.1]"
, "[min_data_per_group: 100]"
, "[max_cat_threshold: 32]"
, "[cat_l2: 10]"
, "[cat_smooth: 10]"
, "[max_cat_to_onehot: 4]"
, "[top_k: 20]"
, "[monotone_constraints: ]"
, "[monotone_constraints_method: basic]"
, "[monotone_penalty: 0]"
, "[feature_contri: ]"
, "[forcedsplits_filename: ]"
, "[force_col_wise: 0]"
, "[force_row_wise: 0]"
, "[refit_decay_rate: 0.9]"
, "[cegb_tradeoff: 1]"
, "[cegb_penalty_split: 0]"
, "[cegb_penalty_feature_lazy: ]"
, "[cegb_penalty_feature_coupled: ]"
, "[path_smooth: 0]"
, "[interaction_constraints: ]"
, sprintf("[verbosity: %i]", .LGB_VERBOSITY)
, "[saved_feature_importance_type: 0]"
, "[use_quantized_grad: 0]"
, "[num_grad_quant_bins: 4]"
, "[quant_train_renew_leaf: 0]"
, "[stochastic_rounding: 1]"
, "[linear_tree: 0]"
, "[max_bin: 255]"
, "[max_bin_by_feature: ]"
, "[min_data_in_bin: 3]"
, "[bin_construct_sample_cnt: 200000]"
, "[data_random_seed: 2350]"
, "[is_enable_sparse: 1]"
, "[enable_bundle: 1]"
, "[use_missing: 1]"
, "[zero_as_missing: 0]"
, "[feature_pre_filter: 1]"
, "[pre_partition: 0]"
, "[two_round: 0]"
, "[header: 0]"
, "[label_column: ]"
, "[weight_column: ]"
, "[group_column: ]"
, "[ignore_column: ]"
, "[categorical_feature: ]"
, "[forcedbins_filename: ]"
, "[precise_float_parser: 0]"
, "[parser_config_file: ]"
, "[objective_seed: 4309]"
, "[num_class: 1]"
, "[is_unbalance: 0]"
, "[scale_pos_weight: 1]"
, "[sigmoid: 1]"
, "[boost_from_average: 1]"
, "[reg_sqrt: 0]"
, "[alpha: 0.9]"
, "[fair_c: 1]"
, "[poisson_max_delta_step: 0.7]"
, "[tweedie_variance_power: 1.5]"
, "[lambdarank_truncation_level: 30]"
, "[lambdarank_norm: 1]"
, "[label_gain: ]"
, "[lambdarank_position_bias_regularization: 0]"
, "[eval_at: ]"
, "[multi_error_top_k: 1]"
, "[auc_mu_weights: ]"
, "[num_machines: 1]"
, "[local_listen_port: 12400]"
, "[time_out: 120]"
, "[machine_list_filename: ]"
, "[machines: ]"
, "[gpu_platform_id: -1]"
, "[gpu_device_id: -1]"
, "[gpu_use_dp: 0]"
, "[num_gpu: 1]"
)
all_param_entries <- c(non_default_param_entries, default_param_entries)

# parameters should match what was passed from the R package
expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L)
expect_equal(sum(params_in_file == "[metric: l2]"), 1L)

expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L)
expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L)

expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L)
expect_equal(sum(params_in_file == "[objective: regression]"), 1L)

expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L)
expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L)
model_str <- bst$save_model_to_string()
params_in_file <- .params_from_model_string(model_str = model_str)
.expect_in(all_param_entries, params_in_file)

# early stopping should be off by default
expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L)
expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L)

# since save_model_to_string() is used when serializing with saveRDS(), check that parameters all
# roundtrip saveRDS()/readRDS() successfully
rds_file <- tempfile()
saveRDS(bst, rds_file)
bst_rds <- readRDS(rds_file)
model_str <- bst_rds$save_model_to_string()
params_in_file <- .params_from_model_string(model_str = model_str)
.expect_in(all_param_entries, params_in_file)
})

test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", {
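Note that .params_from_model_string() is a test helper defined elsewhere in the test suite, not in this diff. A rough sketch of the contract it needs to satisfy (the _sketch suffix marks this as hypothetical, not the real helper): extract the "[name: value]" entries from the model text.

# hypothetical sketch only; the real helper lives elsewhere in the suite
.params_from_model_string_sketch <- function(model_str) {
  lines <- strsplit(model_str, "\n", fixed = TRUE)[[1L]]
  # keep the "[name: value]" parameter entries, e.g. "[objective: mape]"
  grep("^\\[.+\\]$", lines, value = TRUE)
}
# usage: .params_from_model_string_sketch(bst$save_model_to_string())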
4 changes: 2 additions & 2 deletions helpers/parameter_generator.py
@@ -330,7 +330,7 @@ def gen_parameter_code(
str_to_write += ' std::string tmp_str = "";\n'
for x in infos:
for y in x:
if "[doc-only]" in y:
if "[no-automatically-extract]" in y:
continue
param_type = y["inner_type"][0]
name = y["name"][0]
@@ -345,7 +345,7 @@
str_to_write += " std::stringstream str_buf;\n"
for x in infos:
for y in x:
if "[doc-only]" in y or "[no-save]" in y:
if "[no-save]" in y:
continue
param_type = y["inner_type"][0]
name = y["name"][0]
36 changes: 21 additions & 15 deletions include/LightGBM/config.h
@@ -5,8 +5,13 @@
* \note
* - desc and descl2 fields must be written in reStructuredText format;
* - nested sections can be placed only at the bottom of parent's section;
* - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually;
* - [no-save] tag indicates that this param should not be saved into a model text representation.
* - [no-automatically-extract]
* - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if:
* - specialized extraction logic for this param exists in Config::GetMembersFromString()
* - [no-save]
* - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if:
* - param is only used by the CLI (especially the "predict" and "convert_model" tasks)
* - param is related to LightGBM writing files (e.g. "output_model", "save_binary")
*/
#ifndef LIGHTGBM_CONFIG_H_
#define LIGHTGBM_CONFIG_H_
@@ -97,15 +102,15 @@ struct Config {
#pragma region Core Parameters
#endif // __NVCC__

// [no-automatically-extract]
// [no-save]
// [doc-only]
// alias = config_file
// desc = path of config file
// desc = **Note**: can be used only in CLI version
std::string config = "";

// [no-automatically-extract]
// [no-save]
// [doc-only]
// type = enum
// default = train
// options = train, predict, convert_model, refit
@@ -118,7 +123,8 @@
// desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions
TaskType task = TaskType::kTrain;

// [doc-only]
// [no-automatically-extract]
// [no-save]
// type = enum
// options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg
// alias = objective_type, app, application, loss
@@ -150,7 +156,8 @@
// descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
std::string objective = "regression";

// [doc-only]
// [no-automatically-extract]
// [no-save]
// type = enum
// alias = boosting_type, boost
// options = gbdt, rf, dart
@@ -160,7 +167,7 @@
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
std::string boosting = "gbdt";

// [doc-only]
// [no-automatically-extract]
// type = enum
// options = bagging, goss
// desc = ``bagging``, Randomly Bagging Sampling
@@ -200,7 +207,8 @@
// desc = max number of leaves in one tree
int num_leaves = kDefaultNumLeaves;

// [doc-only]
// [no-automatically-extract]
// [no-save]
// type = enum
// options = serial, feature, data, voting
// alias = tree, tree_type, tree_learner_type
@@ -222,7 +230,8 @@
// desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
int num_threads = 0;

// [doc-only]
// [no-automatically-extract]
// [no-save]
// type = enum
// options = cpu, gpu, cuda
// alias = device
@@ -235,7 +244,7 @@
// desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
std::string device_type = "cpu";

// [doc-only]
// [no-automatically-extract]
// alias = random_seed, random_state
// default = None
// desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
@@ -593,7 +602,6 @@ struct Config {
// desc = **Note**: can be used only in CLI version
int snapshot_freq = -1;

// [no-save]
// desc = whether to use gradient quantization when training
// desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
// desc = with quantized training, most arithmetics in the training process will be integer operations
@@ -602,21 +610,18 @@
// desc = *New in version 4.0.0*
bool use_quantized_grad = false;

// [no-save]
// desc = number of bins used to quantize gradients and hessians
// desc = with more bins, the quantized training will be closer to full precision training
// desc = **Note**: can be used only with ``device_type = cpu``
// desc = *New in 4.0.0*
int num_grad_quant_bins = 4;

// [no-save]
// desc = whether to renew the leaf values with original gradients when quantized training
// desc = renewing is very helpful for good quantized training accuracy for ranking objectives
// desc = **Note**: can be used only with ``device_type = cpu``
// desc = *New in 4.0.0*
bool quant_train_renew_leaf = false;

// [no-save]
// desc = whether to use stochastic rounding in gradient quantization
// desc = *New in 4.0.0*
bool stochastic_rounding = true;
@@ -976,7 +981,8 @@
#pragma region Metric Parameters
#endif // __NVCC__

// [doc-only]
// [no-automatically-extract]
// [no-save]
// alias = metrics, metric_types
// default = ""
// type = multi-enum
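An observable consequence of the retagging in config.h above, checkable from the R side: parameters tagged [no-save] (for example, config and task) should not appear in the saved model text, while data_sample_strategy, which carries only [no-automatically-extract], should. A hedged sketch, reusing bst and .params_from_model_string() from the test earlier in this diff:

# sketch only: assumes 'bst' is the Booster trained in the test above
params_in_file <- .params_from_model_string(
  model_str = bst$save_model_to_string()
)
# [no-save] params are omitted from the model text entirely
expect_false(any(startsWith(params_in_file, "[config:")))
expect_false(any(startsWith(params_in_file, "[task:")))
# params without [no-save] are still written out
expect_true(any(startsWith(params_in_file, "[data_sample_strategy:")))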
4 changes: 2 additions & 2 deletions python-package/lightgbm/__init__.py
@@ -6,7 +6,7 @@
from pathlib import Path

from .basic import Booster, Dataset, Sequence, register_logger
from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter
from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter
from .engine import CVBooster, cv, train

try:
@@ -32,5 +32,5 @@
'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker',
'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException',
'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph']
1 change: 1 addition & 0 deletions python-package/lightgbm/basic.py
@@ -54,6 +54,7 @@
_LGBM_EvalFunctionResultType = Tuple[str, float, bool]
_LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]]
_LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool]
_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float]
_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"]
_LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"]
_LGBM_GroupType = Union[