From 7399fe1b7349a57050d260a0e2e13e2946240cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 12 Aug 2022 01:22:01 -0500 Subject: [PATCH 01/23] initial work to retrieve parameters from loaded booster --- include/LightGBM/boosting.h | 2 ++ include/LightGBM/c_api.h | 6 ++++++ python-package/lightgbm/basic.py | 22 ++++++++++++++++++++++ src/boosting/gbdt.h | 16 ++++++++++++++++ src/c_api.cpp | 15 +++++++++++++++ 5 files changed, 61 insertions(+) diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 7530495c0e17..fd2e6330869c 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -313,6 +313,8 @@ class LIGHTGBM_EXPORT Boosting { */ static Boosting* CreateBoosting(const std::string& type, const char* filename); + virtual std::string GetParameters() const = 0; + virtual bool IsLinear() const { return false; } virtual std::string ParserConfigStr() const = 0; diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 8e4d8d4d8602..f287eee7831b 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -497,6 +497,12 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterLoadModelFromString(const char* model_str, int* out_num_iterations, BoosterHandle* out); +LIGHTGBM_C_EXPORT int LGBM_BoosterGetParameters(BoosterHandle handle, + int64_t buffer_len, + int64_t* out_len, + char* out_str); + + /*! * \brief Free space for booster. * \param handle Handle of booster to be freed diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index a5e1bfb0a41e..db518c502138 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2765,6 +2765,28 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) + def _get_params(self) -> Dict[str, Any]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_BoosterGetParameters( + self.handle, + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParamAliases( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + params = json.loads(string_buffer.value.decode('utf-8')) + return params + def free_dataset(self) -> "Booster": """Free Booster's Datasets. diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index f699719b525e..515f0645ce33 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -157,6 +157,22 @@ class GBDT : public GBDTBase { */ int GetCurrentIteration() const override { return static_cast(models_.size()) / num_tree_per_iteration_; } + /*! + * \brief Get parameters as a JSON string + */ + std::string GetParameters() const override { + std::stringstream str_buf; + auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); + for (auto line : lines) { + auto pair = Common::Split(line.c_str(), "[:]"); + if (pair[1] != " ") { + str_buf << pair[0] << "=" << Common::Trim(pair[1]) << "\n"; + } + } + auto map = Config::Str2Map(str_buf.str().c_str()); + return Json(map).dump(); + } + /*! * \brief Can use early stopping for prediction or not * \return True if cannot use early stopping for prediction diff --git a/src/c_api.cpp b/src/c_api.cpp index d86862060917..dcd69a72b9e8 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1624,6 +1624,21 @@ int LGBM_BoosterLoadModelFromString( API_END(); } +int LGBM_BoosterGetParameters( + BoosterHandle handle, + int64_t buffer_len, + int64_t* out_len, + char* out_str) { + API_BEGIN(); + Booster* ref_booster = reinterpret_cast(handle); + std::string params = ref_booster->GetBoosting()->GetParameters(); + *out_len = static_cast(params.size()) + 1; + if (*out_len <= buffer_len) { + std::memcpy(out_str, params.c_str(), *out_len); + } + API_END(); +} + #ifdef _MSC_VER #pragma warning(disable : 4702) #endif From 02ca63a950af672190f7bd5df172086af1383c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Sun, 14 Aug 2022 19:25:09 -0500 Subject: [PATCH 02/23] get parameter types and use to parse --- helpers/parameter_generator.py | 25 ++++++ include/LightGBM/c_api.h | 5 ++ include/LightGBM/config.h | 1 + python-package/lightgbm/basic.py | 42 +++++++++- src/boosting/gbdt.h | 3 + src/c_api.cpp | 12 +++ src/io/config_auto.cpp | 131 +++++++++++++++++++++++++++++++ 7 files changed, 216 insertions(+), 3 deletions(-) diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 9bc62b093a26..9e57ae7875a0 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -6,6 +6,7 @@ along with parameters description in LightGBM/docs/Parameters.rst file from the information in LightGBM/include/LightGBM/config.h file. """ +import re from collections import defaultdict from pathlib import Path from typing import Dict, List, Tuple @@ -373,6 +374,30 @@ def gen_parameter_code( } """ + str_to_write += """const std::string Config::ParameterTypes() { + std::stringstream str_buf; + str_buf << "{";""" + int_t_pat = re.compile(r'int\d+_t') + first = True + for x in infos: + for y in x: + if "[doc-only]" in y: + continue + param_type = int_t_pat.sub('int', y["inner_type"][0]).replace('std::', '') + name = y["name"][0] + prefix = f'\n str_buf << "' + if first: + first = False + else: + prefix += ',' + str_to_write += f'{prefix}\\"{name}\\": \\"{param_type}\\"";' + str_to_write += """ + str_buf << "}"; + return str_buf.str(); +} + +""" + str_to_write += "} // namespace LightGBM\n" with open(config_out_cpp, "w") as config_out_cpp_file: config_out_cpp_file.write(str_to_write) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index f287eee7831b..130de5953355 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -63,6 +63,11 @@ LIGHTGBM_C_EXPORT int LGBM_DumpParamAliases(int64_t buffer_len, int64_t* out_len, char* out_str); + +LIGHTGBM_C_EXPORT int LGBM_DumpParameterTypes(int64_t buffer_len, + int64_t* out_len, + char* out_str); + /*! * \brief Register a callback function for log redirecting. * \param callback The callback function to register diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index e88c4d7b70b7..69b16d24ec58 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1075,6 +1075,7 @@ struct Config { static const std::unordered_set& parameter_set(); std::vector> auc_mu_weights_matrix; std::vector> interaction_constraints_vector; + static const std::string ParameterTypes(); static const std::string DumpAliases(); private: diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index db518c502138..871f93d8f0dc 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2765,6 +2765,27 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) + def _get_param_types(self) -> Dict[str, Any]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParameterTypes( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParameterTypes( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + return json.loads(ptr_string_buffer.value.decode('utf-8')) + + def _get_params(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) @@ -2780,12 +2801,27 @@ def _get_params(self) -> Dict[str, Any]: if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParamAliases( + _safe_call(_LIB.LGBM_BoosterGetParameters( + self.handle, ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) - params = json.loads(string_buffer.value.decode('utf-8')) - return params + params = json.loads(ptr_string_buffer.value.decode('utf-8')) + ptypes = self._get_param_types() + types_dict = {'string': str, 'int': int, 'double': float, 'bool': bool} + + def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: + if 'vector' in type_name: + if not value: + return [] + eltype_name = type_name[type_name.find('<') + 1 : type_name.find('>')] + eltype = types_dict[eltype_name] + return [eltype(v) for v in value.split(',')] + eltype = types_dict[type_name] + return eltype(value) + + return {param: parse_param(value, ptypes.get(param, 'string')) for param, value in params.items()} + def free_dataset(self) -> "Booster": """Free Booster's Datasets. diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 515f0645ce33..f7710da640e9 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -161,6 +161,9 @@ class GBDT : public GBDTBase { * \brief Get parameters as a JSON string */ std::string GetParameters() const override { + if (loaded_parameter_.empty()) { + return std::string("{}"); + } std::stringstream str_buf; auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); for (auto line : lines) { diff --git a/src/c_api.cpp b/src/c_api.cpp index dcd69a72b9e8..7f365cb861bc 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -900,6 +900,18 @@ int LGBM_DumpParamAliases(int64_t buffer_len, API_END(); } +int LGBM_DumpParameterTypes(int64_t buffer_len, + int64_t* out_len, + char* out_str) { + API_BEGIN(); + std::string ptypes = Config::ParameterTypes(); + *out_len = static_cast(ptypes.size()) + 1; + if (*out_len <= buffer_len) { + std::memcpy(out_str, ptypes.c_str(), *out_len); + } + API_END(); +} + int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_BEGIN(); Log::ResetCallBack(callback); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 6c2e3cabad00..9ef6c11bd22a 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -894,4 +894,135 @@ const std::unordered_map>& Config::paramet return map; } +const std::string Config::ParameterTypes() { + std::stringstream str_buf; + str_buf << "{"; + str_buf << "\"data\": \"string\""; + str_buf << ",\"valid\": \"vector\""; + str_buf << ",\"num_iterations\": \"int\""; + str_buf << ",\"learning_rate\": \"double\""; + str_buf << ",\"num_leaves\": \"int\""; + str_buf << ",\"num_threads\": \"int\""; + str_buf << ",\"deterministic\": \"bool\""; + str_buf << ",\"force_col_wise\": \"bool\""; + str_buf << ",\"force_row_wise\": \"bool\""; + str_buf << ",\"histogram_pool_size\": \"double\""; + str_buf << ",\"max_depth\": \"int\""; + str_buf << ",\"min_data_in_leaf\": \"int\""; + str_buf << ",\"min_sum_hessian_in_leaf\": \"double\""; + str_buf << ",\"bagging_fraction\": \"double\""; + str_buf << ",\"pos_bagging_fraction\": \"double\""; + str_buf << ",\"neg_bagging_fraction\": \"double\""; + str_buf << ",\"bagging_freq\": \"int\""; + str_buf << ",\"bagging_seed\": \"int\""; + str_buf << ",\"feature_fraction\": \"double\""; + str_buf << ",\"feature_fraction_bynode\": \"double\""; + str_buf << ",\"feature_fraction_seed\": \"int\""; + str_buf << ",\"extra_trees\": \"bool\""; + str_buf << ",\"extra_seed\": \"int\""; + str_buf << ",\"early_stopping_round\": \"int\""; + str_buf << ",\"first_metric_only\": \"bool\""; + str_buf << ",\"max_delta_step\": \"double\""; + str_buf << ",\"lambda_l1\": \"double\""; + str_buf << ",\"lambda_l2\": \"double\""; + str_buf << ",\"linear_lambda\": \"double\""; + str_buf << ",\"min_gain_to_split\": \"double\""; + str_buf << ",\"drop_rate\": \"double\""; + str_buf << ",\"max_drop\": \"int\""; + str_buf << ",\"skip_drop\": \"double\""; + str_buf << ",\"xgboost_dart_mode\": \"bool\""; + str_buf << ",\"uniform_drop\": \"bool\""; + str_buf << ",\"drop_seed\": \"int\""; + str_buf << ",\"top_rate\": \"double\""; + str_buf << ",\"other_rate\": \"double\""; + str_buf << ",\"min_data_per_group\": \"int\""; + str_buf << ",\"max_cat_threshold\": \"int\""; + str_buf << ",\"cat_l2\": \"double\""; + str_buf << ",\"cat_smooth\": \"double\""; + str_buf << ",\"max_cat_to_onehot\": \"int\""; + str_buf << ",\"top_k\": \"int\""; + str_buf << ",\"monotone_constraints\": \"vector\""; + str_buf << ",\"monotone_constraints_method\": \"string\""; + str_buf << ",\"monotone_penalty\": \"double\""; + str_buf << ",\"feature_contri\": \"vector\""; + str_buf << ",\"forcedsplits_filename\": \"string\""; + str_buf << ",\"refit_decay_rate\": \"double\""; + str_buf << ",\"cegb_tradeoff\": \"double\""; + str_buf << ",\"cegb_penalty_split\": \"double\""; + str_buf << ",\"cegb_penalty_feature_lazy\": \"vector\""; + str_buf << ",\"cegb_penalty_feature_coupled\": \"vector\""; + str_buf << ",\"path_smooth\": \"double\""; + str_buf << ",\"interaction_constraints\": \"string\""; + str_buf << ",\"verbosity\": \"int\""; + str_buf << ",\"input_model\": \"string\""; + str_buf << ",\"output_model\": \"string\""; + str_buf << ",\"saved_feature_importance_type\": \"int\""; + str_buf << ",\"snapshot_freq\": \"int\""; + str_buf << ",\"linear_tree\": \"bool\""; + str_buf << ",\"max_bin\": \"int\""; + str_buf << ",\"max_bin_by_feature\": \"vector\""; + str_buf << ",\"min_data_in_bin\": \"int\""; + str_buf << ",\"bin_construct_sample_cnt\": \"int\""; + str_buf << ",\"data_random_seed\": \"int\""; + str_buf << ",\"is_enable_sparse\": \"bool\""; + str_buf << ",\"enable_bundle\": \"bool\""; + str_buf << ",\"use_missing\": \"bool\""; + str_buf << ",\"zero_as_missing\": \"bool\""; + str_buf << ",\"feature_pre_filter\": \"bool\""; + str_buf << ",\"pre_partition\": \"bool\""; + str_buf << ",\"two_round\": \"bool\""; + str_buf << ",\"header\": \"bool\""; + str_buf << ",\"label_column\": \"string\""; + str_buf << ",\"weight_column\": \"string\""; + str_buf << ",\"group_column\": \"string\""; + str_buf << ",\"ignore_column\": \"string\""; + str_buf << ",\"categorical_feature\": \"string\""; + str_buf << ",\"forcedbins_filename\": \"string\""; + str_buf << ",\"save_binary\": \"bool\""; + str_buf << ",\"precise_float_parser\": \"bool\""; + str_buf << ",\"parser_config_file\": \"string\""; + str_buf << ",\"start_iteration_predict\": \"int\""; + str_buf << ",\"num_iteration_predict\": \"int\""; + str_buf << ",\"predict_raw_score\": \"bool\""; + str_buf << ",\"predict_leaf_index\": \"bool\""; + str_buf << ",\"predict_contrib\": \"bool\""; + str_buf << ",\"predict_disable_shape_check\": \"bool\""; + str_buf << ",\"pred_early_stop\": \"bool\""; + str_buf << ",\"pred_early_stop_freq\": \"int\""; + str_buf << ",\"pred_early_stop_margin\": \"double\""; + str_buf << ",\"output_result\": \"string\""; + str_buf << ",\"convert_model_language\": \"string\""; + str_buf << ",\"convert_model\": \"string\""; + str_buf << ",\"objective_seed\": \"int\""; + str_buf << ",\"num_class\": \"int\""; + str_buf << ",\"is_unbalance\": \"bool\""; + str_buf << ",\"scale_pos_weight\": \"double\""; + str_buf << ",\"sigmoid\": \"double\""; + str_buf << ",\"boost_from_average\": \"bool\""; + str_buf << ",\"reg_sqrt\": \"bool\""; + str_buf << ",\"alpha\": \"double\""; + str_buf << ",\"fair_c\": \"double\""; + str_buf << ",\"poisson_max_delta_step\": \"double\""; + str_buf << ",\"tweedie_variance_power\": \"double\""; + str_buf << ",\"lambdarank_truncation_level\": \"int\""; + str_buf << ",\"lambdarank_norm\": \"bool\""; + str_buf << ",\"label_gain\": \"vector\""; + str_buf << ",\"metric_freq\": \"int\""; + str_buf << ",\"is_provide_training_metric\": \"bool\""; + str_buf << ",\"eval_at\": \"vector\""; + str_buf << ",\"multi_error_top_k\": \"int\""; + str_buf << ",\"auc_mu_weights\": \"vector\""; + str_buf << ",\"num_machines\": \"int\""; + str_buf << ",\"local_listen_port\": \"int\""; + str_buf << ",\"time_out\": \"int\""; + str_buf << ",\"machine_list_filename\": \"string\""; + str_buf << ",\"machines\": \"string\""; + str_buf << ",\"gpu_platform_id\": \"int\""; + str_buf << ",\"gpu_device_id\": \"int\""; + str_buf << ",\"gpu_use_dp\": \"bool\""; + str_buf << ",\"num_gpu\": \"int\""; + str_buf << "}"; + return str_buf.str(); +} + } // namespace LightGBM From c81f7682aa39f65c38c86714118d54ccfbfe36ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 15 Aug 2022 21:05:11 -0500 Subject: [PATCH 03/23] add test --- python-package/lightgbm/basic.py | 51 +++++++++++++----------- tests/python_package_test/test_engine.py | 19 +++++++++ 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 871f93d8f0dc..e49b9fdf4844 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -6,7 +6,7 @@ import warnings from collections import OrderedDict from copy import deepcopy -from functools import wraps +from functools import lru_cache, wraps from os import SEEK_END, environ from os.path import getsize from pathlib import Path @@ -444,6 +444,30 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va return params +@lru_cache +def _get_param_types() -> Dict[str, str]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParameterTypes( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParameterTypes( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + res = json.loads(ptr_string_buffer.value.decode('utf-8')) + res['categorical_feature'] = 'vector' + return res + + MAX_INT32 = (1 << 31) - 1 """Macro definition of data type in C API of LightGBM""" @@ -2722,6 +2746,8 @@ def __init__( else: raise TypeError('Need at least one training dataset or model file or model string ' 'to create Booster instance') + if model_file is not None or model_str is not None: + params = self._get_params() self.params = params def __del__(self) -> None: @@ -2765,27 +2791,6 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) - def _get_param_types(self) -> Dict[str, Any]: - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParameterTypes( - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParameterTypes( - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - return json.loads(ptr_string_buffer.value.decode('utf-8')) - - def _get_params(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) @@ -2807,7 +2812,7 @@ def _get_params(self) -> Dict[str, Any]: ctypes.byref(tmp_out_len), ptr_string_buffer)) params = json.loads(ptr_string_buffer.value.decode('utf-8')) - ptypes = self._get_param_types() + ptypes = _get_param_types() types_dict = {'string': str, 'int': int, 'double': float, 'bool': bool} def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d4852ce4a95a..1c8662b27c86 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1102,6 +1102,25 @@ def test_feature_name_with_non_ascii(): assert feature_names == gbm2.feature_name() +def test_parameters_are_loaded_from_model_file(tmp_path): + X = np.hstack([np.random.rand(100, 1), np.random.randint(0, 5, (100, 2))]) + y = np.random.rand(100) + ds = lgb.Dataset(X, y) + params = { + 'num_leaves': 5, + 'bagging_fraction': 0.8, + 'bagging_freq': 2, + 'feature_fraction': 0.7, + 'force_col_wise': True, + 'num_threads': 1, + } + model_file = tmp_path / 'model.txt' + lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2]).save_model(model_file) + bst = lgb.Booster(model_file=model_file) + assert all(bst.params[k] == params[k] for k in params) # bst.params has all parameters + assert bst.params['categorical_feature'] == [1, 2] + + def test_save_load_copy_pickle(): def train_and_predict(init_model=None, return_model=False): X, y = make_synthetic_regression() From b33d6a03d3beb2245164a1be2365b61caad591fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 14:12:59 -0500 Subject: [PATCH 04/23] True for boolean field if it's equal to '1' --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index ec7fa05b81b7..7c77f651c5c7 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2829,7 +2829,7 @@ def _get_params(self) -> Dict[str, Any]: ptr_string_buffer)) params = json.loads(ptr_string_buffer.value.decode('utf-8')) ptypes = _get_param_types() - types_dict = {'string': str, 'int': int, 'double': float, 'bool': bool} + types_dict = {'string': str, 'int': int, 'double': float, 'bool': lambda x: x == '1'} def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: if 'vector' in type_name: From c7a6a229e448e0a0eaf82c248a320bfddd625d83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 15:31:50 -0500 Subject: [PATCH 05/23] remove bound on cache --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 7c77f651c5c7..c58d65c2561e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -455,7 +455,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va return params -@lru_cache +@lru_cache(maxsize=None) def _get_param_types() -> Dict[str, str]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) From f43934e92511378ac0affec7dca8eebe5222946d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 22:48:36 -0500 Subject: [PATCH 06/23] remove duplicated code --- python-package/lightgbm/basic.py | 92 +++++++++++--------------------- 1 file changed, 32 insertions(+), 60 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index ec7fa05b81b7..59f8f91da32d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -156,6 +156,28 @@ def _safe_call(ret: int) -> None: raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) +def _get_string_from_c_api(func: Callable, booster_handle: Optional[ctypes.c_void_p] = None) -> str: + def c_api_call(buffer_len: int, out_len: ctypes.c_int64): + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + args = (ctypes.c_int64(buffer_len), ctypes.byref(out_len), ptr_string_buffer) + if booster_handle is None: + f = func(*args) + else: + f = func(booster_handle, *args) + _safe_call(f) + return ptr_string_buffer.value.decode('utf-8') + + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + res = c_api_call(buffer_len, tmp_out_len) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + res = c_api_call(actual_len, tmp_out_len) + return res + + def _is_numeric(obj: Any) -> bool: """Check whether object is a number or not, include numpy number, etc.""" try: @@ -357,25 +379,9 @@ class _ConfigAliases: @staticmethod def _get_all_param_aliases() -> Dict[str, List[str]]: - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParamAliases( - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParamAliases( - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) + aliases_str = _get_string_from_c_api(_LIB.LGBM_DumpParamAliases) aliases = json.loads( - string_buffer.value.decode('utf-8'), + aliases_str, object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} ) return aliases @@ -456,25 +462,9 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va @lru_cache -def _get_param_types() -> Dict[str, str]: - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParameterTypes( - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParameterTypes( - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - res = json.loads(ptr_string_buffer.value.decode('utf-8')) +def _get_parameter_types() -> Dict[str, str]: + types_str = _get_string_from_c_api(_LIB.LGBM_DumpParameterTypes) + res = json.loads(types_str) res['categorical_feature'] = 'vector' return res @@ -2763,7 +2753,7 @@ def __init__( raise TypeError('Need at least one training dataset or model file or model string ' 'to create Booster instance') if model_file is not None or model_str is not None: - params = self._get_params() + params = self._get_parameters() self.params = params def __del__(self) -> None: @@ -2807,28 +2797,10 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) - def _get_params(self) -> Dict[str, Any]: - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_BoosterGetParameters( - self.handle, - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_BoosterGetParameters( - self.handle, - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - params = json.loads(ptr_string_buffer.value.decode('utf-8')) - ptypes = _get_param_types() + def _get_parameters(self) -> Dict[str, Any]: + params_str = _get_string_from_c_api(_LIB.LGBM_BoosterGetParameters, self.handle) + params = json.loads(params_str) + ptypes = _get_parameter_types() types_dict = {'string': str, 'int': int, 'double': float, 'bool': bool} def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: From 7761124b822ad34f2545ed658845e033b499e60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 23:12:09 -0500 Subject: [PATCH 07/23] manually parse json string --- src/boosting/gbdt.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index f7710da640e9..aa1b10f8a7ed 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -173,7 +173,20 @@ class GBDT : public GBDTBase { } } auto map = Config::Str2Map(str_buf.str().c_str()); - return Json(map).dump(); + str_buf.str(""); + str_buf << "{"; + bool first = true; + for (auto it = map.cbegin(); it != map.cend(); ++it) { + if (first) { + first = false; + str_buf << "\""; + } else { + str_buf << ",\""; + } + str_buf << it->first << "\": \"" << it->second << "\""; + } + str_buf << "}"; + return str_buf.str(); } /*! From 26ba91f2b45ceacdb2ffb3c9fe9db8ddf70cb321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 23:29:46 -0500 Subject: [PATCH 08/23] dont create temporary map. lint --- python-package/lightgbm/basic.py | 3 +-- src/boosting/gbdt.h | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index ac952fdbff24..6dcab0085fd0 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2807,7 +2807,7 @@ def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: if 'vector' in type_name: if not value: return [] - eltype_name = type_name[type_name.find('<') + 1 : type_name.find('>')] + eltype_name = type_name[type_name.find('<') + 1:type_name.find('>')] eltype = types_dict[eltype_name] return [eltype(v) for v in value.split(',')] eltype = types_dict[type_name] @@ -2815,7 +2815,6 @@ def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: return {param: parse_param(value, ptypes.get(param, 'string')) for param, value in params.items()} - def free_dataset(self) -> "Booster": """Free Booster's Datasets. diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index aa1b10f8a7ed..a6133a2ed047 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -165,25 +165,20 @@ class GBDT : public GBDTBase { return std::string("{}"); } std::stringstream str_buf; + str_buf << "{"; auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); + bool first = true; for (auto line : lines) { auto pair = Common::Split(line.c_str(), "[:]"); if (pair[1] != " ") { - str_buf << pair[0] << "=" << Common::Trim(pair[1]) << "\n"; - } - } - auto map = Config::Str2Map(str_buf.str().c_str()); - str_buf.str(""); - str_buf << "{"; - bool first = true; - for (auto it = map.cbegin(); it != map.cend(); ++it) { - if (first) { - first = false; - str_buf << "\""; - } else { - str_buf << ",\""; + if (first) { + first = false; + str_buf << "\""; + } else { + str_buf << ",\""; + } + str_buf << pair[0] << "\": \"" << Common::Trim(pair[1]) << "\""; } - str_buf << it->first << "\": \"" << it->second << "\""; } str_buf << "}"; return str_buf.str(); From ec113c0c4c31485b56c6a175bc39dcfb0b60749c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 16 Aug 2022 23:32:19 -0500 Subject: [PATCH 09/23] add doc --- include/LightGBM/c_api.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 2fe83731330c..302cd9fdbe94 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -600,6 +600,14 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterLoadModelFromString(const char* model_str, int* out_num_iterations, BoosterHandle* out); +/*! + * \brief Get parameters as JSON string. + * \param handle Handle of booster. + * \param buffer_len Allocated space for string. + * \param[out] out_len Actual size of string. + * \param[out] out_str JSON string containing parameters. + * \return 0 when succeed, -1 when failure happens + */ LIGHTGBM_C_EXPORT int LGBM_BoosterGetParameters(BoosterHandle handle, int64_t buffer_len, int64_t* out_len, From 39c7a8ce795376dcfc2b8bc4641e47723e3ee4fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Sat, 27 Aug 2022 18:48:05 -0500 Subject: [PATCH 10/23] minor fixes --- include/LightGBM/c_api.h | 8 +++++++- src/boosting/gbdt.h | 6 +++--- tests/python_package_test/test_engine.py | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 302cd9fdbe94..e51b2119c263 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -63,7 +63,13 @@ LIGHTGBM_C_EXPORT int LGBM_DumpParamAliases(int64_t buffer_len, int64_t* out_len, char* out_str); - +/*! + * \brief Dump all parameter names with their types to JSON. + * \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer + * \param[out] out_len Actual output length + * \param[out] out_str JSON format string of parameters, should pre-allocate memory + * \return 0 when succeed, -1 when failure happens + */ LIGHTGBM_C_EXPORT int LGBM_DumpParameterTypes(int64_t buffer_len, int64_t* out_len, char* out_str); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index a6133a2ed047..200aa588c006 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -166,10 +166,10 @@ class GBDT : public GBDTBase { } std::stringstream str_buf; str_buf << "{"; - auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); + const auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); bool first = true; - for (auto line : lines) { - auto pair = Common::Split(line.c_str(), "[:]"); + for (const auto& line : lines) { + const auto pair = Common::Split(line.c_str(), "[:]"); if (pair[1] != " ") { if (first) { first = false; diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1c8662b27c86..4205173ab524 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1107,6 +1107,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path): y = np.random.rand(100) ds = lgb.Dataset(X, y) params = { + 'boosting': 'rf', 'num_leaves': 5, 'bagging_fraction': 0.8, 'bagging_freq': 2, From 0e6591b974f151eb8fb90afcf63112b250cc13fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Sat, 27 Aug 2022 21:52:28 -0500 Subject: [PATCH 11/23] revert _get_string_from_c_api. rename parameter to param --- include/LightGBM/c_api.h | 2 +- python-package/lightgbm/basic.py | 95 +++++++++++++++++++++----------- src/c_api.cpp | 2 +- 3 files changed, 64 insertions(+), 35 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index e51b2119c263..cf38bc113414 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -70,7 +70,7 @@ LIGHTGBM_C_EXPORT int LGBM_DumpParamAliases(int64_t buffer_len, * \param[out] out_str JSON format string of parameters, should pre-allocate memory * \return 0 when succeed, -1 when failure happens */ -LIGHTGBM_C_EXPORT int LGBM_DumpParameterTypes(int64_t buffer_len, +LIGHTGBM_C_EXPORT int LGBM_DumpParamTypes(int64_t buffer_len, int64_t* out_len, char* out_str); diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 6dcab0085fd0..f4253ea09af6 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -156,28 +156,6 @@ def _safe_call(ret: int) -> None: raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) -def _get_string_from_c_api(func: Callable, booster_handle: Optional[ctypes.c_void_p] = None) -> str: - def c_api_call(buffer_len: int, out_len: ctypes.c_int64): - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - args = (ctypes.c_int64(buffer_len), ctypes.byref(out_len), ptr_string_buffer) - if booster_handle is None: - f = func(*args) - else: - f = func(booster_handle, *args) - _safe_call(f) - return ptr_string_buffer.value.decode('utf-8') - - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - res = c_api_call(buffer_len, tmp_out_len) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - res = c_api_call(actual_len, tmp_out_len) - return res - - def _is_numeric(obj: Any) -> bool: """Check whether object is a number or not, include numpy number, etc.""" try: @@ -379,9 +357,25 @@ class _ConfigAliases: @staticmethod def _get_all_param_aliases() -> Dict[str, List[str]]: - aliases_str = _get_string_from_c_api(_LIB.LGBM_DumpParamAliases) + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParamAliases( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParamAliases( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) aliases = json.loads( - aliases_str, + string_buffer.value.decode('utf-8'), object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} ) return aliases @@ -462,10 +456,28 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va @lru_cache(maxsize=None) -def _get_parameter_types() -> Dict[str, str]: - types_str = _get_string_from_c_api(_LIB.LGBM_DumpParameterTypes) - res = json.loads(types_str) +def _get_param_types() -> Dict[str, str]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParamTypes( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_DumpParamTypes( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + res = json.loads(string_buffer.value.decode('utf-8')) res['categorical_feature'] = 'vector' + res['monotone_constraints'] = 'vector' + res['max_bin_by_feature'] = 'vector' return res @@ -2747,13 +2759,12 @@ def __init__( ctypes.byref(out_num_class))) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(file_name=model_file) + params = self._get_params() elif model_str is not None: self.model_from_string(model_str) else: raise TypeError('Need at least one training dataset or model file or model string ' 'to create Booster instance') - if model_file is not None or model_str is not None: - params = self._get_parameters() self.params = params def __del__(self) -> None: @@ -2797,10 +2808,28 @@ def __setstate__(self, state): state['handle'] = handle self.__dict__.update(state) - def _get_parameters(self) -> Dict[str, Any]: - params_str = _get_string_from_c_api(_LIB.LGBM_BoosterGetParameters, self.handle) - params = json.loads(params_str) - ptypes = _get_parameter_types() + def _get_params(self) -> Dict[str, Any]: + buffer_len = 1 << 20 + tmp_out_len = ctypes.c_int64(0) + string_buffer = ctypes.create_string_buffer(buffer_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_BoosterGetParameters( + self.handle, + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + _safe_call(_LIB.LGBM_BoosterGetParameters( + self.handle, + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer)) + params = json.loads(string_buffer.value.decode('utf-8')) + ptypes = _get_param_types() types_dict = {'string': str, 'int': int, 'double': float, 'bool': lambda x: x == '1'} def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: diff --git a/src/c_api.cpp b/src/c_api.cpp index 3df5900f2f37..b51f385fb942 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -900,7 +900,7 @@ int LGBM_DumpParamAliases(int64_t buffer_len, API_END(); } -int LGBM_DumpParameterTypes(int64_t buffer_len, +int LGBM_DumpParamTypes(int64_t buffer_len, int64_t* out_len, char* out_str) { API_BEGIN(); From d4e781b2a8883ba523bc7da64cf158addd14b8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Sat, 27 Aug 2022 21:54:35 -0500 Subject: [PATCH 12/23] add R-package functions --- R-package/R/aliases.R | 21 ++++++++++ R-package/R/lgb.Booster.R | 41 ++++++++++++++++++++ R-package/src/lightgbm_R.cpp | 43 +++++++++++++++++++++ R-package/src/lightgbm_R.h | 15 +++++++ R-package/tests/testthat/test_lgb.Booster.R | 21 +++++++--- 5 files changed, 135 insertions(+), 6 deletions(-) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index 0aa886ab90c2..0729fabf148a 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -63,6 +63,27 @@ return(params_to_aliases) } +# [description] List of parameter types. Wrapped in a function to take advantage of +# lazy evaluation (so it doesn't matter what order R sources files during installation). +# [return] A named list, where each key is a main LightGBM parameter and each value is a character +# vector of corresponding of their type name in C++. +.PARAMETER_TYPES <- function() { + json_str <- .Call( + LGBM_DumpParamTypes_R + ) + param_types <- jsonlite::fromJSON(json_str) + param_types["categorical_feature"] <- "vector" + param_types["monotone_constraints"] <- "vector" + param_types["max_bin_by_feature"] <- "vector" + # store in cache so the next call to `.PARAMETER_TYPES()` doesn't need to recompute this + assign( + x = "PARAMETER_TYPES" + , value = param_types + , envir = .lgb_session_cache_env + ) + return(param_types) +} + # [description] # Per https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#metric, # a few different strings can be used to indicate "no metrics". diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 5fd0ef02f229..876b6ce5b1a0 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -77,6 +77,7 @@ Booster <- R6::R6Class( LGBM_BoosterCreateFromModelfile_R , modelfile ) + params <- private$get_params(handle) } else if (!is.null(model_str)) { @@ -674,6 +675,46 @@ Booster <- R6::R6Class( }, + get_params = function(handle) { + params_str <- .Call( + LGBM_BoosterGetParameters_R + , handle + ) + params <- jsonlite::fromJSON(params_str) + param_types <- .PARAMETER_TYPES() + + type_name_to_fn <- c( + "string" = as.character + , "int" = as.integer + , "double" = as.numeric + , "bool" = function(x) x == "1" + ) + + parse_param <- function(value, type_name) { + if (grepl("vector", type_name)) { + eltype_name <- sub("vector<(.*)>", "\\1", type_name) + parse_fn <- type_name_to_fn[[eltype_name]] + values <- strsplit(value, ",") + return(lapply(values, parse_fn)) + } + parse_fn <- type_name_to_fn[[type_name]] + parse_fn(value) + } + + res <- list() + for (param_name in names(params)) { + if (param_name %in% names(param_types)) { + type_name <- param_types[[param_name]] + } else { + type_name <- "string" + } + res[param_name] <- parse_param(params[[param_name]], type_name) + } + + return(res) + + }, + inner_eval = function(data_name, data_idx, feval = NULL) { # Check for unknown dataset (over the maximum provided range) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 560622788422..aa18df8e61ad 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1019,6 +1019,47 @@ SEXP LGBM_DumpParamAliases_R() { R_API_END(); } +SEXP LGBM_BoosterGetParameters_R(SEXP handle) { + SEXP cont_token = PROTECT(R_MakeUnwindCont()); + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + SEXP params_str; + int64_t out_len = 0; + int64_t buf_len = 1024 * 1024; + std::vector inner_char_buf(buf_len); + CHECK_CALL(LGBM_BoosterGetParameters(R_ExternalPtrAddr(handle), buf_len, &out_len, inner_char_buf.data())); + // if aliases string was larger than the initial buffer, allocate a bigger buffer and try again + if (out_len > buf_len) { + inner_char_buf.resize(out_len); + CHECK_CALL(LGBM_BoosterGetParameters(R_ExternalPtrAddr(handle), out_len, &out_len, inner_char_buf.data())); + } + params_str = PROTECT(safe_R_string(static_cast(1), &cont_token)); + SET_STRING_ELT(params_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token)); + UNPROTECT(2); + return params_str; + R_API_END(); +} + +SEXP LGBM_DumpParamTypes_R() { + SEXP cont_token = PROTECT(R_MakeUnwindCont()); + R_API_BEGIN(); + SEXP types_str; + int64_t out_len = 0; + int64_t buf_len = 1024 * 1024; + std::vector inner_char_buf(buf_len); + CHECK_CALL(LGBM_DumpParamTypes(buf_len, &out_len, inner_char_buf.data())); + // if aliases string was larger than the initial buffer, allocate a bigger buffer and try again + if (out_len > buf_len) { + inner_char_buf.resize(out_len); + CHECK_CALL(LGBM_DumpParamTypes(out_len, &out_len, inner_char_buf.data())); + } + types_str = PROTECT(safe_R_string(static_cast(1), &cont_token)); + SET_STRING_ELT(types_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token)); + UNPROTECT(2); + return types_str; + R_API_END(); +} + // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1056,6 +1097,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterGetEvalNames_R" , (DL_FUNC) &LGBM_BoosterGetEvalNames_R , 1}, {"LGBM_BoosterGetEval_R" , (DL_FUNC) &LGBM_BoosterGetEval_R , 3}, {"LGBM_BoosterGetNumPredict_R" , (DL_FUNC) &LGBM_BoosterGetNumPredict_R , 3}, + {"LGBM_BoosterGetParameters_R" , (DL_FUNC) &LGBM_BoosterGetParameters_R , 1}, {"LGBM_BoosterGetPredict_R" , (DL_FUNC) &LGBM_BoosterGetPredict_R , 3}, {"LGBM_BoosterPredictForFile_R" , (DL_FUNC) &LGBM_BoosterPredictForFile_R , 10}, {"LGBM_BoosterCalcNumPredict_R" , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R , 8}, @@ -1067,6 +1109,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, + {"LGBM_DumpParamTypes_R" , (DL_FUNC) &LGBM_DumpParamTypes_R , 0}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 0f2a0949b61c..d3f4db40cd5a 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -266,6 +266,15 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterLoadModelFromString_R( SEXP model_str ); +/*! +* \brief Get parameters as JSON string. +* \param handle Booster handle +* \return R character vector (length=1) with parameters in JSON format +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetParameters_R( + SEXP handle +); + /*! * \brief Merge model in two Boosters to first handle * \param handle handle primary Booster handle, will merge other handle to this @@ -650,4 +659,10 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); +/*! +* \brief Dump parameter types to JSON +* \return R character vector (length=1) with types JSON +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamTypes_R(); + #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 8208ef416a65..1f4683ba5385 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -172,15 +172,21 @@ test_that("Loading a Booster from a text file works", { data(agaricus.test, package = "lightgbm") train <- agaricus.train test <- agaricus.test + params <- list( + num_leaves = 4L + , boosting = "rf" + , bagging_fraction = 0.8 + , bagging_freq = 1L + , force_col_wise = TRUE + , categorical_feature = c(1L, 2L) + , learning_rate = 1.0 + , objective = "binary" + , verbosity = VERBOSITY + ) bst <- lightgbm( data = as.matrix(train$data) , label = train$label - , params = list( - num_leaves = 4L - , learning_rate = 1.0 - , objective = "binary" - , verbose = VERBOSITY - ) + , params = params , nrounds = 2L ) expect_true(lgb.is.Booster(bst)) @@ -199,6 +205,9 @@ test_that("Loading a Booster from a text file works", { ) pred2 <- predict(bst2, test$data) expect_identical(pred, pred2) + + # check that the parameters are loaded correctly + expect_identical(bst2$params[names(params)], params) }) test_that("boosters with linear models at leaves can be written to text file and re-loaded successfully", { From 483a3f48e67d08899e07dced1a43e8e201a371cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 29 Aug 2022 00:11:39 -0500 Subject: [PATCH 13/23] rename functions to BoosterGetLoadedParam. override array parameters. check interaction constraints are properly loaded --- R-package/R/aliases.R | 3 -- R-package/R/lgb.Booster.R | 39 ++++++++++++--------- R-package/src/lightgbm_R.cpp | 8 ++--- R-package/src/lightgbm_R.h | 2 +- R-package/tests/testthat/test_lgb.Booster.R | 3 +- helpers/parameter_generator.py | 15 ++++++-- include/LightGBM/boosting.h | 2 +- include/LightGBM/c_api.h | 8 ++--- python-package/lightgbm/basic.py | 26 +++++++------- src/boosting/gbdt.h | 8 ++--- src/c_api.cpp | 4 +-- src/io/config_auto.cpp | 15 +++++--- tests/python_package_test/test_engine.py | 5 +-- 13 files changed, 80 insertions(+), 58 deletions(-) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index 0729fabf148a..4c4a593ca47d 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -72,9 +72,6 @@ LGBM_DumpParamTypes_R ) param_types <- jsonlite::fromJSON(json_str) - param_types["categorical_feature"] <- "vector" - param_types["monotone_constraints"] <- "vector" - param_types["max_bin_by_feature"] <- "vector" # store in cache so the next call to `.PARAMETER_TYPES()` doesn't need to recompute this assign( x = "PARAMETER_TYPES" diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index e6896ff7d82b..adc4fc071c2e 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -77,7 +77,7 @@ Booster <- R6::R6Class( LGBM_BoosterCreateFromModelfile_R , modelfile ) - params <- private$get_params(handle) + params <- private$get_loaded_param(handle) } else if (!is.null(model_str)) { @@ -728,9 +728,9 @@ Booster <- R6::R6Class( }, - get_params = function(handle) { + get_loaded_param = function(handle) { params_str <- .Call( - LGBM_BoosterGetParameters_R + LGBM_BoosterGetLoadedParam_R , handle ) params <- jsonlite::fromJSON(params_str) @@ -744,24 +744,31 @@ Booster <- R6::R6Class( ) parse_param <- function(value, type_name) { - if (grepl("vector", type_name)) { - eltype_name <- sub("vector<(.*)>", "\\1", type_name) - parse_fn <- type_name_to_fn[[eltype_name]] - values <- strsplit(value, ",") - return(lapply(values, parse_fn)) - } - parse_fn <- type_name_to_fn[[type_name]] - parse_fn(value) + if (grepl("vector", type_name)) { + eltype_name <- sub("vector<(.*)>", "\\1", type_name) + if (grepl("vector", eltype_name)) { + arr_pat <- "\\[(.*?)\\]" + matches <- regmatches(value, gregexpr(arr_pat, value))[[1L]] + # the previous returns the matches with the square brackets + matches <- sapply(matches, function(x) gsub(arr_pat, "\\1", x)) + values <- unname(sapply(matches, parse_param, eltype_name)) + } else { + parse_fn <- type_name_to_fn[[eltype_name]] + values <- parse_fn(strsplit(value, ",")[[1L]]) + } + return(values) + } + parse_fn <- type_name_to_fn[[type_name]] + parse_fn(value) } res <- list() for (param_name in names(params)) { - if (param_name %in% names(param_types)) { - type_name <- param_types[[param_name]] - } else { - type_name <- "string" + value <- parse_param(params[[param_name]], param_types[[param_name]]) + if (param_name == "interaction_constraints") { + value <- lapply(value, function(x) x + 1L) } - res[param_name] <- parse_param(params[[param_name]], type_name) + res[[param_name]] <- value } return(res) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 96658a236bdc..9755fb9d59ca 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1183,7 +1183,7 @@ SEXP LGBM_DumpParamAliases_R() { R_API_END(); } -SEXP LGBM_BoosterGetParameters_R(SEXP handle) { +SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { SEXP cont_token = PROTECT(R_MakeUnwindCont()); R_API_BEGIN(); _AssertBoosterHandleNotNull(handle); @@ -1191,11 +1191,11 @@ SEXP LGBM_BoosterGetParameters_R(SEXP handle) { int64_t out_len = 0; int64_t buf_len = 1024 * 1024; std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_BoosterGetParameters(R_ExternalPtrAddr(handle), buf_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterGetLoadedParam(R_ExternalPtrAddr(handle), buf_len, &out_len, inner_char_buf.data())); // if aliases string was larger than the initial buffer, allocate a bigger buffer and try again if (out_len > buf_len) { inner_char_buf.resize(out_len); - CHECK_CALL(LGBM_BoosterGetParameters(R_ExternalPtrAddr(handle), out_len, &out_len, inner_char_buf.data())); + CHECK_CALL(LGBM_BoosterGetLoadedParam(R_ExternalPtrAddr(handle), out_len, &out_len, inner_char_buf.data())); } params_str = PROTECT(safe_R_string(static_cast(1), &cont_token)); SET_STRING_ELT(params_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token)); @@ -1252,7 +1252,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterResetParameter_R" , (DL_FUNC) &LGBM_BoosterResetParameter_R , 2}, {"LGBM_BoosterGetNumClasses_R" , (DL_FUNC) &LGBM_BoosterGetNumClasses_R , 2}, {"LGBM_BoosterGetNumFeature_R" , (DL_FUNC) &LGBM_BoosterGetNumFeature_R , 1}, - {"LGBM_BoosterGetParameters_R" , (DL_FUNC) &LGBM_BoosterGetParameters_R , 1}, + {"LGBM_BoosterGetLoadedParam_R" , (DL_FUNC) &LGBM_BoosterGetLoadedParam_R , 1}, {"LGBM_BoosterUpdateOneIter_R" , (DL_FUNC) &LGBM_BoosterUpdateOneIter_R , 1}, {"LGBM_BoosterUpdateOneIterCustom_R" , (DL_FUNC) &LGBM_BoosterUpdateOneIterCustom_R , 4}, {"LGBM_BoosterRollbackOneIter_R" , (DL_FUNC) &LGBM_BoosterRollbackOneIter_R , 1}, diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index e3d606f40fff..7bbc1737372a 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -271,7 +271,7 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterLoadModelFromString_R( * \param handle Booster handle * \return R character vector (length=1) with parameters in JSON format */ -LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetParameters_R( +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterGetLoadedParam_R( SEXP handle ); diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 1f4683ba5385..93ccac3f2205 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -179,6 +179,7 @@ test_that("Loading a Booster from a text file works", { , bagging_freq = 1L , force_col_wise = TRUE , categorical_feature = c(1L, 2L) + , interaction_constraints = list(c(1L, 2L), 1L) , learning_rate = 1.0 , objective = "binary" , verbosity = VERBOSITY @@ -207,7 +208,7 @@ test_that("Loading a Booster from a text file works", { expect_identical(pred, pred2) # check that the parameters are loaded correctly - expect_identical(bst2$params[names(params)], params) + expect_equal(bst2$params[names(params)], params) }) test_that("boosters with linear models at leaves can be written to text file and re-loaded successfully", { diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 9e57ae7875a0..814edc682aba 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -379,12 +379,21 @@ def gen_parameter_code( str_buf << "{";""" int_t_pat = re.compile(r'int\d+_t') first = True + # the following are stored as comma separated strings but are arrays in the wrappers + overrides = { + 'categorical_feature': 'vector', + 'ignore_column': 'vector', + 'interaction_constraints': 'vector>', + } for x in infos: for y in x: - if "[doc-only]" in y: - continue - param_type = int_t_pat.sub('int', y["inner_type"][0]).replace('std::', '') name = y["name"][0] + if name == 'task': + continue + if name in overrides: + param_type = overrides[name] + else: + param_type = int_t_pat.sub('int', y["inner_type"][0]).replace('std::', '') prefix = f'\n str_buf << "' if first: first = False diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index fd2e6330869c..1bfc18b4470b 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -313,7 +313,7 @@ class LIGHTGBM_EXPORT Boosting { */ static Boosting* CreateBoosting(const std::string& type, const char* filename); - virtual std::string GetParameters() const = 0; + virtual std::string GetLoadedParam() const = 0; virtual bool IsLinear() const { return false; } diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index cf38bc113414..f777422a7402 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -614,10 +614,10 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterLoadModelFromString(const char* model_str, * \param[out] out_str JSON string containing parameters. * \return 0 when succeed, -1 when failure happens */ -LIGHTGBM_C_EXPORT int LGBM_BoosterGetParameters(BoosterHandle handle, - int64_t buffer_len, - int64_t* out_len, - char* out_str); +LIGHTGBM_C_EXPORT int LGBM_BoosterGetLoadedParam(BoosterHandle handle, + int64_t buffer_len, + int64_t* out_len, + char* out_str); /*! diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 88a91fd6390d..968769a7c26d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3,6 +3,7 @@ import abc import ctypes import json +import re import warnings from collections import OrderedDict from copy import deepcopy @@ -478,9 +479,6 @@ def _get_param_types() -> Dict[str, str]: ctypes.byref(tmp_out_len), ptr_string_buffer)) res = json.loads(string_buffer.value.decode('utf-8')) - res['categorical_feature'] = 'vector' - res['monotone_constraints'] = 'vector' - res['max_bin_by_feature'] = 'vector' return res @@ -2790,7 +2788,7 @@ def __init__( ctypes.byref(out_num_class))) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(file_name=model_file) - params = self._get_params() + params = self._get_loaded_param() elif model_str is not None: self.model_from_string(model_str) else: @@ -2839,12 +2837,12 @@ def __setstate__(self, state: Dict[str, Any]) -> None: state['handle'] = handle self.__dict__.update(state) - def _get_params(self) -> Dict[str, Any]: + def _get_loaded_param(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_BoosterGetParameters( + _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self.handle, ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), @@ -2854,7 +2852,7 @@ def _get_params(self) -> Dict[str, Any]: if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_BoosterGetParameters( + _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self.handle, ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), @@ -2865,15 +2863,17 @@ def _get_params(self) -> Dict[str, Any]: def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: if 'vector' in type_name: - if not value: - return [] - eltype_name = type_name[type_name.find('<') + 1:type_name.find('>')] - eltype = types_dict[eltype_name] - return [eltype(v) for v in value.split(',')] + eltype_name = type_name[type_name.find('<') + 1:type_name.rfind('>')] + if 'vector' in eltype_name: + values = [parse_param(v, eltype_name) for v in re.findall(r'\[(.*?)\]', value)] + else: + eltype = types_dict[eltype_name] + values = [eltype(v) for v in value.split(',')] + return values eltype = types_dict[type_name] return eltype(value) - return {param: parse_param(value, ptypes.get(param, 'string')) for param, value in params.items()} + return {param: parse_param(value, ptypes[param]) for param, value in params.items()} def free_dataset(self) -> "Booster": """Free Booster's Datasets. diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 200aa588c006..d3809a7e82a8 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -160,7 +160,7 @@ class GBDT : public GBDTBase { /*! * \brief Get parameters as a JSON string */ - std::string GetParameters() const override { + std::string GetLoadedParam() const override { if (loaded_parameter_.empty()) { return std::string("{}"); } @@ -169,15 +169,15 @@ class GBDT : public GBDTBase { const auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); bool first = true; for (const auto& line : lines) { - const auto pair = Common::Split(line.c_str(), "[:]"); - if (pair[1] != " ") { + const auto pair = Common::Split(line.c_str(), ":"); + if (pair[1] != " ]") { if (first) { first = false; str_buf << "\""; } else { str_buf << ",\""; } - str_buf << pair[0] << "\": \"" << Common::Trim(pair[1]) << "\""; + str_buf << pair[0].substr(1) << "\": \"" << pair[1].substr(1, pair[1].size() - 2) << "\""; } } str_buf << "}"; diff --git a/src/c_api.cpp b/src/c_api.cpp index b51f385fb942..8a3d3dae33ac 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1760,14 +1760,14 @@ int LGBM_BoosterLoadModelFromString( API_END(); } -int LGBM_BoosterGetParameters( +int LGBM_BoosterGetLoadedParam( BoosterHandle handle, int64_t buffer_len, int64_t* out_len, char* out_str) { API_BEGIN(); Booster* ref_booster = reinterpret_cast(handle); - std::string params = ref_booster->GetBoosting()->GetParameters(); + std::string params = ref_booster->GetBoosting()->GetLoadedParam(); *out_len = static_cast(params.size()) + 1; if (*out_len <= buffer_len) { std::memcpy(out_str, params.c_str(), *out_len); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 9ef6c11bd22a..67cab2258902 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -897,12 +897,18 @@ const std::unordered_map>& Config::paramet const std::string Config::ParameterTypes() { std::stringstream str_buf; str_buf << "{"; - str_buf << "\"data\": \"string\""; + str_buf << "\"config\": \"string\""; + str_buf << ",\"objective\": \"string\""; + str_buf << ",\"boosting\": \"string\""; + str_buf << ",\"data\": \"string\""; str_buf << ",\"valid\": \"vector\""; str_buf << ",\"num_iterations\": \"int\""; str_buf << ",\"learning_rate\": \"double\""; str_buf << ",\"num_leaves\": \"int\""; + str_buf << ",\"tree_learner\": \"string\""; str_buf << ",\"num_threads\": \"int\""; + str_buf << ",\"device_type\": \"string\""; + str_buf << ",\"seed\": \"int\""; str_buf << ",\"deterministic\": \"bool\""; str_buf << ",\"force_col_wise\": \"bool\""; str_buf << ",\"force_row_wise\": \"bool\""; @@ -952,7 +958,7 @@ const std::string Config::ParameterTypes() { str_buf << ",\"cegb_penalty_feature_lazy\": \"vector\""; str_buf << ",\"cegb_penalty_feature_coupled\": \"vector\""; str_buf << ",\"path_smooth\": \"double\""; - str_buf << ",\"interaction_constraints\": \"string\""; + str_buf << ",\"interaction_constraints\": \"vector>\""; str_buf << ",\"verbosity\": \"int\""; str_buf << ",\"input_model\": \"string\""; str_buf << ",\"output_model\": \"string\""; @@ -975,8 +981,8 @@ const std::string Config::ParameterTypes() { str_buf << ",\"label_column\": \"string\""; str_buf << ",\"weight_column\": \"string\""; str_buf << ",\"group_column\": \"string\""; - str_buf << ",\"ignore_column\": \"string\""; - str_buf << ",\"categorical_feature\": \"string\""; + str_buf << ",\"ignore_column\": \"vector\""; + str_buf << ",\"categorical_feature\": \"vector\""; str_buf << ",\"forcedbins_filename\": \"string\""; str_buf << ",\"save_binary\": \"bool\""; str_buf << ",\"precise_float_parser\": \"bool\""; @@ -1007,6 +1013,7 @@ const std::string Config::ParameterTypes() { str_buf << ",\"lambdarank_truncation_level\": \"int\""; str_buf << ",\"lambdarank_norm\": \"bool\""; str_buf << ",\"label_gain\": \"vector\""; + str_buf << ",\"metric\": \"vector\""; str_buf << ",\"metric_freq\": \"int\""; str_buf << ",\"is_provide_training_metric\": \"bool\""; str_buf << ",\"eval_at\": \"vector\""; diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 83307d83f09f..3589992cef66 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1208,12 +1208,13 @@ def test_parameters_are_loaded_from_model_file(tmp_path): y = np.random.rand(100) ds = lgb.Dataset(X, y) params = { - 'boosting': 'rf', - 'num_leaves': 5, 'bagging_fraction': 0.8, 'bagging_freq': 2, + 'boosting': 'rf', 'feature_fraction': 0.7, 'force_col_wise': True, + 'interaction_constraints': [[0, 1], [0]], + 'num_leaves': 5, 'num_threads': 1, } model_file = tmp_path / 'model.txt' From 4ab5dd42f2dff064fe43bf339901c8e529840708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 29 Aug 2022 18:45:40 -0500 Subject: [PATCH 14/23] add missing types to tests --- R-package/tests/testthat/test_lgb.Booster.R | 2 ++ tests/python_package_test/test_engine.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 93ccac3f2205..20ddf47019be 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -180,6 +180,8 @@ test_that("Loading a Booster from a text file works", { , force_col_wise = TRUE , categorical_feature = c(1L, 2L) , interaction_constraints = list(c(1L, 2L), 1L) + , feature_contri = rep(0.5, ncol(train)), + , metric = c("map", "average_precision"), , learning_rate = 1.0 , objective = "binary" , verbosity = VERBOSITY diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 3589992cef66..f0f4f28a0da2 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1211,9 +1211,11 @@ def test_parameters_are_loaded_from_model_file(tmp_path): 'bagging_fraction': 0.8, 'bagging_freq': 2, 'boosting': 'rf', + 'feature_contri': [0.5, 0.5, 0.5], 'feature_fraction': 0.7, 'force_col_wise': True, 'interaction_constraints': [[0, 1], [0]], + 'metric': ['l2', 'rmse'], 'num_leaves': 5, 'num_threads': 1, } From bd4eec08f8903a456badbe7e97e7e0b7c4451177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 29 Aug 2022 19:06:28 -0500 Subject: [PATCH 15/23] fix R params --- R-package/tests/testthat/test_lgb.Booster.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 20ddf47019be..2e9065c95698 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -180,8 +180,8 @@ test_that("Loading a Booster from a text file works", { , force_col_wise = TRUE , categorical_feature = c(1L, 2L) , interaction_constraints = list(c(1L, 2L), 1L) - , feature_contri = rep(0.5, ncol(train)), - , metric = c("map", "average_precision"), + , feature_contri = rep(0.5, ncol(train$data)) + , metric = c("mape", "average_precision") , learning_rate = 1.0 , objective = "binary" , verbosity = VERBOSITY From 9a00fdec50317430611b26d77327be9a474c9c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 29 Aug 2022 21:38:42 -0500 Subject: [PATCH 16/23] assert equal dicts --- tests/python_package_test/test_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index f0f4f28a0da2..749c7af471c6 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1222,7 +1222,8 @@ def test_parameters_are_loaded_from_model_file(tmp_path): model_file = tmp_path / 'model.txt' lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2]).save_model(model_file) bst = lgb.Booster(model_file=model_file) - assert all(bst.params[k] == params[k] for k in params) # bst.params has all parameters + set_params = {k: bst.params[k] for k in params.keys()} + assert set_params == params assert bst.params['categorical_feature'] == [1, 2] From de6ef8a42694833daea07fd26380111010065a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 30 Aug 2022 10:15:17 -0500 Subject: [PATCH 17/23] use boost_from_average as boolean param --- R-package/tests/testthat/test_lgb.Booster.R | 2 +- tests/python_package_test/test_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 2e9065c95698..0f619e0c1dc8 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -177,7 +177,7 @@ test_that("Loading a Booster from a text file works", { , boosting = "rf" , bagging_fraction = 0.8 , bagging_freq = 1L - , force_col_wise = TRUE + , boost_from_average = TRUE , categorical_feature = c(1L, 2L) , interaction_constraints = list(c(1L, 2L), 1L) , feature_contri = rep(0.5, ncol(train$data)) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 749c7af471c6..a730f9b7d635 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1213,7 +1213,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path): 'boosting': 'rf', 'feature_contri': [0.5, 0.5, 0.5], 'feature_fraction': 0.7, - 'force_col_wise': True, + 'boost_from_average': True, 'interaction_constraints': [[0, 1], [0]], 'metric': ['l2', 'rmse'], 'num_leaves': 5, From 2cec69272295f479c7157ca45ec813057290d4bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 30 Aug 2022 10:16:31 -0500 Subject: [PATCH 18/23] set boost_from_average to false --- R-package/tests/testthat/test_lgb.Booster.R | 2 +- tests/python_package_test/test_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 0f619e0c1dc8..1bd565a07345 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -177,7 +177,7 @@ test_that("Loading a Booster from a text file works", { , boosting = "rf" , bagging_fraction = 0.8 , bagging_freq = 1L - , boost_from_average = TRUE + , boost_from_average = FALSE , categorical_feature = c(1L, 2L) , interaction_constraints = list(c(1L, 2L), 1L) , feature_contri = rep(0.5, ncol(train$data)) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a730f9b7d635..f42231c9074e 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1213,7 +1213,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path): 'boosting': 'rf', 'feature_contri': [0.5, 0.5, 0.5], 'feature_fraction': 0.7, - 'boost_from_average': True, + 'boost_from_average': False, 'interaction_constraints': [[0, 1], [0]], 'metric': ['l2', 'rmse'], 'num_leaves': 5, From f066dbae4593955863f863a85fade0368d1794ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 30 Aug 2022 17:27:03 -0500 Subject: [PATCH 19/23] simplify R's parse_param --- R-package/R/lgb.Booster.R | 29 ++++++++++++++--------------- python-package/lightgbm/basic.py | 1 + 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index adc4fc071c2e..36fe6813901a 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -744,22 +744,21 @@ Booster <- R6::R6Class( ) parse_param <- function(value, type_name) { - if (grepl("vector", type_name)) { - eltype_name <- sub("vector<(.*)>", "\\1", type_name) - if (grepl("vector", eltype_name)) { - arr_pat <- "\\[(.*?)\\]" - matches <- regmatches(value, gregexpr(arr_pat, value))[[1L]] - # the previous returns the matches with the square brackets - matches <- sapply(matches, function(x) gsub(arr_pat, "\\1", x)) - values <- unname(sapply(matches, parse_param, eltype_name)) - } else { - parse_fn <- type_name_to_fn[[eltype_name]] - values <- parse_fn(strsplit(value, ",")[[1L]]) - } - return(values) + if (grepl("vector", type_name)) { + eltype_name <- sub("vector<(.*)>", "\\1", type_name) + if (grepl("vector", eltype_name)) { + # value is like "[0,1],[0]", we make it a JSON array to parse it as a list + values <- jsonlite::fromJSON(paste0("[", value, "]")) + } else { + parse_fn <- type_name_to_fn[[eltype_name]] + values <- parse_fn(strsplit(value, ",")[[1L]]) } - parse_fn <- type_name_to_fn[[type_name]] - parse_fn(value) + return(values) + } + parse_fn <- type_name_to_fn[[type_name]] + parsed_value <- parse_fn(value) + + return(parsed_value) } res <- list() diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 968769a7c26d..250b273a1428 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2865,6 +2865,7 @@ def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: if 'vector' in type_name: eltype_name = type_name[type_name.find('<') + 1:type_name.rfind('>')] if 'vector' in eltype_name: + # value is like "[0,1],[0]" values = [parse_param(v, eltype_name) for v in re.findall(r'\[(.*?)\]', value)] else: eltype = types_dict[eltype_name] From db36cb94376a254bde01d23c7ed83327e1c44941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Wed, 31 Aug 2022 03:25:43 -0500 Subject: [PATCH 20/23] parse types on cpp side --- R-package/R/aliases.R | 18 -- R-package/R/lgb.Booster.R | 38 +---- R-package/src/lightgbm_R.cpp | 21 --- R-package/src/lightgbm_R.h | 6 - helpers/parameter_generator.py | 17 +- include/LightGBM/c_api.h | 11 -- include/LightGBM/config.h | 2 +- python-package/lightgbm/basic.py | 45 +---- src/boosting/gbdt.h | 43 ++++- src/c_api.cpp | 12 -- src/io/config_auto.cpp | 271 +++++++++++++++---------------- 11 files changed, 181 insertions(+), 303 deletions(-) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index 4c4a593ca47d..0aa886ab90c2 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -63,24 +63,6 @@ return(params_to_aliases) } -# [description] List of parameter types. Wrapped in a function to take advantage of -# lazy evaluation (so it doesn't matter what order R sources files during installation). -# [return] A named list, where each key is a main LightGBM parameter and each value is a character -# vector of corresponding of their type name in C++. -.PARAMETER_TYPES <- function() { - json_str <- .Call( - LGBM_DumpParamTypes_R - ) - param_types <- jsonlite::fromJSON(json_str) - # store in cache so the next call to `.PARAMETER_TYPES()` doesn't need to recompute this - assign( - x = "PARAMETER_TYPES" - , value = param_types - , envir = .lgb_session_cache_env - ) - return(param_types) -} - # [description] # Per https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#metric, # a few different strings can be used to indicate "no metrics". diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 36fe6813901a..3240c164f41b 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -734,43 +734,11 @@ Booster <- R6::R6Class( , handle ) params <- jsonlite::fromJSON(params_str) - param_types <- .PARAMETER_TYPES() - - type_name_to_fn <- c( - "string" = as.character - , "int" = as.integer - , "double" = as.numeric - , "bool" = function(x) x == "1" - ) - - parse_param <- function(value, type_name) { - if (grepl("vector", type_name)) { - eltype_name <- sub("vector<(.*)>", "\\1", type_name) - if (grepl("vector", eltype_name)) { - # value is like "[0,1],[0]", we make it a JSON array to parse it as a list - values <- jsonlite::fromJSON(paste0("[", value, "]")) - } else { - parse_fn <- type_name_to_fn[[eltype_name]] - values <- parse_fn(strsplit(value, ",")[[1L]]) - } - return(values) - } - parse_fn <- type_name_to_fn[[type_name]] - parsed_value <- parse_fn(value) - - return(parsed_value) - } - - res <- list() - for (param_name in names(params)) { - value <- parse_param(params[[param_name]], param_types[[param_name]]) - if (param_name == "interaction_constraints") { - value <- lapply(value, function(x) x + 1L) - } - res[[param_name]] <- value + if ("interaction_constraints" %in% names(params)) { + params[["interaction_constraints"]] <- lapply(params[["interaction_constraints"]], function(x) x + 1L) } - return(res) + return(params) }, diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 9755fb9d59ca..82956daef4b9 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1204,26 +1204,6 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { R_API_END(); } -SEXP LGBM_DumpParamTypes_R() { - SEXP cont_token = PROTECT(R_MakeUnwindCont()); - R_API_BEGIN(); - SEXP types_str; - int64_t out_len = 0; - int64_t buf_len = 1024 * 1024; - std::vector inner_char_buf(buf_len); - CHECK_CALL(LGBM_DumpParamTypes(buf_len, &out_len, inner_char_buf.data())); - // if aliases string was larger than the initial buffer, allocate a bigger buffer and try again - if (out_len > buf_len) { - inner_char_buf.resize(out_len); - CHECK_CALL(LGBM_DumpParamTypes(out_len, &out_len, inner_char_buf.data())); - } - types_str = PROTECT(safe_R_string(static_cast(1), &cont_token)); - SET_STRING_ELT(types_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token)); - UNPROTECT(2); - return types_str; - R_API_END(); -} - // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1280,7 +1260,6 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, - {"LGBM_DumpParamTypes_R" , (DL_FUNC) &LGBM_DumpParamTypes_R , 0}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 7bbc1737372a..fbd2d7d6fd59 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -847,10 +847,4 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); -/*! -* \brief Dump parameter types to JSON -* \return R character vector (length=1) with types JSON -*/ -LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamTypes_R(); - #endif // LIGHTGBM_R_H_ diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 814edc682aba..407f2c73e1e3 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -374,11 +374,9 @@ def gen_parameter_code( } """ - str_to_write += """const std::string Config::ParameterTypes() { - std::stringstream str_buf; - str_buf << "{";""" + str_to_write += """const std::unordered_map& Config::ParameterTypes() { + static std::unordered_map map({""" int_t_pat = re.compile(r'int\d+_t') - first = True # the following are stored as comma separated strings but are arrays in the wrappers overrides = { 'categorical_feature': 'vector', @@ -394,15 +392,10 @@ def gen_parameter_code( param_type = overrides[name] else: param_type = int_t_pat.sub('int', y["inner_type"][0]).replace('std::', '') - prefix = f'\n str_buf << "' - if first: - first = False - else: - prefix += ',' - str_to_write += f'{prefix}\\"{name}\\": \\"{param_type}\\"";' + str_to_write += '\n {"' + name + '", "' + param_type + '"},' str_to_write += """ - str_buf << "}"; - return str_buf.str(); + }); + return map; } """ diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index f777422a7402..287826ea182c 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -63,17 +63,6 @@ LIGHTGBM_C_EXPORT int LGBM_DumpParamAliases(int64_t buffer_len, int64_t* out_len, char* out_str); -/*! - * \brief Dump all parameter names with their types to JSON. - * \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer - * \param[out] out_len Actual output length - * \param[out] out_str JSON format string of parameters, should pre-allocate memory - * \return 0 when succeed, -1 when failure happens - */ -LIGHTGBM_C_EXPORT int LGBM_DumpParamTypes(int64_t buffer_len, - int64_t* out_len, - char* out_str); - /*! * \brief Register a callback function for log redirecting. * \param callback The callback function to register diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 69b16d24ec58..c924c6b17485 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -1075,7 +1075,7 @@ struct Config { static const std::unordered_set& parameter_set(); std::vector> auc_mu_weights_matrix; std::vector> interaction_constraints_vector; - static const std::string ParameterTypes(); + static const std::unordered_map& ParameterTypes(); static const std::string DumpAliases(); private: diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 250b273a1428..f65232f9b914 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3,12 +3,11 @@ import abc import ctypes import json -import re import warnings from collections import OrderedDict from copy import deepcopy from enum import Enum -from functools import lru_cache, wraps +from functools import wraps from os import SEEK_END, environ from os.path import getsize from pathlib import Path @@ -459,29 +458,6 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va return params -@lru_cache(maxsize=None) -def _get_param_types() -> Dict[str, str]: - buffer_len = 1 << 20 - tmp_out_len = ctypes.c_int64(0) - string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParamTypes( - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) - _safe_call(_LIB.LGBM_DumpParamTypes( - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - res = json.loads(string_buffer.value.decode('utf-8')) - return res - - MAX_INT32 = (1 << 31) - 1 """Macro definition of data type in C API of LightGBM""" @@ -2857,24 +2833,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) - params = json.loads(string_buffer.value.decode('utf-8')) - ptypes = _get_param_types() - types_dict = {'string': str, 'int': int, 'double': float, 'bool': lambda x: x == '1'} - - def parse_param(value: str, type_name: str) -> Union[Any, List[Any]]: - if 'vector' in type_name: - eltype_name = type_name[type_name.find('<') + 1:type_name.rfind('>')] - if 'vector' in eltype_name: - # value is like "[0,1],[0]" - values = [parse_param(v, eltype_name) for v in re.findall(r'\[(.*?)\]', value)] - else: - eltype = types_dict[eltype_name] - values = [eltype(v) for v in value.split(',')] - return values - eltype = types_dict[type_name] - return eltype(value) - - return {param: parse_param(value, ptypes[param]) for param, value in params.items()} + return json.loads(string_buffer.value.decode('utf-8')) def free_dataset(self) -> "Booster": """Free Booster's Datasets. diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d3809a7e82a8..1883590ceffc 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -164,20 +164,47 @@ class GBDT : public GBDTBase { if (loaded_parameter_.empty()) { return std::string("{}"); } - std::stringstream str_buf; - str_buf << "{"; + const auto param_types = Config::ParameterTypes(); const auto lines = Common::Split(loaded_parameter_.c_str(), "\n"); bool first = true; + std::stringstream str_buf; + str_buf << "{"; for (const auto& line : lines) { const auto pair = Common::Split(line.c_str(), ":"); - if (pair[1] != " ]") { - if (first) { - first = false; - str_buf << "\""; + if (pair[1] == " ]") + continue; + if (first) { + first = false; + str_buf << "\""; + } else { + str_buf << ",\""; + } + const auto param = pair[0].substr(1); + const auto value_str = pair[1].substr(1, pair[1].size() - 2); + const auto param_type = param_types.at(param); + str_buf << param << "\": "; + if (param_type == "string") { + str_buf << "\"" << value_str << "\""; + } else if (param_type == "int") { + int value; + Common::Atoi(value_str.c_str(), &value); + str_buf << value; + } else if (param_type == "double") { + double value; + Common::Atof(value_str.c_str(), &value); + str_buf << value; + } else if (param_type == "bool") { + bool value = value_str == "1"; + str_buf << std::boolalpha << value; + } else if (param_type.substr(0, 6) == "vector") { + str_buf << "["; + if (param_type.substr(7, 6) == "string") { + const auto parts = Common::Split(value_str.c_str(), ","); + str_buf << "\"" << Common::Join(parts, "\",\"") << "\""; } else { - str_buf << ",\""; + str_buf << value_str; } - str_buf << pair[0].substr(1) << "\": \"" << pair[1].substr(1, pair[1].size() - 2) << "\""; + str_buf << "]"; } } str_buf << "}"; diff --git a/src/c_api.cpp b/src/c_api.cpp index 8a3d3dae33ac..20633273134e 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -900,18 +900,6 @@ int LGBM_DumpParamAliases(int64_t buffer_len, API_END(); } -int LGBM_DumpParamTypes(int64_t buffer_len, - int64_t* out_len, - char* out_str) { - API_BEGIN(); - std::string ptypes = Config::ParameterTypes(); - *out_len = static_cast(ptypes.size()) + 1; - if (*out_len <= buffer_len) { - std::memcpy(out_str, ptypes.c_str(), *out_len); - } - API_END(); -} - int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_BEGIN(); Log::ResetCallBack(callback); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 67cab2258902..a86abd3a2c1d 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -894,142 +894,141 @@ const std::unordered_map>& Config::paramet return map; } -const std::string Config::ParameterTypes() { - std::stringstream str_buf; - str_buf << "{"; - str_buf << "\"config\": \"string\""; - str_buf << ",\"objective\": \"string\""; - str_buf << ",\"boosting\": \"string\""; - str_buf << ",\"data\": \"string\""; - str_buf << ",\"valid\": \"vector\""; - str_buf << ",\"num_iterations\": \"int\""; - str_buf << ",\"learning_rate\": \"double\""; - str_buf << ",\"num_leaves\": \"int\""; - str_buf << ",\"tree_learner\": \"string\""; - str_buf << ",\"num_threads\": \"int\""; - str_buf << ",\"device_type\": \"string\""; - str_buf << ",\"seed\": \"int\""; - str_buf << ",\"deterministic\": \"bool\""; - str_buf << ",\"force_col_wise\": \"bool\""; - str_buf << ",\"force_row_wise\": \"bool\""; - str_buf << ",\"histogram_pool_size\": \"double\""; - str_buf << ",\"max_depth\": \"int\""; - str_buf << ",\"min_data_in_leaf\": \"int\""; - str_buf << ",\"min_sum_hessian_in_leaf\": \"double\""; - str_buf << ",\"bagging_fraction\": \"double\""; - str_buf << ",\"pos_bagging_fraction\": \"double\""; - str_buf << ",\"neg_bagging_fraction\": \"double\""; - str_buf << ",\"bagging_freq\": \"int\""; - str_buf << ",\"bagging_seed\": \"int\""; - str_buf << ",\"feature_fraction\": \"double\""; - str_buf << ",\"feature_fraction_bynode\": \"double\""; - str_buf << ",\"feature_fraction_seed\": \"int\""; - str_buf << ",\"extra_trees\": \"bool\""; - str_buf << ",\"extra_seed\": \"int\""; - str_buf << ",\"early_stopping_round\": \"int\""; - str_buf << ",\"first_metric_only\": \"bool\""; - str_buf << ",\"max_delta_step\": \"double\""; - str_buf << ",\"lambda_l1\": \"double\""; - str_buf << ",\"lambda_l2\": \"double\""; - str_buf << ",\"linear_lambda\": \"double\""; - str_buf << ",\"min_gain_to_split\": \"double\""; - str_buf << ",\"drop_rate\": \"double\""; - str_buf << ",\"max_drop\": \"int\""; - str_buf << ",\"skip_drop\": \"double\""; - str_buf << ",\"xgboost_dart_mode\": \"bool\""; - str_buf << ",\"uniform_drop\": \"bool\""; - str_buf << ",\"drop_seed\": \"int\""; - str_buf << ",\"top_rate\": \"double\""; - str_buf << ",\"other_rate\": \"double\""; - str_buf << ",\"min_data_per_group\": \"int\""; - str_buf << ",\"max_cat_threshold\": \"int\""; - str_buf << ",\"cat_l2\": \"double\""; - str_buf << ",\"cat_smooth\": \"double\""; - str_buf << ",\"max_cat_to_onehot\": \"int\""; - str_buf << ",\"top_k\": \"int\""; - str_buf << ",\"monotone_constraints\": \"vector\""; - str_buf << ",\"monotone_constraints_method\": \"string\""; - str_buf << ",\"monotone_penalty\": \"double\""; - str_buf << ",\"feature_contri\": \"vector\""; - str_buf << ",\"forcedsplits_filename\": \"string\""; - str_buf << ",\"refit_decay_rate\": \"double\""; - str_buf << ",\"cegb_tradeoff\": \"double\""; - str_buf << ",\"cegb_penalty_split\": \"double\""; - str_buf << ",\"cegb_penalty_feature_lazy\": \"vector\""; - str_buf << ",\"cegb_penalty_feature_coupled\": \"vector\""; - str_buf << ",\"path_smooth\": \"double\""; - str_buf << ",\"interaction_constraints\": \"vector>\""; - str_buf << ",\"verbosity\": \"int\""; - str_buf << ",\"input_model\": \"string\""; - str_buf << ",\"output_model\": \"string\""; - str_buf << ",\"saved_feature_importance_type\": \"int\""; - str_buf << ",\"snapshot_freq\": \"int\""; - str_buf << ",\"linear_tree\": \"bool\""; - str_buf << ",\"max_bin\": \"int\""; - str_buf << ",\"max_bin_by_feature\": \"vector\""; - str_buf << ",\"min_data_in_bin\": \"int\""; - str_buf << ",\"bin_construct_sample_cnt\": \"int\""; - str_buf << ",\"data_random_seed\": \"int\""; - str_buf << ",\"is_enable_sparse\": \"bool\""; - str_buf << ",\"enable_bundle\": \"bool\""; - str_buf << ",\"use_missing\": \"bool\""; - str_buf << ",\"zero_as_missing\": \"bool\""; - str_buf << ",\"feature_pre_filter\": \"bool\""; - str_buf << ",\"pre_partition\": \"bool\""; - str_buf << ",\"two_round\": \"bool\""; - str_buf << ",\"header\": \"bool\""; - str_buf << ",\"label_column\": \"string\""; - str_buf << ",\"weight_column\": \"string\""; - str_buf << ",\"group_column\": \"string\""; - str_buf << ",\"ignore_column\": \"vector\""; - str_buf << ",\"categorical_feature\": \"vector\""; - str_buf << ",\"forcedbins_filename\": \"string\""; - str_buf << ",\"save_binary\": \"bool\""; - str_buf << ",\"precise_float_parser\": \"bool\""; - str_buf << ",\"parser_config_file\": \"string\""; - str_buf << ",\"start_iteration_predict\": \"int\""; - str_buf << ",\"num_iteration_predict\": \"int\""; - str_buf << ",\"predict_raw_score\": \"bool\""; - str_buf << ",\"predict_leaf_index\": \"bool\""; - str_buf << ",\"predict_contrib\": \"bool\""; - str_buf << ",\"predict_disable_shape_check\": \"bool\""; - str_buf << ",\"pred_early_stop\": \"bool\""; - str_buf << ",\"pred_early_stop_freq\": \"int\""; - str_buf << ",\"pred_early_stop_margin\": \"double\""; - str_buf << ",\"output_result\": \"string\""; - str_buf << ",\"convert_model_language\": \"string\""; - str_buf << ",\"convert_model\": \"string\""; - str_buf << ",\"objective_seed\": \"int\""; - str_buf << ",\"num_class\": \"int\""; - str_buf << ",\"is_unbalance\": \"bool\""; - str_buf << ",\"scale_pos_weight\": \"double\""; - str_buf << ",\"sigmoid\": \"double\""; - str_buf << ",\"boost_from_average\": \"bool\""; - str_buf << ",\"reg_sqrt\": \"bool\""; - str_buf << ",\"alpha\": \"double\""; - str_buf << ",\"fair_c\": \"double\""; - str_buf << ",\"poisson_max_delta_step\": \"double\""; - str_buf << ",\"tweedie_variance_power\": \"double\""; - str_buf << ",\"lambdarank_truncation_level\": \"int\""; - str_buf << ",\"lambdarank_norm\": \"bool\""; - str_buf << ",\"label_gain\": \"vector\""; - str_buf << ",\"metric\": \"vector\""; - str_buf << ",\"metric_freq\": \"int\""; - str_buf << ",\"is_provide_training_metric\": \"bool\""; - str_buf << ",\"eval_at\": \"vector\""; - str_buf << ",\"multi_error_top_k\": \"int\""; - str_buf << ",\"auc_mu_weights\": \"vector\""; - str_buf << ",\"num_machines\": \"int\""; - str_buf << ",\"local_listen_port\": \"int\""; - str_buf << ",\"time_out\": \"int\""; - str_buf << ",\"machine_list_filename\": \"string\""; - str_buf << ",\"machines\": \"string\""; - str_buf << ",\"gpu_platform_id\": \"int\""; - str_buf << ",\"gpu_device_id\": \"int\""; - str_buf << ",\"gpu_use_dp\": \"bool\""; - str_buf << ",\"num_gpu\": \"int\""; - str_buf << "}"; - return str_buf.str(); +const std::unordered_map& Config::ParameterTypes() { + static std::unordered_map map({ + {"config", "string"}, + {"objective", "string"}, + {"boosting", "string"}, + {"data", "string"}, + {"valid", "vector"}, + {"num_iterations", "int"}, + {"learning_rate", "double"}, + {"num_leaves", "int"}, + {"tree_learner", "string"}, + {"num_threads", "int"}, + {"device_type", "string"}, + {"seed", "int"}, + {"deterministic", "bool"}, + {"force_col_wise", "bool"}, + {"force_row_wise", "bool"}, + {"histogram_pool_size", "double"}, + {"max_depth", "int"}, + {"min_data_in_leaf", "int"}, + {"min_sum_hessian_in_leaf", "double"}, + {"bagging_fraction", "double"}, + {"pos_bagging_fraction", "double"}, + {"neg_bagging_fraction", "double"}, + {"bagging_freq", "int"}, + {"bagging_seed", "int"}, + {"feature_fraction", "double"}, + {"feature_fraction_bynode", "double"}, + {"feature_fraction_seed", "int"}, + {"extra_trees", "bool"}, + {"extra_seed", "int"}, + {"early_stopping_round", "int"}, + {"first_metric_only", "bool"}, + {"max_delta_step", "double"}, + {"lambda_l1", "double"}, + {"lambda_l2", "double"}, + {"linear_lambda", "double"}, + {"min_gain_to_split", "double"}, + {"drop_rate", "double"}, + {"max_drop", "int"}, + {"skip_drop", "double"}, + {"xgboost_dart_mode", "bool"}, + {"uniform_drop", "bool"}, + {"drop_seed", "int"}, + {"top_rate", "double"}, + {"other_rate", "double"}, + {"min_data_per_group", "int"}, + {"max_cat_threshold", "int"}, + {"cat_l2", "double"}, + {"cat_smooth", "double"}, + {"max_cat_to_onehot", "int"}, + {"top_k", "int"}, + {"monotone_constraints", "vector"}, + {"monotone_constraints_method", "string"}, + {"monotone_penalty", "double"}, + {"feature_contri", "vector"}, + {"forcedsplits_filename", "string"}, + {"refit_decay_rate", "double"}, + {"cegb_tradeoff", "double"}, + {"cegb_penalty_split", "double"}, + {"cegb_penalty_feature_lazy", "vector"}, + {"cegb_penalty_feature_coupled", "vector"}, + {"path_smooth", "double"}, + {"interaction_constraints", "vector>"}, + {"verbosity", "int"}, + {"input_model", "string"}, + {"output_model", "string"}, + {"saved_feature_importance_type", "int"}, + {"snapshot_freq", "int"}, + {"linear_tree", "bool"}, + {"max_bin", "int"}, + {"max_bin_by_feature", "vector"}, + {"min_data_in_bin", "int"}, + {"bin_construct_sample_cnt", "int"}, + {"data_random_seed", "int"}, + {"is_enable_sparse", "bool"}, + {"enable_bundle", "bool"}, + {"use_missing", "bool"}, + {"zero_as_missing", "bool"}, + {"feature_pre_filter", "bool"}, + {"pre_partition", "bool"}, + {"two_round", "bool"}, + {"header", "bool"}, + {"label_column", "string"}, + {"weight_column", "string"}, + {"group_column", "string"}, + {"ignore_column", "vector"}, + {"categorical_feature", "vector"}, + {"forcedbins_filename", "string"}, + {"save_binary", "bool"}, + {"precise_float_parser", "bool"}, + {"parser_config_file", "string"}, + {"start_iteration_predict", "int"}, + {"num_iteration_predict", "int"}, + {"predict_raw_score", "bool"}, + {"predict_leaf_index", "bool"}, + {"predict_contrib", "bool"}, + {"predict_disable_shape_check", "bool"}, + {"pred_early_stop", "bool"}, + {"pred_early_stop_freq", "int"}, + {"pred_early_stop_margin", "double"}, + {"output_result", "string"}, + {"convert_model_language", "string"}, + {"convert_model", "string"}, + {"objective_seed", "int"}, + {"num_class", "int"}, + {"is_unbalance", "bool"}, + {"scale_pos_weight", "double"}, + {"sigmoid", "double"}, + {"boost_from_average", "bool"}, + {"reg_sqrt", "bool"}, + {"alpha", "double"}, + {"fair_c", "double"}, + {"poisson_max_delta_step", "double"}, + {"tweedie_variance_power", "double"}, + {"lambdarank_truncation_level", "int"}, + {"lambdarank_norm", "bool"}, + {"label_gain", "vector"}, + {"metric", "vector"}, + {"metric_freq", "int"}, + {"is_provide_training_metric", "bool"}, + {"eval_at", "vector"}, + {"multi_error_top_k", "int"}, + {"auc_mu_weights", "vector"}, + {"num_machines", "int"}, + {"local_listen_port", "int"}, + {"time_out", "int"}, + {"machine_list_filename", "string"}, + {"machines", "string"}, + {"gpu_platform_id", "int"}, + {"gpu_device_id", "int"}, + {"gpu_use_dp", "bool"}, + {"num_gpu", "int"}, + }); + return map; } } // namespace LightGBM From 9467814e7dfd3d3e772a738e7346d0ef5f43daa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Wed, 21 Sep 2022 14:02:32 -0500 Subject: [PATCH 21/23] warn about ignoring parameters passed to constructor --- python-package/lightgbm/basic.py | 2 ++ tests/python_package_test/test_engine.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index f65232f9b914..ebfd6797215d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2764,6 +2764,8 @@ def __init__( ctypes.byref(out_num_class))) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(file_name=model_file) + if params: + _log_warning('Ignoring params argument, using parameters from model file.') params = self._get_loaded_param() elif model_str is not None: self.model_from_string(model_str) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index f42231c9074e..f7fbdd777cbd 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1226,6 +1226,11 @@ def test_parameters_are_loaded_from_model_file(tmp_path): assert set_params == params assert bst.params['categorical_feature'] == [1, 2] + # check that passing parameters to the constructor raises warning and ignores them + with pytest.warns(UserWarning, match='Ignoring params argument'): + bst2 = lgb.Booster(params={'num_leaves': 7}, model_file=model_file) + assert bst.params == bst2.params + def test_save_load_copy_pickle(): def train_and_predict(init_model=None, return_model=False): From 4cbf4771d4aa2e8efccdd194f87ced813d50a2f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Sat, 24 Sep 2022 13:19:24 -0500 Subject: [PATCH 22/23] trigger ci From 17ad0c1974b0e07c901655ce00ab801fb7928aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 11 Oct 2022 09:18:05 -0500 Subject: [PATCH 23/23] trigger ci