From c87b8e2a803a7739d52943b595e9d58fb91cc6b8 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Fri, 2 Nov 2018 13:07:36 +1100 Subject: [PATCH 01/14] Fix index out-of-range exception generated by BaggingHelper on small datasets. Prior to this change, the line "score_t threshold = tmp_gradients[top_k - 1];" would generate an exception, since tmp_gradients would be empty when the cnt input value to the function is zero. --- src/boosting/goss.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index c1ca8298742c..dcec193ba9b7 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -150,6 +150,7 @@ class GOSS: public GBDT { if (cur_start > num_data_) { continue; } data_size_t cur_cnt = inner_size; if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; } + if (cur_cnt == 0) { continue; } Random cur_rand(config_->bagging_seed + iter * num_threads_ + i); data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start); From 48b6e2ceca571164df771ab327d5b0af8b1254b3 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 6 Nov 2018 10:21:32 +0800 Subject: [PATCH 02/14] Update goss.hpp --- src/boosting/goss.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index dcec193ba9b7..792141275343 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -86,6 +86,9 @@ class GOSS: public GBDT { } data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) { + if (cnt == 0) { + return 0; + } std::vector tmp_gradients(cnt, 0.0f); for (data_size_t i = 0; i < cnt; ++i) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -150,7 +153,6 @@ class GOSS: public GBDT { if (cur_start > num_data_) { continue; } data_size_t cur_cnt = inner_size; if (cur_start + cur_cnt > 
num_data_) { cur_cnt = num_data_ - cur_start; } - if (cur_cnt == 0) { continue; } Random cur_rand(config_->bagging_seed + iter * num_threads_ + i); data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start); From 182e9249a70202e470cb1bea83edd73f379402e2 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 6 Nov 2018 10:21:57 +0800 Subject: [PATCH 03/14] Update goss.hpp --- src/boosting/goss.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 792141275343..1e0c451f6a8e 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -86,7 +86,7 @@ class GOSS: public GBDT { } data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) { - if (cnt == 0) { + if (cnt <= 0) { return 0; } std::vector tmp_gradients(cnt, 0.0f); From c3ab42f39aab41be7fc238228c83dcefd3b49102 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Wed, 13 Feb 2019 09:59:07 +1100 Subject: [PATCH 04/14] Add API method LGBM_BoosterPredictForMats which runs prediction on a data set given as of array of pointers to rows (as opposed to existing method LGBM_BoosterPredictForMat which requires data given as contiguous array) --- include/LightGBM/c_api.h | 31 +++++++++++++++++++++++++++++ src/c_api.cpp | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 09e0c07b9a9f..f092c5af251c 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -711,6 +711,37 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! 
+* \brief make prediction for a new data set given as an array of pointers to rows +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class * num_data +* for leaf index, its length is equal to num_class * num_data * num_iteration +* \param handle handle +* \param data pointer to an array of nrow row pointers, each row pointing to ncol values +* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64 +* \param nrow number of rows +* \param ncol number of columns +* \param predict_type +* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed) +* C_API_PREDICT_RAW_SCORE: raw score +* C_API_PREDICT_LEAF_INDEX: leaf index +* \param num_iteration number of iterations for prediction, <= 0 means no limit +* \param parameter Other parameters for prediction, e.g. early stopping for prediction. +* \param out_len length of output result +* \param out_result used to set a pointer to array, should allocate memory before calling this function +* \return 0 on success, -1 on failure +*/ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle, + const void** data, + int data_type, + int32_t nrow, + int32_t ncol, + int predict_type, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result); + /*! 
* \brief save model into file * \param handle handle diff --git a/src/c_api.cpp b/src/c_api.cpp index 4ffdb328b711..31bb52d99772 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -350,6 +350,9 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_ std::function>(int row_idx)> RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major); +std::function>(int row_idx)> +RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type); + std::function>(int idx)> RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t nindptr, int64_t nelem); @@ -1232,6 +1235,30 @@ int LGBM_BoosterPredictForMat(BoosterHandle handle, API_END(); } +int LGBM_BoosterPredictForMats(BoosterHandle handle, + const void** data, + int data_type, + int32_t nrow, + int32_t ncol, + int predict_type, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result) { + API_BEGIN(); + auto param = Config::Str2Map(parameter); + Config config; + config.Set(param); + if (config.num_threads > 0) { + omp_set_num_threads(config.num_threads); + } + Booster* ref_booster = reinterpret_cast(handle); + auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); + ref_booster->Predict(num_iteration, predict_type, nrow, get_row_fun, + config, out_result, out_len); + API_END(); +} + int LGBM_BoosterSaveModel(BoosterHandle handle, int start_iteration, int num_iteration, @@ -1405,6 +1432,22 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d return nullptr; } +// data is array of pointers to individual rows +std::function>(int row_idx)> +RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) { + return [=](int row_idx) { + auto inner_function = RowFunctionFromDenseMatric(data[row_idx], 1, num_col, data_type, /* is_row_major */ true); + auto raw_values = inner_function(0); + 
std::vector> ret; + for (int i = 0; i < static_cast(raw_values.size()); ++i) { + if (std::fabs(raw_values[i]) > kZeroThreshold || std::isnan(raw_values[i])) { + ret.emplace_back(i, raw_values[i]); + } + } + return ret; + }; +} + std::function>(int idx)> RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t , int64_t ) { if (data_type == C_API_DTYPE_FLOAT32) { From fbf86372fd846cbdbff89d0359357168e760d0b8 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Wed, 13 Feb 2019 10:07:31 +1100 Subject: [PATCH 05/14] Fix incorrect upstream merge --- src/boosting/goss.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 4b361be2b45c..6a310afaf298 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -152,7 +152,6 @@ class GOSS: public GBDT { if (cur_start > num_data_) { continue; } data_size_t cur_cnt = inner_size; if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; } - if (cur_cnt == 0) { continue; } Random cur_rand(config_->bagging_seed + iter * num_threads_ + i); data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt, tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start); From 84c3af868a91739a72950c610ddbf5d061015bba Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Wed, 20 Feb 2019 09:10:28 +1100 Subject: [PATCH 06/14] Add link to LightGBM.NET --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0ce3c206ca96..e575cd9d25a8 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,8 @@ MMLSpark (Spark-package): https://github.com/Azure/mmlspark ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning +LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net + Dask-LightGBM (distributed and parallel Python-package): https://github.com/dask/dask-lightgbm Get Started and Documentation From af9c9425ad075af6bbfccebb4b16643f7cb4cf29 
Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Thu, 14 Mar 2019 14:00:56 +1100 Subject: [PATCH 07/14] Fix indenting to 2 spaces --- src/c_api.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 31bb52d99772..be739a8d3d70 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1435,17 +1435,17 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d // data is array of pointers to individual rows std::function>(int row_idx)> RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) { - return [=](int row_idx) { - auto inner_function = RowFunctionFromDenseMatric(data[row_idx], 1, num_col, data_type, /* is_row_major */ true); - auto raw_values = inner_function(0); - std::vector> ret; - for (int i = 0; i < static_cast(raw_values.size()); ++i) { - if (std::fabs(raw_values[i]) > kZeroThreshold || std::isnan(raw_values[i])) { - ret.emplace_back(i, raw_values[i]); - } - } - return ret; - }; + return [=](int row_idx) { + auto inner_function = RowFunctionFromDenseMatric(data[row_idx], 1, num_col, data_type, /* is_row_major */ true); + auto raw_values = inner_function(0); + std::vector> ret; + for (int i = 0; i < static_cast(raw_values.size()); ++i) { + if (std::fabs(raw_values[i]) > kZeroThreshold || std::isnan(raw_values[i])) { + ret.emplace_back(i, raw_values[i]); + } + } + return ret; + }; } std::function>(int idx)> From c75f75613c5b4a1f50d05cb7c34210cd89c4d835 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Mon, 18 Mar 2019 14:39:17 +1100 Subject: [PATCH 08/14] Dummy edit to trigger CI --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e575cd9d25a8..dcc986667426 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ MMLSpark (Spark-package): https://github.com/Azure/mmlspark ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning -LightGBM.NET (.NET/C#-package): 
https://github.com/rca22/LightGBM.Net +LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net Dask-LightGBM (distributed and parallel Python-package): https://github.com/dask/dask-lightgbm From 25a7a2961adde46c023f973b4516b68e07ce49f8 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Fri, 22 Mar 2019 09:45:31 +1100 Subject: [PATCH 09/14] Dummy edit to trigger CI --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dcc986667426..e575cd9d25a8 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ MMLSpark (Spark-package): https://github.com/Azure/mmlspark ML.NET (.NET/C#-package): https://github.com/dotnet/machinelearning -LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net +LightGBM.NET (.NET/C#-package): https://github.com/rca22/LightGBM.Net Dask-LightGBM (distributed and parallel Python-package): https://github.com/dask/dask-lightgbm From 6272a30f993e194c93f460bdc0c6958cb1c2a4f0 Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Wed, 10 Feb 2021 13:32:09 +1100 Subject: [PATCH 10/14] remove duplicate functions from merge --- src/c_api.cpp | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/src/c_api.cpp b/src/c_api.cpp index 4ce4f7647523..47d7f8fef8de 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2200,30 +2200,6 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, API_END(); } -int LGBM_BoosterPredictForMats(BoosterHandle handle, - const void** data, - int data_type, - int32_t nrow, - int32_t ncol, - int predict_type, - int num_iteration, - const char* parameter, - int64_t* out_len, - double* out_result) { - API_BEGIN(); - auto param = Config::Str2Map(parameter); - Config config; - config.Set(param); - if (config.num_threads > 0) { - omp_set_num_threads(config.num_threads); - } - Booster* ref_booster = reinterpret_cast(handle); - auto get_row_fun = RowPairFunctionFromDenseRows(data, ncol, data_type); - 
ref_booster->Predict(num_iteration, predict_type, nrow, get_row_fun, - config, out_result, out_len); - API_END(); -} - int LGBM_BoosterSaveModel(BoosterHandle handle, int start_iteration, int num_iteration, @@ -2412,22 +2388,6 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d return nullptr; } -// data is array of pointers to individual rows -std::function>(int row_idx)> -RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) { - return [=](int row_idx) { - auto inner_function = RowFunctionFromDenseMatric(data[row_idx], 1, num_col, data_type, /* is_row_major */ true); - auto raw_values = inner_function(0); - std::vector> ret; - for (int i = 0; i < static_cast(raw_values.size()); ++i) { - if (std::fabs(raw_values[i]) > kZeroThreshold || std::isnan(raw_values[i])) { - ret.emplace_back(i, raw_values[i]); - } - } - return ret; - }; -} - // data is array of pointers to individual rows std::function>(int row_idx)> RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) { From fd2a504893ef46fb33118306e045c1ba0681c0ee Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Fri, 12 Feb 2021 14:14:36 +1100 Subject: [PATCH 11/14] Fix parsing of non-finite values. Current implementation silently returns zero when input string is "inf", "-inf", or "nan" when compiled with VS2017, so instead just explicitly check for these values and fail if there is no match. No attempt to optimise string allocations in this implementation since it is usually rarely invoked. 
--- include/LightGBM/utils/common.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index dd71782fb1ae..71dedfc75bff 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1082,12 +1082,18 @@ struct __StringToTHelper { // Fast (common) path: For numeric inputs in RFC 7159 format: const bool fast_parse_succeeded = fast_double_parser::parse_number(str.c_str(), &tmp); - // Rare path: Not in RFC 7159 format. Possible "inf", "nan", etc. Fallback to standard library: + // Rare path: Not in RFC 7159 format. Possible "inf", "nan", etc. if (!fast_parse_succeeded) { - std::stringstream ss; - Common::C_stringstream(ss); - ss << str; - ss >> tmp; + std::string strlower(str); + std::transform(strlower.begin(), strlower.end(), strlower.begin(), [](int c) -> char { return static_cast(::tolower(c)); }); + if (strlower == std::string("inf")) + tmp = std::numeric_limits::infinity(); + else if (strlower == std::string("-inf")) + tmp = -std::numeric_limits::infinity(); + else if (strlower == std::string("nan")) + tmp = std::numeric_limits::quiet_NaN(); + else + Log::Fatal("Failed to parse double: %s", str.c_str()); } return static_cast(tmp); From 108c6de58494d312acf7fc05741e9cc6d1dba2d3 Mon Sep 17 00:00:00 2001 From: mjmckp Date: Fri, 12 Feb 2021 21:54:02 +1100 Subject: [PATCH 12/14] Dummy commit to trigger CI --- include/LightGBM/utils/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 71dedfc75bff..6722331c0e3c 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1084,7 +1084,7 @@ struct __StringToTHelper { // Rare path: Not in RFC 7159 format. Possible "inf", "nan", etc. 
if (!fast_parse_succeeded) { - std::string strlower(str); + std::string strlower(str); std::transform(strlower.begin(), strlower.end(), strlower.begin(), [](int c) -> char { return static_cast(::tolower(c)); }); if (strlower == std::string("inf")) tmp = std::numeric_limits::infinity(); From be702c03234d4bc0508314e14ba417420d6506cb Mon Sep 17 00:00:00 2001 From: matthew-peacock Date: Sun, 14 Feb 2021 13:53:18 +1100 Subject: [PATCH 13/14] Also handle -nan in double parsing method --- include/LightGBM/utils/common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 71dedfc75bff..43573573d3eb 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1092,6 +1092,8 @@ struct __StringToTHelper { tmp = -std::numeric_limits::infinity(); else if (strlower == std::string("nan")) tmp = std::numeric_limits::quiet_NaN(); + else if (strlower == std::string("-nan")) + tmp = -std::numeric_limits::quiet_NaN(); else Log::Fatal("Failed to parse double: %s", str.c_str()); } From 4f06459c66196a268ea6352d8afabd4a7e4f5ceb Mon Sep 17 00:00:00 2001 From: mjmckp Date: Thu, 18 Feb 2021 13:11:17 +1100 Subject: [PATCH 14/14] Update include/LightGBM/utils/common.h Remove trailing whitespace to pass linting tests Co-authored-by: Nikita Titov --- include/LightGBM/utils/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index c4cb64672c99..43573573d3eb 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -1084,7 +1084,7 @@ struct __StringToTHelper { // Rare path: Not in RFC 7159 format. Possible "inf", "nan", etc. 
if (!fast_parse_succeeded) { - std::string strlower(str); + std::string strlower(str); std::transform(strlower.begin(), strlower.end(), strlower.begin(), [](int c) -> char { return static_cast(::tolower(c)); }); if (strlower == std::string("inf")) tmp = std::numeric_limits::infinity();