diff --git a/.ci/test.sh b/.ci/test.sh index 4224864a4cae..9da8b48f4ddb 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -52,8 +52,8 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "lint" ]]; then "r-lintr>=2.0" pip install --user cpplint echo "Linting Python code" - pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget,./external_libs . || exit -1 + pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 echo "Linting R code" Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 echo "Linting C++ code" diff --git a/.gitmodules b/.gitmodules index 133ceb3889da..8f1772cd19fa 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ [submodule "include/boost/compute"] path = compute url = https://github.com/boostorg/compute +[submodule "external_libs/fmt"] + path = external_libs/fmt + url = https://github.com/fmtlib/fmt.git +[submodule "external_libs/fast_double_parser"] + path = external_libs/fast_double_parser + url = https://github.com/lemire/fast_double_parser.git diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore index c231a58a126b..2aeecfe18cc8 100644 --- a/R-package/.Rbuildignore +++ b/R-package/.Rbuildignore @@ -1,10 +1,14 @@ +\.appveyor\.yml AUTOCONF_UBUNTU_VERSION ^autom4te.cache/.*$ ^.*\.bin ^build_r.R$ +\.clang-format ^cran-comments\.md$ ^docs$ ^.*\.dll +\.drone\.yml +\.git \.gitkeep$ ^.*\.history ^Makefile$ @@ -24,3 +28,22 @@ AUTOCONF_UBUNTU_VERSION ^src/compute/.gitignore$ ^src/compute/CONTRIBUTING.md$ ^src/compute/README.md$ +src/external_libs/fast_double_parser/benchmarks +src/external_libs/fast_double_parser/Makefile +src/external_libs/fast_double_parser/.*\.md +src/external_libs/fast_double_parser/tests 
+src/external_libs/fast_double_parser/.*\.yaml +src/external_libs/fast_double_parser/.*\.yml +src/external_libs/fmt/.*\.md +src/external_libs/fmt/doc +src/external_libs/fmt/support/Android\.mk +src/external_libs/fmt/support/.*\.gradle +src/external_libs/fmt/support/.*\.pro +src/external_libs/fmt/support/.*\.py +src/external_libs/fmt/support/rtd +src/external_libs/fmt/support/.*sublime-syntax +src/external_libs/fmt/support/Vagrantfile +src/external_libs/fmt/support/.*\.xml +src/external_libs/fmt/support/.*\.yml +src/external_libs/fmt/test +\.travis\.yml diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 6e969b087c30..6e61f459c99b 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -20,6 +20,9 @@ Authors@R: c( person("Jay", "Loden", role = c("cph")), person("Dave", "Daeschler", role = c("cph")), person("Giampaolo", "Rodola", role = c("cph")), + person("Alberto", "Ferreira", role = c("ctb")), + person("Daniel", "Lemire", role = c("ctb")), + person("Victor", "Zverovich", role = c("cph")), person("IBM Corporation", role = c("ctb")) ) Description: Tree based algorithms can be improved by introducing boosting frameworks. diff --git a/R-package/README.md b/R-package/README.md index f237f14686f6..fc787faed117 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -251,6 +251,7 @@ For more information on this approach, see ["Writing R Extensions"](https://cran From the root of the repository, run the following. 
```shell +git submodule update --init --recursive sh build-cran-package.sh ``` diff --git a/build-cran-package.sh b/build-cran-package.sh index d21f5f051d16..a18c9456864b 100755 --- a/build-cran-package.sh +++ b/build-cran-package.sh @@ -28,6 +28,15 @@ cp -R R-package/* ${TEMP_R_DIR} cp -R include ${TEMP_R_DIR}/src/ cp -R src/* ${TEMP_R_DIR}/src/ +cp \ + external_libs/fast_double_parser/include/fast_double_parser.h \ + ${TEMP_R_DIR}/src/include/LightGBM + +mkdir -p ${TEMP_R_DIR}/src/include/LightGBM/fmt +cp \ + external_libs/fmt/include/fmt/*.h \ + ${TEMP_R_DIR}/src/include/LightGBM/fmt/ + cd ${TEMP_R_DIR} # Remove files not needed for CRAN @@ -67,6 +76,16 @@ cd ${TEMP_R_DIR} done find . -name '*.h.bak' -o -name '*.hpp.bak' -o -name '*.cpp.bak' -exec rm {} \; + sed \ + -i.bak \ + -e 's/\.\..*fmt\/format\.h/LightGBM\/fmt\/format\.h/' \ + src/include/LightGBM/utils/common.h + + sed \ + -i.bak \ + -e 's/\.\..*fast_double_parser\.h/LightGBM\/fast_double_parser\.h/' \ + src/include/LightGBM/utils/common.h + # When building an R package with 'configure', it seems # you're guaranteed to get a shared library called # .so/dll. 
The package source code expects diff --git a/build_r.R b/build_r.R index b3a98c45cc6a..719e622a9d01 100644 --- a/build_r.R +++ b/build_r.R @@ -135,6 +135,17 @@ result <- file.remove( ) .handle_result(result) +#------------# +# submodules # +#------------# +result <- file.copy( + from = "external_libs/" + , to = sprintf("%s/", TEMP_SOURCE_DIR) + , recursive = TRUE + , overwrite = TRUE +) +.handle_result(result) + # copy files into the place CMake expects for (src_file in c("lightgbm_R.cpp", "lightgbm_R.h", "R_object_helper.h")) { result <- file.copy( diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser new file mode 160000 index 000000000000..ace60646c02d --- /dev/null +++ b/external_libs/fast_double_parser @@ -0,0 +1 @@ +Subproject commit ace60646c02dc54c57f19d644e49a61e7e7758ec diff --git a/external_libs/fmt b/external_libs/fmt new file mode 160000 index 000000000000..cc09f1a6798c --- /dev/null +++ b/external_libs/fmt @@ -0,0 +1 @@ +Subproject commit cc09f1a6798c085c325569ef466bcdcffdc266d4 diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 07b8484b5577..dd71782fb1ae 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -2,9 +2,12 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/
-#ifndef LIGHTGBM_UTILS_COMMON_FUN_H_
-#define LIGHTGBM_UTILS_COMMON_FUN_H_
+#ifndef LIGHTGBM_UTILS_COMMON_H_
+#define LIGHTGBM_UTILS_COMMON_H_
 
+#if ((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__)))
+#include
+#endif
 #include
 #include
 
@@ -15,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -26,6 +30,12 @@
 #include
 #include
 
+#if (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))))
+#define FMT_HEADER_ONLY
+#include "../../../external_libs/fmt/include/fmt/format.h"
+#endif
+#include "../../../external_libs/fast_double_parser/include/fast_double_parser.h"
+
 #ifdef _MSC_VER
 #include
 #pragma intrinsic(_BitScanReverse)
@@ -51,6 +61,13 @@ namespace LightGBM {
 
 namespace Common {
 
+/*!
+* Imbues the stream with the classic "C" locale (std::locale::classic()), in place; returns nothing.
+*/
+inline static void C_stringstream(std::stringstream &ss) {
+  ss.imbue(std::locale::classic());
+}
+
 inline static char tolower(char in) {
   if (in <= 'Z' && in >= 'A')
     return in - ('Z' - 'z');
@@ -329,94 +346,6 @@ inline static bool AtofAndCheck(const char* p, double* out) {
   return true;
 }
 
-inline static unsigned CountDecimalDigit32(uint32_t n) {
-#if defined(_MSC_VER) || defined(__GNUC__)
-  static const uint32_t powers_of_10[] = {
-    0,
-    10,
-    100,
-    1000,
-    10000,
-    100000,
-    1000000,
-    10000000,
-    100000000,
-    1000000000
-  };
-#ifdef _MSC_VER
-  // NOLINTNEXTLINE
-  unsigned long i = 0;
-  _BitScanReverse(&i, n | 1);
-  uint32_t t = (i + 1) * 1233 >> 12;
-#elif __GNUC__
-  uint32_t t = (32 - __builtin_clz(n | 1)) * 1233 >> 12;
-#endif
-  return t - (n < powers_of_10[t]) + 1;
-#else
-  if (n < 10) return 1;
-  if (n < 100) return 2;
-  if (n < 1000) return 3;
-  if (n < 10000) return 4;
-  if (n < 100000) return 5;
-  if (n < 1000000) return 6;
-  if (n < 10000000) return 7;
-  if (n < 100000000) return 8;
-  if (n < 1000000000) return 9;
-  return 10;
-#endif
-}
-
-inline static void Uint32ToStr(uint32_t value, char* buffer) {
-  const char kDigitsLut[200] = {
-    '0', '0', '0', '1', '0',
'2', '0', '3', '0', '4', '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', - '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', - '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', - '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', - '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', - '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', - '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', - '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', - '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', - '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', '9', '5', '9', '6', '9', '7', '9', '8', '9', '9' - }; - unsigned digit = CountDecimalDigit32(value); - buffer += digit; - *buffer = '\0'; - - while (value >= 100) { - const unsigned i = (value % 100) << 1; - value /= 100; - *--buffer = kDigitsLut[i + 1]; - *--buffer = kDigitsLut[i]; - } - - if (value < 10) { - *--buffer = static_cast(value) + '0'; - } else { - const unsigned i = value << 1; - *--buffer = kDigitsLut[i + 1]; - *--buffer = kDigitsLut[i]; - } -} - -inline static void Int32ToStr(int32_t value, char* buffer) { - uint32_t u = static_cast(value); - if (value < 0) { - *buffer++ = '-'; - u = ~u + 1; - } - Uint32ToStr(u, buffer); -} - -inline static void DoubleToStr(double value, char* buffer, size_t buffer_len) { - #ifdef _MSC_VER - int num_chars = sprintf_s(buffer, buffer_len, "%.17g", value); - #else - int num_chars = snprintf(buffer, buffer_len, "%.17g", value); - #endif - CHECK_GE(num_chars, 0); -} - inline static const char* SkipSpaceAndTab(const char* p) { while (*p == ' ' || *p == '\t') { ++p; @@ -440,67 +369,6 @@ inline 
static std::vector ArrayCast(const std::vector& arr) { return ret; } -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t) const { - Int32ToStr(value, buffer); - } -}; - -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t buf_len) - const { - #ifdef _MSC_VER - int num_chars = sprintf_s(buffer, buf_len, "%g", value); - #else - int num_chars = snprintf(buffer, buf_len, "%g", value); - #endif - CHECK_GE(num_chars, 0); - } -}; - -template -struct __TToStringHelperFast { - void operator()(T value, char* buffer, size_t) const { - Uint32ToStr(value, buffer); - } -}; - -template -inline static std::string ArrayToStringFast(const std::vector& arr, size_t n) { - if (arr.empty() || n == 0) { - return std::string(""); - } - __TToStringHelperFast::value, std::is_unsigned::value> helper; - const size_t buf_len = 16; - std::vector buffer(buf_len); - std::stringstream str_buf; - helper(arr[0], buffer.data(), buf_len); - str_buf << buffer.data(); - for (size_t i = 1; i < std::min(n, arr.size()); ++i) { - helper(arr[i], buffer.data(), buf_len); - str_buf << ' ' << buffer.data(); - } - return str_buf.str(); -} - -inline static std::string ArrayToString(const std::vector& arr, size_t n) { - if (arr.empty() || n == 0) { - return std::string(""); - } - const size_t buf_len = 32; - std::vector buffer(buf_len); - std::stringstream str_buf; - DoubleToStr(arr[0], buffer.data(), buf_len); - str_buf << buffer.data(); - for (size_t i = 1; i < std::min(n, arr.size()); ++i) { - DoubleToStr(arr[i], buffer.data(), buf_len); - str_buf << ' ' << buffer.data(); - } - return str_buf.str(); -} - template struct __StringToTHelper { T operator()(const std::string& str) const { @@ -588,11 +456,14 @@ inline static std::vector StringToArrayFast(const std::string& str, int n) { } template -inline static std::string Join(const std::vector& strs, const char* delimiter) { +inline static std::string Join(const std::vector& strs, const 
char* delimiter, const bool force_C_locale = false) {
   if (strs.empty()) {
     return std::string("");
   }
   std::stringstream str_buf;
+  if (force_C_locale) {
+    C_stringstream(str_buf);
+  }
   str_buf << std::setprecision(std::numeric_limits::digits10 + 2);
   str_buf << strs[0];
   for (size_t i = 1; i < strs.size(); ++i) {
@@ -603,11 +474,14 @@ inline static std::string Join(const std::vector& strs, const char* delimiter
 }
 
 template<>
-inline std::string Join(const std::vector& strs, const char* delimiter) {
+inline std::string Join(const std::vector& strs, const char* delimiter, const bool force_C_locale) {
   if (strs.empty()) {
     return std::string("");
   }
   std::stringstream str_buf;
+  if (force_C_locale) {
+    C_stringstream(str_buf);
+  }
   str_buf << std::setprecision(std::numeric_limits::digits10 + 2);
   str_buf << static_cast(strs[0]);
   for (size_t i = 1; i < strs.size(); ++i) {
@@ -618,13 +492,16 @@ inline std::string Join(const std::vector& strs, const char* del
 }
 
 template
-inline static std::string Join(const std::vector& strs, size_t start, size_t end, const char* delimiter) {
+inline static std::string Join(const std::vector& strs, size_t start, size_t end, const char* delimiter, const bool force_C_locale = false) {
   if (end - start <= 0) {
     return std::string("");
   }
   start = std::min(start, static_cast(strs.size()) - 1);
   end = std::min(end, static_cast(strs.size()));
   std::stringstream str_buf;
+  if (force_C_locale) {
+    C_stringstream(str_buf);
+  }
   str_buf << std::setprecision(std::numeric_limits::digits10 + 2);
   str_buf << strs[start];
   for (size_t i = start + 1; i < end; ++i) {
@@ -1137,6 +1014,232 @@ class FunctionTimer {
 
 extern Common::Timer global_timer;
 
+
+/*!
+* Provides locale-independent alternatives to Common's methods.
+* Essential to make models robust to locale settings.
+*/
+namespace CommonC {
+
+template
+inline static std::string Join(const std::vector& strs, const char* delimiter) {
+  return LightGBM::Common::Join(strs, delimiter, true);
+}
+
+template
+inline static std::string Join(const std::vector& strs, size_t start, size_t end, const char* delimiter) {
+  return LightGBM::Common::Join(strs, start, end, delimiter, true);
+}
+
+inline static const char* Atof(const char* p, double* out) {
+  return LightGBM::Common::Atof(p, out);
+}
+
+template
+struct __StringToTHelperFast {
+  const char* operator()(const char*p, T* out) const {
+    return LightGBM::Common::Atoi(p, out);
+  }
+};
+
+/*!
+* \warning Beware that ``Common::Atof`` in ``__StringToTHelperFast``,
+* has **less** floating point precision than ``__StringToTHelper``.
+* Both versions are kept to maintain bit-for-bit the "legacy" LightGBM behaviour in terms of precision.
+* Check ``StringToArrayFast`` and ``StringToArray`` for more details on this.
+*/
+template
+struct __StringToTHelperFast {
+  const char* operator()(const char*p, T* out) const {
+    double tmp = 0.0;
+    auto ret = Atof(p, &tmp);
+    *out = static_cast(tmp);
+    return ret;
+  }
+};
+
+template
+struct __StringToTHelper {
+  T operator()(const std::string& str) const {
+    T ret = 0;
+    LightGBM::Common::Atoi(str.c_str(), &ret);
+    return ret;
+  }
+};
+
+/*!
+* \warning Beware that ``Common::Atof`` in ``__StringToTHelperFast``,
+* has **less** floating point precision than ``__StringToTHelper``.
+* Both versions are kept to maintain bit-for-bit the "legacy" LightGBM behaviour in terms of precision.
+* Check ``StringToArrayFast`` and ``StringToArray`` for more details on this.
+* \note It is possible that ``fast_double_parser::parse_number`` is faster than ``Common::Atof``.
+* \note ``inf``/``infinity``/``nan`` spellings (any case, optional sign) are recognised explicitly.
+*/
+template
+struct __StringToTHelper {
+  T operator()(const std::string& str) const {
+    double tmp = 0.0;
+
+    // Fast (common) path: For numeric inputs in RFC 7159 format:
+    const bool fast_parse_succeeded = fast_double_parser::parse_number(str.c_str(), &tmp);
+
+    // Rare path: not in RFC 7159 format (e.g. "inf", "nan").
+    // Stream extraction does not portably accept those spellings, so they are
+    // matched explicitly before falling back to a C-locale stream.
+    if (!fast_parse_succeeded) {
+      std::string lowered(str);
+      for (char& ch : lowered) {
+        ch = Common::tolower(ch);
+      }
+      if (lowered == "inf" || lowered == "+inf" || lowered == "infinity") {
+        tmp = std::numeric_limits<double>::infinity();
+      } else if (lowered == "-inf" || lowered == "-infinity") {
+        tmp = -std::numeric_limits<double>::infinity();
+      } else if (lowered == "nan" || lowered == "-nan") {
+        tmp = std::numeric_limits<double>::quiet_NaN();
+      } else {
+        std::stringstream ss;
+        Common::C_stringstream(ss);
+        ss << str;
+        ss >> tmp;
+      }
+    }
+
+    return static_cast(tmp);
+  }
+};
+
+
+/*!
+* \warning Beware that due to internal use of ``Common::Atof`` in ``__StringToTHelperFast``,
+* this method has less precision for floating point numbers than ``StringToArray``,
+* which calls ``__StringToTHelper``.
+* As such, ``StringToArrayFast`` and ``StringToArray`` are not equivalent!
+* Both versions were kept to maintain bit-for-bit the "legacy" LightGBM behaviour in terms of precision.
+*/
+template
+inline static std::vector StringToArrayFast(const std::string& str, int n) {
+  if (n == 0) {
+    return std::vector();
+  }
+  auto p_str = str.c_str();
+  __StringToTHelperFast::value> helper;
+  std::vector ret(n);
+  for (int i = 0; i < n; ++i) {
+    p_str = helper(p_str, &ret[i]);
+  }
+  return ret;
+}
+
+/*!
+* \warning Do not replace calls to this method by ``StringToArrayFast``.
+* This method is more precise for floating point numbers.
+* Check ``StringToArrayFast`` for more details.
+*/
+template
+inline static std::vector StringToArray(const std::string& str, int n) {
+  if (n == 0) {
+    return std::vector();
+  }
+  std::vector strs = LightGBM::Common::Split(str.c_str(), ' ');
+  CHECK_EQ(strs.size(), static_cast(n));
+  std::vector ret;
+  ret.reserve(strs.size());
+  __StringToTHelper::value> helper;
+  for (const auto& s : strs) {
+    ret.push_back(helper(s));
+  }
+  return ret;
+}
+
+/*!
+* \warning Do not replace calls to this method by ``StringToArrayFast``.
+* This method is more precise for floating point numbers.
+* Check ``StringToArrayFast`` for more details.
+*/
+template
+inline static std::vector StringToArray(const std::string& str, char delimiter) {
+  std::vector strs = LightGBM::Common::Split(str.c_str(), delimiter);
+  std::vector ret;
+  ret.reserve(strs.size());
+  __StringToTHelper::value> helper;
+  for (const auto& s : strs) {
+    ret.push_back(helper(s));
+  }
+  return ret;
+}
+
+#if (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))))
+/*!
+* Safely formats a value onto a buffer according to a format string and null-terminates it.
+*
+* \note It checks that the full value was written or forcefully aborts.
+* This safety check serves to prevent incorrect internal API usage.
+* Correct usage will never run into this problem:
+* - The received buffer size shall be sufficient at all times for the input format string and value.
+*/
+template
+inline static void format_to_buf(char* buffer, const size_t buf_len, const char* format, const T value) {
+  auto result = fmt::format_to_n(buffer, buf_len, format, value);
+  if (result.size >= buf_len) {
+    Log::Fatal("Numerical conversion failed. Buffer is too small.");
+  }
+  buffer[result.size] = '\0';
+}
+
+template
+struct __TToStringHelper {
+  void operator()(T value, char* buffer, size_t buf_len) const {
+    format_to_buf(buffer, buf_len, "{}", value);
+  }
+};
+
+template
+struct __TToStringHelper {
+  void operator()(T value, char* buffer, size_t buf_len) const {
+    format_to_buf(buffer, buf_len, "{:g}", value);
+  }
+};
+
+template
+struct __TToStringHelper {
+  void operator()(T value, char* buffer, size_t buf_len) const {
+    format_to_buf(buffer, buf_len, "{:.17g}", value);
+  }
+};
+
+/*!
+* Converts an array to a string with values separated by the space character.
+* This method replaces Common's ``ArrayToString`` and ``ArrayToStringFast`` functionality
+* and is locale-independent.
+*
+* \note If ``high_precision_output`` is set to true,
+* floating point values are output with more digits of precision.
+*/
+template
+inline static std::string ArrayToString(const std::vector& arr, size_t n) {
+  if (arr.empty() || n == 0) {
+    return std::string("");
+  }
+  __TToStringHelper::value, high_precision_output> helper;
+  const size_t buf_len = high_precision_output ? 32 : 16;
+  std::vector buffer(buf_len);
+  std::stringstream str_buf;
+  Common::C_stringstream(str_buf);
+  helper(arr[0], buffer.data(), buf_len);
+  str_buf << buffer.data();
+  for (size_t i = 1; i < std::min(n, arr.size()); ++i) {
+    helper(arr[i], buffer.data(), buf_len);
+    str_buf << ' ' << buffer.data();
+  }
+  return str_buf.str();
+}
+#endif  // (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))))
+
+
+}  // namespace CommonC
+
+
 }  // namespace LightGBM
 
-#endif   // LightGBM_UTILS_COMMON_FUN_H_
+#endif  // LIGHTGBM_UTILS_COMMON_H_
diff --git a/include/LightGBM/utils/common_legacy_solaris.h b/include/LightGBM/utils/common_legacy_solaris.h
new file mode 100644
index 000000000000..97f977108fc6
--- /dev/null
+++ b/include/LightGBM/utils/common_legacy_solaris.h
@@ -0,0 +1,160 @@
+/*!
+ * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+/*!
+ * This file is meant to be used ONLY IN SOLARIS!
+ * The newer code that replaced it is faster and safe regarding locale!
+ */ +#ifndef LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ +#define LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ + +#include + +#include +#include +#include +#include +#include + +namespace LightGBM { + +namespace CommonLegacy { + +inline static unsigned CountDecimalDigit32(uint32_t n) { + if (n < 10) return 1; + else if (n < 100) return 2; + else if (n < 1000) return 3; + else if (n < 10000) return 4; + else if (n < 100000) return 5; + else if (n < 1000000) return 6; + else if (n < 10000000) return 7; + else if (n < 100000000) return 8; + else if (n < 1000000000) return 9; + else + return 10; +} + +inline static void Uint32ToStr(uint32_t value, char* buffer) { + const char kDigitsLut[200] = { + '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', + '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', + '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', + '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', + '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', + '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', + '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', + '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', + '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', + '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', '9', '5', '9', '6', '9', '7', '9', '8', '9', '9' + }; + unsigned digit = CountDecimalDigit32(value); + buffer += digit; + *buffer = '\0'; + + while (value >= 100) { + const unsigned i = (value % 100) << 1; + value /= 100; + *--buffer = kDigitsLut[i + 1]; + *--buffer = kDigitsLut[i]; + } + + if (value < 10) { + *--buffer = 
static_cast(value) + '0'; + } else { + const unsigned i = value << 1; + *--buffer = kDigitsLut[i + 1]; + *--buffer = kDigitsLut[i]; + } +} + +inline static void Int32ToStr(int32_t value, char* buffer) { + uint32_t u = static_cast(value); + if (value < 0) { + *buffer++ = '-'; + u = ~u + 1; + } + Uint32ToStr(u, buffer); +} + +inline static void DoubleToStr(double value, char* buffer, size_t buffer_len) { + int num_chars = snprintf(buffer, buffer_len, "%.17g", value); + CHECK_GE(num_chars, 0); +} + + +template +struct __TToStringHelperFast { + void operator()(T value, char* buffer, size_t) const { + Int32ToStr(value, buffer); + } +}; + +template +struct __TToStringHelperFast { + void operator()(T value, char* buffer, size_t buf_len) const { + int num_chars = snprintf(buffer, buf_len, "%g", value); + CHECK_GE(num_chars, 0); + } +}; + +template +struct __TToStringHelperFast { + void operator()(T value, char* buffer, size_t) const { + Uint32ToStr(value, buffer); + } +}; + +template +inline static std::string _ArrayToStringFast(const std::vector& arr, size_t n) { + if (arr.empty() || n == 0) { + return std::string(""); + } + __TToStringHelperFast::value, std::is_unsigned::value> helper; + const size_t buf_len = 16; + std::vector buffer(buf_len); + std::stringstream str_buf; + helper(arr[0], buffer.data(), buf_len); + str_buf << buffer.data(); + for (size_t i = 1; i < std::min(n, arr.size()); ++i) { + helper(arr[i], buffer.data(), buf_len); + str_buf << ' ' << buffer.data(); + } + return str_buf.str(); +} + +inline static std::string _ArrayToString(const std::vector& arr, size_t n) { + if (arr.empty() || n == 0) { + return std::string(""); + } + const size_t buf_len = 32; + std::vector buffer(buf_len); + std::stringstream str_buf; + DoubleToStr(arr[0], buffer.data(), buf_len); + str_buf << buffer.data(); + for (size_t i = 1; i < std::min(n, arr.size()); ++i) { + DoubleToStr(arr[i], buffer.data(), buf_len); + str_buf << ' ' << buffer.data(); + } + return str_buf.str(); +} + 
+ +template +inline static typename std::enable_if::type +ArrayToString(const std::vector& arr, size_t n) { + return _ArrayToStringFast(arr, n); +} + +template +inline static typename std::enable_if< +(high_precision_output == true) && (std::is_same::value), std::string>::type +ArrayToString(const std::vector& arr, size_t n) { + return _ArrayToString(arr, n); +} + +} // namespace CommonLegacy + +} // namespace LightGBM + +#endif // LIGHTGBM_UTILS_COMMON_LEGACY_SOLARIS_H_ diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in index 8104cdbe93e9..2732a9962e14 100644 --- a/python-package/MANIFEST.in +++ b/python-package/MANIFEST.in @@ -8,6 +8,7 @@ recursive-include compile/compute/ *.txt recursive-include compile/compute/cmake * recursive-include compile/compute/include * recursive-include compile/compute/meta * +recursive-include compile/external_libs * recursive-include compile/include * recursive-include compile/src * recursive-include compile/windows LightGBM.sln LightGBM.vcxproj diff --git a/python-package/setup.py b/python-package/setup.py index 4e3242fe2191..61a69c42c299 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -43,6 +43,7 @@ def copy_files_helper(folder_name): if not os.path.isfile(os.path.join(CURRENT_DIR, '_IS_SOURCE_PACKAGE.txt')): copy_files_helper('include') copy_files_helper('src') + copy_files_helper('external_libs') if not os.path.exists(os.path.join(CURRENT_DIR, "compile", "windows")): os.makedirs(os.path.join(CURRENT_DIR, "compile", "windows")) copy_file(os.path.join(CURRENT_DIR, os.path.pardir, "windows", "LightGBM.sln"), diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 4eeb731f587f..e5cec8b61db0 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -20,6 +20,7 @@ const char* kModelVersion = "v3"; std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_importance_type) const { std::stringstream str_buf; + 
Common::C_stringstream(str_buf); str_buf << "{"; str_buf << "\"name\":\"" << SubModelName() << "\"," << '\n'; @@ -34,16 +35,17 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ str_buf << "\"average_output\":" << (average_output_ ? "true" : "false") << ",\n"; - str_buf << "\"feature_names\":[\"" << Common::Join(feature_names_, "\",\"") + str_buf << "\"feature_names\":[\"" << CommonC::Join(feature_names_, "\",\"") << "\"]," << '\n'; str_buf << "\"monotone_constraints\":[" - << Common::Join(monotone_constraints_, ",") << "]," << '\n'; + << CommonC::Join(monotone_constraints_, ",") << "]," << '\n'; str_buf << "\"feature_infos\":" << "{"; bool first_obj = true; for (size_t i = 0; i < feature_infos_.size(); ++i) { std::stringstream json_str_buf; + Common::C_stringstream(json_str_buf); auto strs = Common::Split(feature_infos_[i].c_str(), ":"); if (strs[0][0] == '[') { strs[0].erase(0, 1); // remove '[' @@ -56,12 +58,12 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ json_str_buf << "\"max_value\":" << Common::AvoidInf(max_) << ","; json_str_buf << "\"values\":[]}"; } else if (strs[0] != "none") { // categorical feature - auto vals = Common::StringToArray(feature_infos_[i], ':'); + auto vals = CommonC::StringToArray(feature_infos_[i], ':'); auto max_idx = ArrayArgs::ArgMax(vals); auto min_idx = ArrayArgs::ArgMin(vals); json_str_buf << "{\"min_value\":" << vals[min_idx] << ","; json_str_buf << "\"max_value\":" << vals[max_idx] << ","; - json_str_buf << "\"values\":[" << Common::Join(vals, ",") << "]}"; + json_str_buf << "\"values\":[" << CommonC::Join(vals, ",") << "]}"; } else { // unused feature continue; } @@ -121,6 +123,7 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ std::string GBDT::ModelToIfElse(int num_iteration) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << "#include \"gbdt.h\"" << '\n'; str_buf << "#include " << '\n'; @@ 
-155,6 +158,7 @@ std::string GBDT::ModelToIfElse(int num_iteration) const { str_buf << " };" << '\n' << '\n'; std::stringstream pred_str_buf; + Common::C_stringstream(pred_str_buf); pred_str_buf << "\t" << "int early_stop_round_counter = 0;" << '\n'; pred_str_buf << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << '\n'; @@ -186,6 +190,7 @@ std::string GBDT::ModelToIfElse(int num_iteration) const { str_buf << " };" << '\n' << '\n'; std::stringstream pred_str_buf_map; + Common::C_stringstream(pred_str_buf_map); pred_str_buf_map << "\t" << "int early_stop_round_counter = 0;" << '\n'; pred_str_buf_map << "\t" << "std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);" << '\n'; @@ -305,6 +310,7 @@ bool GBDT::SaveModelToIfElse(int num_iteration, const char* filename) const { std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int feature_importance_type) const { std::stringstream ss; + Common::C_stringstream(ss); // output model type ss << SubModelName() << '\n'; @@ -325,14 +331,14 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "average_output" << '\n'; } - ss << "feature_names=" << Common::Join(feature_names_, " ") << '\n'; + ss << "feature_names=" << CommonC::Join(feature_names_, " ") << '\n'; if (monotone_constraints_.size() != 0) { - ss << "monotone_constraints=" << Common::Join(monotone_constraints_, " ") + ss << "monotone_constraints=" << CommonC::Join(monotone_constraints_, " ") << '\n'; } - ss << "feature_infos=" << Common::Join(feature_infos_, " ") << '\n'; + ss << "feature_infos=" << CommonC::Join(feature_infos_, " ") << '\n'; int num_used_model = static_cast(models_.size()); int total_iteration = num_used_model / num_tree_per_iteration_; @@ -356,7 +362,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int tree_sizes[idx] = tree_strs[idx].size(); } - ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; + ss << 
"tree_sizes=" << CommonC::Join(tree_sizes, " ") << '\n'; ss << '\n'; for (int i = 0; i < num_used_model - start_model; ++i) { @@ -491,7 +497,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) { // get monotone_constraints if (key_vals.count("monotone_constraints")) { - monotone_constraints_ = Common::StringToArray(key_vals["monotone_constraints"].c_str(), ' '); + monotone_constraints_ = CommonC::StringToArray(key_vals["monotone_constraints"].c_str(), ' '); if (monotone_constraints_.size() != static_cast(max_feature_idx_ + 1)) { Log::Fatal("Wrong size of monotone_constraints"); return false; @@ -533,7 +539,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) { p = Common::SkipNewLine(p); } } else { - std::vector tree_sizes = Common::StringToArray(key_vals["tree_sizes"].c_str(), ' '); + std::vector tree_sizes = CommonC::StringToArray(key_vals["tree_sizes"].c_str(), ' '); std::vector tree_boundries(tree_sizes.size() + 1, 0); int num_trees = static_cast(tree_sizes.size()); for (int i = 0; i < num_trees; ++i) { @@ -564,6 +570,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) { iter_ = 0; bool is_inparameter = false; std::stringstream ss; + Common::C_stringstream(ss); while (p < end) { auto line_len = Common::GetLine(p); if (line_len > 0) { diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 92f016dfe6b0..a4e42831b0e7 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -219,37 +219,45 @@ double Tree::GetLowerBoundValue() const { std::string Tree::ToString() const { std::stringstream str_buf; + Common::C_stringstream(str_buf); + + #if ((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))) + using CommonLegacy::ArrayToString; // Slower & unsafe regarding locale. 
+ #else + using CommonC::ArrayToString; + #endif + str_buf << "num_leaves=" << num_leaves_ << '\n'; str_buf << "num_cat=" << num_cat_ << '\n'; str_buf << "split_feature=" - << Common::ArrayToStringFast(split_feature_, num_leaves_ - 1) << '\n'; + << ArrayToString(split_feature_, num_leaves_ - 1) << '\n'; str_buf << "split_gain=" - << Common::ArrayToStringFast(split_gain_, num_leaves_ - 1) << '\n'; + << ArrayToString(split_gain_, num_leaves_ - 1) << '\n'; str_buf << "threshold=" - << Common::ArrayToString(threshold_, num_leaves_ - 1) << '\n'; + << ArrayToString(threshold_, num_leaves_ - 1) << '\n'; str_buf << "decision_type=" - << Common::ArrayToStringFast(Common::ArrayCast(decision_type_), num_leaves_ - 1) << '\n'; + << ArrayToString(Common::ArrayCast(decision_type_), num_leaves_ - 1) << '\n'; str_buf << "left_child=" - << Common::ArrayToStringFast(left_child_, num_leaves_ - 1) << '\n'; + << ArrayToString(left_child_, num_leaves_ - 1) << '\n'; str_buf << "right_child=" - << Common::ArrayToStringFast(right_child_, num_leaves_ - 1) << '\n'; + << ArrayToString(right_child_, num_leaves_ - 1) << '\n'; str_buf << "leaf_value=" - << Common::ArrayToString(leaf_value_, num_leaves_) << '\n'; + << ArrayToString(leaf_value_, num_leaves_) << '\n'; str_buf << "leaf_weight=" - << Common::ArrayToString(leaf_weight_, num_leaves_) << '\n'; + << ArrayToString(leaf_weight_, num_leaves_) << '\n'; str_buf << "leaf_count=" - << Common::ArrayToStringFast(leaf_count_, num_leaves_) << '\n'; + << ArrayToString(leaf_count_, num_leaves_) << '\n'; str_buf << "internal_value=" - << Common::ArrayToStringFast(internal_value_, num_leaves_ - 1) << '\n'; + << ArrayToString(internal_value_, num_leaves_ - 1) << '\n'; str_buf << "internal_weight=" - << Common::ArrayToStringFast(internal_weight_, num_leaves_ - 1) << '\n'; + << ArrayToString(internal_weight_, num_leaves_ - 1) << '\n'; str_buf << "internal_count=" - << Common::ArrayToStringFast(internal_count_, num_leaves_ - 1) << '\n'; + << 
ArrayToString(internal_count_, num_leaves_ - 1) << '\n'; if (num_cat_ > 0) { str_buf << "cat_boundaries=" - << Common::ArrayToStringFast(cat_boundaries_, num_cat_ + 1) << '\n'; + << ArrayToString(cat_boundaries_, num_cat_ + 1) << '\n'; str_buf << "cat_threshold=" - << Common::ArrayToStringFast(cat_threshold_, cat_threshold_.size()) << '\n'; + << ArrayToString(cat_threshold_, cat_threshold_.size()) << '\n'; } str_buf << "shrinkage=" << shrinkage_ << '\n'; str_buf << '\n'; @@ -258,6 +266,7 @@ std::string Tree::ToString() const { std::string Tree::ToJSON() const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << std::setprecision(std::numeric_limits::digits10 + 2); str_buf << "\"num_leaves\":" << num_leaves_ << "," << '\n'; str_buf << "\"num_cat\":" << num_cat_ << "," << '\n'; @@ -273,6 +282,7 @@ std::string Tree::ToJSON() const { std::string Tree::NodeToJSON(int index) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << std::setprecision(std::numeric_limits::digits10 + 2); if (index >= 0) { // non-leaf @@ -292,7 +302,7 @@ std::string Tree::NodeToJSON(int index) const { } } } - str_buf << "\"threshold\":\"" << Common::Join(cats, "||") << "\"," << '\n'; + str_buf << "\"threshold\":\"" << CommonC::Join(cats, "||") << "\"," << '\n'; str_buf << "\"decision_type\":\"==\"," << '\n'; } else { str_buf << "\"threshold\":" << Common::AvoidInf(threshold_[index]) << "," << '\n'; @@ -333,6 +343,7 @@ std::string Tree::NodeToJSON(int index) const { std::string Tree::NumericalDecisionIfElse(int node) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); uint8_t missing_type = GetMissingType(decision_type_[node]); bool default_left = GetDecisionType(decision_type_[node], kDefaultLeftMask); if (missing_type == MissingType::None @@ -357,6 +368,7 @@ std::string Tree::NumericalDecisionIfElse(int node) const { std::string Tree::CategoricalDecisionIfElse(int node) const { uint8_t missing_type = 
GetMissingType(decision_type_[node]); std::stringstream str_buf; + Common::C_stringstream(str_buf); if (missing_type == MissingType::NaN) { str_buf << "if (std::isnan(fval)) { int_fval = -1; } else { int_fval = static_cast(fval); }"; } else { @@ -372,6 +384,7 @@ std::string Tree::CategoricalDecisionIfElse(int node) const { std::string Tree::ToIfElse(int index, bool predict_leaf_index) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << "double PredictTree" << index; if (predict_leaf_index) { str_buf << "Leaf"; @@ -430,6 +443,7 @@ std::string Tree::ToIfElse(int index, bool predict_leaf_index) const { std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << std::setprecision(std::numeric_limits::digits10 + 2); if (index >= 0) { // non-leaf @@ -461,6 +475,7 @@ std::string Tree::NodeToIfElse(int index, bool predict_leaf_index) const { std::string Tree::NodeToIfElseByMap(int index, bool predict_leaf_index) const { std::stringstream str_buf; + Common::C_stringstream(str_buf); str_buf << std::setprecision(std::numeric_limits::digits10 + 2); if (index >= 0) { // non-leaf @@ -523,13 +538,13 @@ Tree::Tree(const char* str, size_t* used_len) { Common::Atoi(key_vals["num_cat"].c_str(), &num_cat_); if (key_vals.count("leaf_value")) { - leaf_value_ = Common::StringToArray(key_vals["leaf_value"], num_leaves_); + leaf_value_ = CommonC::StringToArray(key_vals["leaf_value"], num_leaves_); } else { Log::Fatal("Tree model string format error, should contain leaf_value field"); } if (key_vals.count("shrinkage")) { - Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_); + CommonC::Atof(key_vals["shrinkage"].c_str(), &shrinkage_); } else { shrinkage_ = 1.0f; } @@ -537,80 +552,80 @@ Tree::Tree(const char* str, size_t* used_len) { if (num_leaves_ <= 1) { return; } if (key_vals.count("left_child")) { - left_child_ = Common::StringToArrayFast(key_vals["left_child"], 
num_leaves_ - 1); + left_child_ = CommonC::StringToArrayFast(key_vals["left_child"], num_leaves_ - 1); } else { Log::Fatal("Tree model string format error, should contain left_child field"); } if (key_vals.count("right_child")) { - right_child_ = Common::StringToArrayFast(key_vals["right_child"], num_leaves_ - 1); + right_child_ = CommonC::StringToArrayFast(key_vals["right_child"], num_leaves_ - 1); } else { Log::Fatal("Tree model string format error, should contain right_child field"); } if (key_vals.count("split_feature")) { - split_feature_ = Common::StringToArrayFast(key_vals["split_feature"], num_leaves_ - 1); + split_feature_ = CommonC::StringToArrayFast(key_vals["split_feature"], num_leaves_ - 1); } else { Log::Fatal("Tree model string format error, should contain split_feature field"); } if (key_vals.count("threshold")) { - threshold_ = Common::StringToArray(key_vals["threshold"], num_leaves_ - 1); + threshold_ = CommonC::StringToArray(key_vals["threshold"], num_leaves_ - 1); } else { Log::Fatal("Tree model string format error, should contain threshold field"); } if (key_vals.count("split_gain")) { - split_gain_ = Common::StringToArrayFast(key_vals["split_gain"], num_leaves_ - 1); + split_gain_ = CommonC::StringToArrayFast(key_vals["split_gain"], num_leaves_ - 1); } else { split_gain_.resize(num_leaves_ - 1); } if (key_vals.count("internal_count")) { - internal_count_ = Common::StringToArrayFast(key_vals["internal_count"], num_leaves_ - 1); + internal_count_ = CommonC::StringToArrayFast(key_vals["internal_count"], num_leaves_ - 1); } else { internal_count_.resize(num_leaves_ - 1); } if (key_vals.count("internal_value")) { - internal_value_ = Common::StringToArrayFast(key_vals["internal_value"], num_leaves_ - 1); + internal_value_ = CommonC::StringToArrayFast(key_vals["internal_value"], num_leaves_ - 1); } else { internal_value_.resize(num_leaves_ - 1); } if (key_vals.count("internal_weight")) { - internal_weight_ = 
Common::StringToArrayFast(key_vals["internal_weight"], num_leaves_ - 1); + internal_weight_ = CommonC::StringToArrayFast(key_vals["internal_weight"], num_leaves_ - 1); } else { internal_weight_.resize(num_leaves_ - 1); } if (key_vals.count("leaf_weight")) { - leaf_weight_ = Common::StringToArray(key_vals["leaf_weight"], num_leaves_); + leaf_weight_ = CommonC::StringToArray(key_vals["leaf_weight"], num_leaves_); } else { leaf_weight_.resize(num_leaves_); } if (key_vals.count("leaf_count")) { - leaf_count_ = Common::StringToArrayFast(key_vals["leaf_count"], num_leaves_); + leaf_count_ = CommonC::StringToArrayFast(key_vals["leaf_count"], num_leaves_); } else { leaf_count_.resize(num_leaves_); } if (key_vals.count("decision_type")) { - decision_type_ = Common::StringToArrayFast(key_vals["decision_type"], num_leaves_ - 1); + decision_type_ = CommonC::StringToArrayFast(key_vals["decision_type"], num_leaves_ - 1); } else { decision_type_ = std::vector(num_leaves_ - 1, 0); } if (num_cat_ > 0) { if (key_vals.count("cat_boundaries")) { - cat_boundaries_ = Common::StringToArrayFast(key_vals["cat_boundaries"], num_cat_ + 1); + cat_boundaries_ = CommonC::StringToArrayFast(key_vals["cat_boundaries"], num_cat_ + 1); } else { Log::Fatal("Tree model should contain cat_boundaries field."); } if (key_vals.count("cat_threshold")) { - cat_threshold_ = Common::StringToArrayFast(key_vals["cat_threshold"], cat_boundaries_.back()); + cat_threshold_ = CommonC::StringToArrayFast(key_vals["cat_threshold"], cat_boundaries_.back()); } else { Log::Fatal("Tree model should contain cat_threshold field"); }