From 01b7d3ec8dbfbbb64fab6d347347d5c8011ac458 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 28 Nov 2023 20:20:34 +0100 Subject: [PATCH 1/7] use array_interface for dense dmatrix construction --- R-package/src/xgboost_R.cc | 123 ++++++++++++++++-------- R-package/tests/testthat/test_dmatrix.R | 32 ++++++ 2 files changed, 117 insertions(+), 38 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 2938d4b6e92d..a1b8cc2e8e49 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -8,12 +8,16 @@ #include #include +#include #include #include #include #include #include #include +#ifdef __cpp_lib_endian +# include +#endif #include "../../src/c_api/c_api_error.h" #include "../../src/c_api/c_api_utils.h" // MakeSparseFromPtr @@ -21,6 +25,80 @@ #include "./xgboost_R.h" // Must follow other includes. +static bool get_is_little_endian() +{ +#ifdef __cpp_lib_endian + return std::endian::native == std::endian::little; +#else + const int one = 1; + return *((unsigned char*)&one) != 0; +#endif +} + +const bool is_little_endian = get_is_little_endian(); + +#define ARR_MAX_JSON_LENGTH 256 + +const void* get_R_pointer(SEXP &R_arr) +{ + const SEXPTYPE arr_type = TYPEOF(R_arr); + switch (arr_type) { + case REALSXP: return static_cast(REAL(R_arr)); + case INTSXP: return static_cast(INTEGER(R_arr)); + case LGLSXP: return static_cast(LOGICAL(R_arr)); + default: throw std::runtime_error("Error: array or matrix has unsupported type."); + } +} + +static void make_numpy_array_interface_from_R_mat(char *out_json, SEXP &R_mat) +{ + SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); + const int *ptr_mat_dims = INTEGER(mat_dims); + const bool is_double = TYPEOF(R_mat) == REALSXP; + const void *ptr_mat_data = get_R_pointer(R_mat); + std::uintptr_t ptr_as_number = reinterpret_cast(ptr_mat_data); + const auto size_of_type = is_double? sizeof(double) : sizeof(int); + const std::uint64_t stride = static_cast(ptr_mat_dims[0]) * size_of_type; + std::snprintf( + out_json, + ARR_MAX_JSON_LENGTH, + "{\"data\": [%" PRIuPTR ", true], \"shape\": [%d,%d], \"strides\": [%d,%" PRIu64 "], \"typestr\": \"%s%s%d\", \"version\":3}", + ptr_as_number, ptr_mat_dims[0], ptr_mat_dims[1], + static_cast(size_of_type), stride, + is_little_endian? "<" : ">", is_double? "f" : "i", static_cast(size_of_type) + ); +} + +static void make_json_config_for_array(char *out_json, SEXP missing, SEXP n_threads, SEXPTYPE arr_type) +{ + char missing_str[128]; + const SEXPTYPE missing_type = TYPEOF(missing); + if (Rf_isNull(missing) || + (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || + (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || + (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { + if (arr_type == REALSXP) { + std::strncpy(missing_str, "NaN", 128); + } else { + std::snprintf(missing_str, 128, "%d", R_NaInt); + } + } else { + const double missing_as_double = Rf_asReal(missing); + if (std::isinf(missing_as_double)) { + std::snprintf(missing_str, 128, "%sInfinity", (missing_as_double < 0)? "-" : ""); + } else { + std::snprintf(missing_str, 128, "%f", Rf_asReal(missing)); + } + } + + std::snprintf( + out_json, + ARR_MAX_JSON_LENGTH, + "{\"missing\": %s, \"nthread\": %d}", + missing_str, Rf_asInteger(n_threads) + ); +} + /*! * \brief macro to annotate begin of api */ @@ -94,47 +172,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { } XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { - SEXP ret; + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - SEXP dim = getAttrib(mat, R_DimSymbol); - size_t nrow = static_cast(INTEGER(dim)[0]); - size_t ncol = static_cast(INTEGER(dim)[1]); - const bool is_int = TYPEOF(mat) == INTSXP; - double *din; - int *iin; - if (is_int) { - iin = INTEGER(mat); - } else { - din = REAL(mat); - } - std::vector data(nrow * ncol); - xgboost::Context ctx; - ctx.nthread = asInteger(n_threads); - std::int32_t threads = ctx.Threads(); - - if (is_int) { - xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { - for (size_t j = 0; j < ncol; ++j) { - auto v = iin[i + nrow * j]; - if (v == NA_INTEGER) { - data[i * ncol + j] = std::numeric_limits::quiet_NaN(); - } else { - data[i * ncol + j] = static_cast(v); - } - } - }); - } else { - xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { - for (size_t j = 0; j < ncol; ++j) { - data[i * ncol + j] = din[i + nrow * j]; - } - }); - } + char json_mat_info[ARR_MAX_JSON_LENGTH]; + make_numpy_array_interface_from_R_mat(json_mat_info, mat); + char json_config_info[ARR_MAX_JSON_LENGTH]; + make_json_config_for_array(json_config_info, missing, n_threads, TYPEOF(mat)); DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol, - asReal(missing), &handle, threads)); - ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + CHECK_CALL(XGDMatrixCreateFromDense(json_mat_info, json_config_info, &handle)); + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 461b7d158a9a..0f2414ef0b87 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -265,3 +265,35 @@ test_that("xgb.DMatrix: print", { }) expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no") }) + +test_that("xgb.DMatrix: Inf as missing", { + x_inf <- matrix(as.numeric(1:10), nrow=5) + x_inf[2, 1] <- Inf + + x_nan <- x_inf + x_nan[2, 1] <- NA_real_ + + m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf) + xgb.DMatrix.save(m_inf, "inf.dmatrix") + + m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_) + xgb.DMatrix.save(m_nan, "nan.dmatrix") + + infconn <- file("inf.dmatrix", "rb") + nanconn <- file("nan.dmatrix", "rb") + + expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix")) + + bytes <- file.size("inf.dmatrix") + infdmatrix <- readBin(infconn, "raw", n = bytes) + nandmatrix <- readBin(nanconn, "raw", n = bytes) + + expect_equal(length(infdmatrix), length(nandmatrix)) + expect_equal(infdmatrix, nandmatrix) + + close(infconn) + close(nanconn) + + file.remove("inf.dmatrix") + file.remove("nan.dmatrix") +}) From fc6709ab33ce9ae10537dac6d47ec8c5b9587f5e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 28 Nov 2023 20:41:48 +0100 Subject: [PATCH 2/7] make linters happy --- R-package/src/xgboost_R.cc | 30 ++++++++++++------------- R-package/tests/testthat/test_dmatrix.R | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index a1b8cc2e8e49..a2d2c2648fa1 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -25,8 +25,7 @@ #include "./xgboost_R.h" // Must follow other includes. -static bool get_is_little_endian() -{ +static bool get_is_little_endian() { #ifdef __cpp_lib_endian return std::endian::native == std::endian::little; #else @@ -39,8 +38,7 @@ const bool is_little_endian = get_is_little_endian(); #define ARR_MAX_JSON_LENGTH 256 -const void* get_R_pointer(SEXP &R_arr) -{ +const void* get_R_pointer(SEXP R_arr) { const SEXPTYPE arr_type = TYPEOF(R_arr); switch (arr_type) { case REALSXP: return static_cast(REAL(R_arr)); @@ -50,8 +48,7 @@ const void* get_R_pointer(SEXP &R_arr) } } -static void make_numpy_array_interface_from_R_mat(char *out_json, SEXP &R_mat) -{ +static void make_numpy_array_interface_from_R_mat(char *out_json, SEXP R_mat) { SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); const int *ptr_mat_dims = INTEGER(mat_dims); const bool is_double = TYPEOF(R_mat) == REALSXP; @@ -62,15 +59,16 @@ static void make_numpy_array_interface_from_R_mat(char *out_json, SEXP &R_mat) std::snprintf( out_json, ARR_MAX_JSON_LENGTH, - "{\"data\": [%" PRIuPTR ", true], \"shape\": [%d,%d], \"strides\": [%d,%" PRIu64 "], \"typestr\": \"%s%s%d\", \"version\":3}", + "{\"data\": [%" PRIuPTR ", true], \"shape\": [%d,%d], \"strides\": [%d,%" PRIu64 + "], \"typestr\": \"%s%s%d\", \"version\":3}", ptr_as_number, ptr_mat_dims[0], ptr_mat_dims[1], static_cast(size_of_type), stride, - is_little_endian? "<" : ">", is_double? "f" : "i", static_cast(size_of_type) - ); + is_little_endian? "<" : ">", is_double? "f" : "i", static_cast(size_of_type)); } -static void make_json_config_for_array(char *out_json, SEXP missing, SEXP n_threads, SEXPTYPE arr_type) -{ +static void make_json_config_for_array( + char *out_json, SEXP missing, SEXP n_threads, SEXPTYPE arr_type +) { char missing_str[128]; const SEXPTYPE missing_type = TYPEOF(missing); if (Rf_isNull(missing) || @@ -80,14 +78,15 @@ static void make_json_config_for_array(char *out_json, SEXP missing, SEXP n_thre if (arr_type == REALSXP) { std::strncpy(missing_str, "NaN", 128); } else { - std::snprintf(missing_str, 128, "%d", R_NaInt); + std::snprintf(missing_str, sizeof(missing_str), "%d", R_NaInt); } } else { const double missing_as_double = Rf_asReal(missing); if (std::isinf(missing_as_double)) { - std::snprintf(missing_str, 128, "%sInfinity", (missing_as_double < 0)? "-" : ""); + std::snprintf( + missing_str, sizeof(missing_str), "%sInfinity", (missing_as_double < 0)? "-" : ""); } else { - std::snprintf(missing_str, 128, "%f", Rf_asReal(missing)); + std::snprintf(missing_str, sizeof(missing_str), "%f", Rf_asReal(missing)); } } @@ -95,8 +94,7 @@ static void make_json_config_for_array(char *out_json, SEXP missing, SEXP n_thre out_json, ARR_MAX_JSON_LENGTH, "{\"missing\": %s, \"nthread\": %d}", - missing_str, Rf_asInteger(n_threads) - ); + missing_str, Rf_asInteger(n_threads)); } /*! diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 0f2414ef0b87..a0cf90088704 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -267,7 +267,7 @@ test_that("xgb.DMatrix: print", { }) test_that("xgb.DMatrix: Inf as missing", { - x_inf <- matrix(as.numeric(1:10), nrow=5) + x_inf <- matrix(as.numeric(1:10), nrow = 5) x_inf[2, 1] <- Inf x_nan <- x_inf From 1221d6d444263ad9788d6c52a1fce13894987b63 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 29 Nov 2023 16:54:21 +0800 Subject: [PATCH 3/7] Work on array interface. --- R-package/src/xgboost_R.cc | 113 ++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index a2d2c2648fa1..2b0318320fdd 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -25,77 +25,72 @@ #include "./xgboost_R.h" // Must follow other includes. -static bool get_is_little_endian() { -#ifdef __cpp_lib_endian - return std::endian::native == std::endian::little; -#else - const int one = 1; - return *((unsigned char*)&one) != 0; -#endif -} - -const bool is_little_endian = get_is_little_endian(); - -#define ARR_MAX_JSON_LENGTH 256 +namespace { +[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) { + SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); + const int *ptr_mat_dims = INTEGER(mat_dims); -const void* get_R_pointer(SEXP R_arr) { - const SEXPTYPE arr_type = TYPEOF(R_arr); + // Lambda for type dispatch. + auto make_matrix = [=](auto const *ptr) { + using namespace xgboost; // NOLINT + using T = std::remove_pointer_t; + + auto m = linalg::MatrixView{ + common::Span{ptr, static_cast(ptr_mat_dims[0] * ptr_mat_dims[1])}, + {ptr_mat_dims[0], ptr_mat_dims[1]}, // Shape + DeviceOrd::CPU(), + linalg::Order::kF // R uses column-major + }; + CHECK(m.FContiguous()); + auto array_str = linalg::ArrayInterfaceStr(m); + return array_str; + }; + + const SEXPTYPE arr_type = TYPEOF(R_mat); switch (arr_type) { - case REALSXP: return static_cast(REAL(R_arr)); - case INTSXP: return static_cast(INTEGER(R_arr)); - case LGLSXP: return static_cast(LOGICAL(R_arr)); - default: throw std::runtime_error("Error: array or matrix has unsupported type."); - } -} + case REALSXP: + return make_matrix(REAL(R_mat)); + case INTSXP: + return make_matrix(INTEGER(R_mat)); + case LGLSXP: + return make_matrix(LOGICAL(R_mat)); + default: + LOG(FATAL) << "Array or matrix has unsupported type."; + }; + + LOG(FATAL) << "Not reachable"; + return ""; +} + +[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { + using namespace ::xgboost; // NOLINT + Json jconfig{Object{}}; -static void make_numpy_array_interface_from_R_mat(char *out_json, SEXP R_mat) { - SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol); - const int *ptr_mat_dims = INTEGER(mat_dims); - const bool is_double = TYPEOF(R_mat) == REALSXP; - const void *ptr_mat_data = get_R_pointer(R_mat); - std::uintptr_t ptr_as_number = reinterpret_cast(ptr_mat_data); - const auto size_of_type = is_double? sizeof(double) : sizeof(int); - const std::uint64_t stride = static_cast(ptr_mat_dims[0]) * size_of_type; - std::snprintf( - out_json, - ARR_MAX_JSON_LENGTH, - "{\"data\": [%" PRIuPTR ", true], \"shape\": [%d,%d], \"strides\": [%d,%" PRIu64 - "], \"typestr\": \"%s%s%d\", \"version\":3}", - ptr_as_number, ptr_mat_dims[0], ptr_mat_dims[1], - static_cast(size_of_type), stride, - is_little_endian? "<" : ">", is_double? "f" : "i", static_cast(size_of_type)); -} - -static void make_json_config_for_array( - char *out_json, SEXP missing, SEXP n_threads, SEXPTYPE arr_type -) { - char missing_str[128]; const SEXPTYPE missing_type = TYPEOF(missing); - if (Rf_isNull(missing) || - (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || + if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { + // missing is not specified if (arr_type == REALSXP) { - std::strncpy(missing_str, "NaN", 128); + jconfig["missing"] = std::numeric_limits::quiet_NaN(); } else { - std::snprintf(missing_str, sizeof(missing_str), "%d", R_NaInt); + jconfig["missing"] = R_NaInt; } } else { const double missing_as_double = Rf_asReal(missing); + // missing specified if (std::isinf(missing_as_double)) { - std::snprintf( - missing_str, sizeof(missing_str), "%sInfinity", (missing_as_double < 0)? "-" : ""); + jconfig["missing"] = (missing_as_double < 0) ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); } else { - std::snprintf(missing_str, sizeof(missing_str), "%f", Rf_asReal(missing)); + jconfig["missing"] = Rf_asReal(missing); } } - std::snprintf( - out_json, - ARR_MAX_JSON_LENGTH, - "{\"missing\": %s, \"nthread\": %d}", - missing_str, Rf_asInteger(n_threads)); + jconfig["nthread"] = Rf_asInteger(n_threads); + return Json::Dump(jconfig); } +} // namespace /*! * \brief macro to annotate begin of api @@ -172,13 +167,13 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); - char json_mat_info[ARR_MAX_JSON_LENGTH]; - make_numpy_array_interface_from_R_mat(json_mat_info, mat); - char json_config_info[ARR_MAX_JSON_LENGTH]; - make_json_config_for_array(json_config_info, missing, n_threads, TYPEOF(mat)); + + auto array_str = MakeArrayInterfaceFromRMat(mat); + auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat)); DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromDense(json_mat_info, json_config_info, &handle)); + CHECK_CALL(XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle)); + R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); From 04200153d9cdb4656f0b8a427c6e74450ebb21a5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 29 Nov 2023 17:07:26 +0800 Subject: [PATCH 4/7] cleanup. --- R-package/src/xgboost_R.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 2b0318320fdd..747f0f7f6f71 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -15,9 +15,6 @@ #include #include #include -#ifdef __cpp_lib_endian -# include -#endif #include "../../src/c_api/c_api_error.h" #include "../../src/c_api/c_api_utils.h" // MakeSparseFromPtr @@ -42,8 +39,7 @@ namespace { linalg::Order::kF // R uses column-major }; CHECK(m.FContiguous()); - auto array_str = linalg::ArrayInterfaceStr(m); - return array_str; + return linalg::ArrayInterfaceStr(m); }; const SEXPTYPE arr_type = TYPEOF(R_mat); From 3158e17586da0e0c9f58b7ba8d71f57dce6909e6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 29 Nov 2023 17:08:21 +0800 Subject: [PATCH 5/7] lint. --- R-package/src/xgboost_R.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 747f0f7f6f71..a8e1b9b2a495 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -52,7 +52,7 @@ namespace { return make_matrix(LOGICAL(R_mat)); default: LOG(FATAL) << "Array or matrix has unsupported type."; - }; + } LOG(FATAL) << "Not reachable"; return ""; From 54d8d59772e09770b9aeb7653a64022511f4fc95 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 29 Nov 2023 19:59:39 +0100 Subject: [PATCH 6/7] avoid overflow in multiplication --- R-package/src/xgboost_R.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index a8e1b9b2a495..9fd9957c06b5 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -33,7 +33,8 @@ namespace { using T = std::remove_pointer_t; auto m = linalg::MatrixView{ - common::Span{ptr, static_cast(ptr_mat_dims[0] * ptr_mat_dims[1])}, + common::Span{ptr, + static_cast(ptr_mat_dims[0]) * static_cast(ptr_mat_dims[1])}, {ptr_mat_dims[0], ptr_mat_dims[1]}, // Shape DeviceOrd::CPU(), linalg::Order::kF // R uses column-major From 71be145b4c89503a614ac970702509515cb07c6a Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 29 Nov 2023 20:00:35 +0100 Subject: [PATCH 7/7] simplify jsonification --- R-package/src/xgboost_R.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 9fd9957c06b5..a7a05bf9e784 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -74,14 +74,8 @@ namespace { jconfig["missing"] = R_NaInt; } } else { - const double missing_as_double = Rf_asReal(missing); // missing specified - if (std::isinf(missing_as_double)) { - jconfig["missing"] = (missing_as_double < 0) ? -std::numeric_limits::infinity() - : std::numeric_limits::infinity(); - } else { - jconfig["missing"] = Rf_asReal(missing); - } + jconfig["missing"] = Rf_asReal(missing); } jconfig["nthread"] = Rf_asInteger(n_threads);