From f7005d32c182e1c8751b2549cde9d16f327486a1 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 23 Feb 2024 19:03:54 +0100 Subject: [PATCH 1/5] [R] Use inplace predict (#9829) --------- Co-authored-by: Hyunsu Cho --- R-package/R/xgb.Booster.R | 137 +++++++++++++++--- R-package/man/predict.xgb.Booster.Rd | 45 ++++-- R-package/src/init.c | 6 + R-package/src/xgboost_R.cc | 184 +++++++++++++++++++++--- R-package/src/xgboost_R.h | 44 ++++++ R-package/tests/testthat/test_basic.R | 49 ++++++- R-package/tests/testthat/test_dmatrix.R | 31 ++++ 7 files changed, 450 insertions(+), 46 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index febefb757129..8a5d66198834 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -77,26 +77,45 @@ xgb.get.handle <- function(object) { #' Predict method for XGBoost model #' -#' Predicted values based on either xgboost model or model handle object. +#' Predict values on data based on xgboost model. #' #' @param object Object of class `xgb.Booster`. -#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, +#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, #' local data file, or `xgb.DMatrix`. -#' For single-row predictions on sparse data, it is recommended to use the CSR format. -#' If passing a sparse vector, it will take it as a row vector. -#' @param missing Only used when input is a dense matrix. Pick a float value that represents -#' missing values in data (e.g., 0 or some other extreme value). +#' +#' For single-row predictions on sparse data, it's recommended to use CSR format. If passing +#' a sparse vector, it will take it as a row vector. +#' +#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to +#' pass here instead of passing R types like matrices or data frames, as predictions will be +#' faster on DMatrix. +#' +#' If `newdata` is a `data.frame`, be aware that:\itemize{ +#' \item Columns will be converted to numeric if they aren't already, which could potentially make +#' the operation slower than in an equivalent `matrix` object. +#' \item The order of the columns must match with that of the data from which the model was fitted +#' (i.e. columns will not be referenced by their names, just by their order in the data). +#' \item If the model was fitted to data with categorical columns, these columns must be of +#' `factor` type here, and must use the same encoding (i.e. have the same levels). +#' \item If `newdata` contains any `factor` columns, they will be converted to base-0 +#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor` +#' under a column which during training had a different type. +#' } +#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value). +#' +#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass +#' this as an argument to the DMatrix constructor instead. #' @param outputmargin Whether the prediction should be returned in the form of original untransformed #' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for #' logistic regression would return log-odds instead of probabilities. -#' @param predleaf Whether to predict pre-tree leaf indices. +#' @param predleaf Whether to predict per-tree leaf indices. #' @param predcontrib Whether to return feature contributions to individual predictions (see Details). #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details). #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details). #' @param reshape Whether to reshape the vector of predictions to matrix form when there are several #' prediction outputs per case. No effect if `predleaf`, `predcontrib`, #' or `predinteraction` is `TRUE`. -#' @param training Whether the predictions are used for training. For dart booster, +#' @param training Whether the prediction result is used for training. For dart booster, #' training predicting will perform dropout. #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing #' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. @@ -111,6 +130,12 @@ xgb.get.handle <- function(object) { #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. #' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output #' type and shape of predictions are invariant to the model type. +#' @param base_margin Base margin used for boosting from existing model. +#' +#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will +#' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as +#' an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +#' #' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names #' match (only applicable when both `object` and `newdata` have feature names). #' @@ -287,16 +312,80 @@ xgb.get.handle <- function(object) { predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, - validate_features = FALSE, ...) { + validate_features = FALSE, base_margin = NULL, ...) { if (validate_features) { newdata <- validate.features(object, newdata) } - if (!inherits(newdata, "xgb.DMatrix")) { + is_dmatrix <- inherits(newdata, "xgb.DMatrix") + if (is_dmatrix && !is.null(base_margin)) { + stop( + "'base_margin' is not supported when passing 'xgb.DMatrix' as input.", + " Should be passed as argument to 'xgb.DMatrix' constructor." + ) + } + + use_as_df <- FALSE + use_as_dense_matrix <- FALSE + use_as_csr_matrix <- FALSE + n_row <- NULL + if (!is_dmatrix) { + + inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf + if (inplace_predict_supported) { + booster_type <- xgb.booster_type(object) + if (booster_type == "gblinear" || (booster_type == "dart" && training)) { + inplace_predict_supported <- FALSE + } + } + if (inplace_predict_supported) { + + if (is.matrix(newdata)) { + use_as_dense_matrix <- TRUE + } else if (is.data.frame(newdata)) { + # note: since here it turns it into a non-data-frame list, + # needs to keep track of the number of rows it had for later + n_row <- nrow(newdata) + newdata <- lapply( + newdata, + function(x) { + if (is.factor(x)) { + return(as.numeric(x) - 1) + } else { + return(as.numeric(x)) + } + } + ) + use_as_df <- TRUE + } else if (inherits(newdata, "dgRMatrix")) { + use_as_csr_matrix <- TRUE + csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata)) + } else if (inherits(newdata, "dsparseVector")) { + use_as_csr_matrix <- TRUE + n_row <- 1L + i <- newdata@i - 1L + if (storage.mode(i) != "integer") { + storage.mode(i) <- "integer" + } + csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata)) + } + + } + + } # if (!is_dmatrix) + + if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) { nthread <- xgb.nthread(object) newdata <- xgb.DMatrix( newdata, - missing = missing, nthread = NVL(nthread, -1) + missing = missing, + base_margin = base_margin, + nthread = NVL(nthread, -1) ) + is_dmatrix <- TRUE + } + + if (is.null(n_row)) { + n_row <- nrow(newdata) } @@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA args$type <- set_type(6) } - predts <- .Call( - XGBoosterPredictFromDMatrix_R, - xgb.get.handle(object), - newdata, - jsonlite::toJSON(args, auto_unbox = TRUE) - ) + json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE) + if (is_dmatrix) { + predts <- .Call( + XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf + ) + } else if (use_as_dense_matrix) { + predts <- .Call( + XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } else if (use_as_csr_matrix) { + predts <- .Call( + XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin + ) + } else if (use_as_df) { + predts <- .Call( + XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } + names(predts) <- c("shape", "results") shape <- predts$shape arr <- predts$results n_ret <- length(arr) - n_row <- nrow(newdata) if (n_row != shape[1]) { stop("Incorrect predict shape.") } diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 95e7a51fd043..88a2f203efcd 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -18,25 +18,47 @@ iterationrange = NULL, strict_shape = FALSE, validate_features = FALSE, + base_margin = NULL, ... ) } \arguments{ \item{object}{Object of class \code{xgb.Booster}.} -\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, +\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, local data file, or \code{xgb.DMatrix}. -For single-row predictions on sparse data, it is recommended to use the CSR format. -If passing a sparse vector, it will take it as a row vector.} -\item{missing}{Only used when input is a dense matrix. Pick a float value that represents -missing values in data (e.g., 0 or some other extreme value).} +\if{html}{\out{
}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing + a sparse vector, it will take it as a row vector. + + Note that, for repeated predictions on the same data, one might want to create a DMatrix to + pass here instead of passing R types like matrices or data frames, as predictions will be + faster on DMatrix. + + If `newdata` is a `data.frame`, be aware that:\\itemize\{ + \\item Columns will be converted to numeric if they aren't already, which could potentially make + the operation slower than in an equivalent `matrix` object. + \\item The order of the columns must match with that of the data from which the model was fitted + (i.e. columns will not be referenced by their names, just by their order in the data). + \\item If the model was fitted to data with categorical columns, these columns must be of + `factor` type here, and must use the same encoding (i.e. have the same levels). + \\item If `newdata` contains any `factor` columns, they will be converted to base-0 + encoding (same as during DMatrix creation) - hence, one should not pass a `factor` + under a column which during training had a different type. + \} +}\if{html}{\out{
}}} + +\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value). + +\if{html}{\out{
}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass + this as an argument to the DMatrix constructor instead. +}\if{html}{\out{
}}} \item{outputmargin}{Whether the prediction should be returned in the form of original untransformed sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for logistic regression would return log-odds instead of probabilities.} -\item{predleaf}{Whether to predict pre-tree leaf indices.} +\item{predleaf}{Whether to predict per-tree leaf indices.} \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).} @@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.} prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib}, or \code{predinteraction} is \code{TRUE}.} -\item{training}{Whether the predictions are used for training. For dart booster, +\item{training}{Whether the prediction result is used for training. For dart booster, training predicting will perform dropout.} \item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing @@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n recommended to disable it for performance-sensitive applications. }\if{html}{\out{}}} +\item{base_margin}{Base margin used for boosting from existing model. + +\if{html}{\out{
}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument will + be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as + an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +}\if{html}{\out{
}}} + \item{...}{Not used.} } \value{ @@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array: } } \description{ -Predicted values based on either xgboost model or model handle object. +Predict values on data based on xgboost model. } \details{ Note that \code{iterationrange} would currently do nothing for predictions from "gblinear", diff --git a/R-package/src/init.c b/R-package/src/init.c index 36f3e8953639..f2635742ebd7 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value); extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle); extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw); extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterSaveModel_R(SEXP, SEXP); extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); @@ -96,6 +99,9 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterSerializeToBuffer_R", (DL_FUNC) &XGBoosterSerializeToBuffer_R, 1}, {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2}, {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3}, + {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5}, + {"XGBoosterPredictFromCSR_R", (DL_FUNC) &XGBoosterPredictFromCSR_R, 5}, + {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5}, {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2}, {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3}, {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 4192f82fbaa6..5baf8d41282e 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) { return xgboost::Json::Dump(jinterface); } -[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { - using namespace ::xgboost; // NOLINT - Json jconfig{Object{}}; - - const SEXPTYPE missing_type = TYPEOF(missing); - if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || - (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || - (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { +void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) { + if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) { // missing is not specified if (arr_type == REALSXP) { - jconfig["missing"] = std::numeric_limits::quiet_NaN(); + (*jconfig)["missing"] = std::numeric_limits::quiet_NaN(); } else { - jconfig["missing"] = R_NaInt; + (*jconfig)["missing"] = R_NaInt; } } else { // missing specified - jconfig["missing"] = Rf_asReal(missing); + (*jconfig)["missing"] = Rf_asReal(missing); } +} +[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { + using namespace ::xgboost; // NOLINT + Json jconfig{Object{}}; + AddMissingToJson(&jconfig, missing, arr_type); jconfig["nthread"] = Rf_asInteger(n_threads); return Json::Dump(jconfig); } @@ -411,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) { DMatrixHandle handle; std::int32_t rc{0}; { - std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); + const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); xgboost::Json jconfig{xgboost::Object{}}; jconfig["missing"] = asReal(missing); jconfig["nthread"] = asInteger(n_threads); @@ -463,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, @@ -498,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, @@ -1247,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn return mkString(ret); } -XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { +namespace { + +struct ProxyDmatrixError : public std::exception {}; + +struct ProxyDmatrixWrapper { + DMatrixHandle proxy_dmat_handle; + + ProxyDmatrixWrapper() { + int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + } + + ~ProxyDmatrixWrapper() { + if (this->proxy_dmat_handle) { + XGDMatrixFree(this->proxy_dmat_handle); + this->proxy_dmat_handle = nullptr; + } + } + + DMatrixHandle get_handle() { + return this->proxy_dmat_handle; + } +}; + +std::unique_ptr GetProxyDMatrixWithBaseMargin(SEXP base_margin) { + if (Rf_isNull(base_margin)) { + return std::unique_ptr(nullptr); + } + + SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol); + int res_code; + try { + const std::string array_str = Rf_isNull(base_margin_dim)? + MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin); + std::unique_ptr proxy_dmat(new ProxyDmatrixWrapper()); + res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(), + "base_margin", + array_str.c_str()); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + return proxy_dmat; + } catch(ProxyDmatrixError &err) { + Rf_error("%s", XGBGetLastError()); + } +} + +enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame}; + +SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config, + PredictionInputType input_type, SEXP missing, + SEXP base_margin) { SEXP r_out_shape; SEXP r_out_result; SEXP r_out = PROTECT(allocVector(VECSXP, 2)); @@ -1259,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con bst_ulong out_dim; bst_ulong const *out_shape; float const *out_result; - CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dmat), c_json_config, - &out_shape, &out_dim, &out_result)); + + int res_code; + { + switch (input_type) { + case PredictionInputType::DMatrix: { + res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(input_data), c_json_config, + &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::CSRMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + SEXP indptr = VECTOR_ELT(input_data, 0); + SEXP indices = VECTOR_ELT(input_data, 1); + SEXP data = VECTOR_ELT(input_data, 2); + const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3)); + const SEXPTYPE type_data = TYPEOF(data); + CHECK_EQ(type_data, REALSXP); + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, type_data); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromCSR( + R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(), + ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DenseMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + const std::string array_str = MakeArrayInterfaceFromRMat(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, TYPEOF(input_data)); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromDense( + R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DataFrame: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, REALSXP); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromColumnar( + R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + } + } + CHECK_CALL(res_code); r_out_shape = PROTECT(allocVector(INTSXP, out_dim)); size_t len = 1; @@ -1282,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con return r_out; } +} // namespace + +XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { + return XGBoosterPredictGeneric(handle, dmat, json_config, + PredictionInputType::DMatrix, R_NilValue, R_NilValue); +} + +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_mat, json_config, + PredictionInputType::DenseMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, lst, json_config, + PredictionInputType::CSRMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_df, json_config, + PredictionInputType::DataFrame, missing, base_margin); +} + XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) { R_API_BEGIN(); CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 652345e52b64..70fd885e7f12 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -371,6 +371,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn * \return A list containing 2 vectors, first one for shape while second one for prediction result. */ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config); + +/*! + * \brief Run prediction on R dense matrix + * \param handle handle + * \param R_mat R matrix + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R CSR matrix + * \param handle handle + * \param lst An R list, containing, in this order: + * (a) 'p' array (a.k.a. indptr) + * (b) 'j' array (a.k.a. indices) + * (c) 'x' array (a.k.a. data / values) + * (d) number of columns + * \param missing missing value + * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R data.frame + * \param handle handle + * \param R_df R data.frame + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin); + /*! * \brief load model from existing file * \param handle handle diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index fb3162e423ce..5438c8bb2235 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -139,8 +139,8 @@ test_that("dart prediction works", { pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds)) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) - expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE))) - expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) + expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6) + expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6) expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) }) @@ -651,6 +651,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", { expect_equal(pred_qid, pred_gr) }) +test_that("Can predict on data.frame objects", { + data("mtcars") + y <- mtcars$mpg + x_df <- mtcars[, -1] + x_mat <- as.matrix(x_df) + dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads) + pred_df <- predict(model, x_df, nthread = n_threads) + expect_equal(pred_mat, pred_df) +}) + +test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", { + data("mtcars") + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + set.seed(123) + base_margin <- rnorm(nrow(x)) + dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin) + pred_from_dm <- predict(model, dm_w_base) + pred_from_mat <- predict(model, x, base_margin = base_margin) + + expect_equal(pred_from_dm, pred_from_mat) +}) + test_that("Coefficients from gblinear have the expected shape and names", { # Single-column coefficients data(mtcars) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 45bcac08d479..0612406444ae 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", { file.remove(fname_nan) }) +test_that("xgb.DMatrix: missing in CSR", { + x_dense <- matrix(as.numeric(1:10), nrow = 5) + x_dense[2, 1] <- NA_real_ + + x_csr <- as(x_dense, "RsparseMatrix") + + m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_) + xgb.DMatrix.save(m_dense, "dense.dmatrix") + + m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA) + xgb.DMatrix.save(m_csr, "csr.dmatrix") + + denseconn <- file("dense.dmatrix", "rb") + csrconn <- file("csr.dmatrix", "rb") + + expect_equal(file.size("dense.dmatrix"), file.size("csr.dmatrix")) + + bytes <- file.size("dense.dmatrix") + densedmatrix <- readBin(denseconn, "raw", n = bytes) + csrmatrix <- readBin(csrconn, "raw", n = bytes) + + expect_equal(length(densedmatrix), length(csrmatrix)) + expect_equal(densedmatrix, csrmatrix) + + close(denseconn) + close(csrconn) + + file.remove("dense.dmatrix") + file.remove("csr.dmatrix") +}) + test_that("xgb.DMatrix: error on three-dimensional array", { set.seed(123) x <- matrix(rnorm(500), nrow = 50) From 0ce4372bd49ffd79bbbafef6438af62d3cf8a342 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 25 Feb 2024 00:18:23 +0800 Subject: [PATCH 2/5] Use UBJSON for serializing splits for vertical data split. (#10059) --- .github/workflows/main.yml | 2 +- .github/workflows/python_tests.yml | 2 +- .github/workflows/python_wheels.yml | 2 +- .github/workflows/r_tests.yml | 2 +- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + src/collective/communicator-inl.cc | 34 ++++ src/collective/communicator-inl.h | 47 ++--- src/tree/hist/evaluate_splits.h | 164 +++++++----------- src/tree/hist/expand_entry.h | 15 +- src/tree/updater_quantile_hist.cc | 5 +- .../cpp/collective/test_rabit_communicator.cc | 45 ++++- tests/cpp/common/test_json.cc | 2 +- tests/cpp/tree/test_quantile_hist.cc | 5 +- 14 files changed, 162 insertions(+), 165 deletions(-) create mode 100644 src/collective/communicator-inl.cc diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 20e91a5d93f6..133e151e5e4f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -174,7 +174,7 @@ jobs: - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 with: submodules: 'true' - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" architecture: 'x64' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 0fca76673962..3fbcc7a01acf 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -310,7 +310,7 @@ jobs: submodules: 'true' - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml index f46b772950c9..12ae8a244e4f 100644 --- a/.github/workflows/python_wheels.yml +++ b/.github/workflows/python_wheels.yml @@ -21,7 +21,7 @@ jobs: with: submodules: 'true' - name: Setup Python - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" - name: Build wheels diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index d004ab15ca15..ad6853281c1f 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -74,7 +74,7 @@ jobs: key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" architecture: 'x64' diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index dd13983f5b59..0f4b3ac6f6a7 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -104,6 +104,7 @@ OBJECTS= \ $(PKGROOT)/src/collective/broadcast.o \ $(PKGROOT)/src/collective/comm.o \ $(PKGROOT)/src/collective/coll.o \ + $(PKGROOT)/src/collective/communicator-inl.o \ $(PKGROOT)/src/collective/tracker.o \ $(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 46a862711dc6..0c2084de940c 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -104,6 +104,7 @@ OBJECTS= \ $(PKGROOT)/src/collective/broadcast.o \ $(PKGROOT)/src/collective/comm.o \ $(PKGROOT)/src/collective/coll.o \ + $(PKGROOT)/src/collective/communicator-inl.o \ $(PKGROOT)/src/collective/tracker.o \ $(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \ diff --git a/src/collective/communicator-inl.cc b/src/collective/communicator-inl.cc new file mode 100644 index 000000000000..4164855f1cef --- /dev/null +++ b/src/collective/communicator-inl.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include "communicator-inl.h" + +namespace xgboost::collective { +[[nodiscard]] std::vector> VectorAllgatherV( + std::vector> const &input) { + auto n_inputs = input.size(); + std::vector sizes(n_inputs); + std::transform(input.cbegin(), input.cend(), sizes.begin(), + [](auto const &vec) { return vec.size(); }); + + std::vector global_sizes = AllgatherV(sizes); + std::vector offset(global_sizes.size() + 1); + offset[0] = 0; + for (std::size_t i = 1; i < offset.size(); i++) { + offset[i] = offset[i - 1] + global_sizes[i - 1]; + } + + std::vector collected; + for (auto const &vec : input) { + collected.insert(collected.end(), vec.cbegin(), vec.cend()); + } + auto out = AllgatherV(collected); + + std::vector> result; + for (std::size_t i = 1; i < offset.size(); ++i) { + std::vector local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]); + result.emplace_back(std::move(local)); + } + return result; +} +} // namespace xgboost::collective diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index 34212def2377..991e19f2c65a 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost contributors + * Copyright 2022-2024, XGBoost contributors */ #pragma once #include @@ -192,6 +192,18 @@ inline std::vector AllgatherV(std::vector const &input) { return result; } +/** + * @brief Gathers variable-length data from all processes and distributes it to all processes. + * + * @param inputs All the inputs from the local worker. The number of inputs can vary + * across different workers. Along with which, the size of each vector in + * the input can also vary. + * + * @return The AllgatherV result, containing vectors from all workers. + */ +[[nodiscard]] std::vector> VectorAllgatherV( + std::vector> const &input); + /** * @brief Gathers variable-length strings from all processes and distributes them to all processes. * @param input Variable-length list of variable-length strings. @@ -294,38 +306,5 @@ template inline void Allreduce(double *send_receive_buffer, size_t count) { Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op); } - -template -struct SpecialAllgatherVResult { - std::vector offsets; - std::vector sizes; - std::vector result; -}; - -/** - * @brief Gathers variable-length data from all processes and distributes it to all processes. - * - * We assume each worker has the same number of inputs, but each input may be of a different size. - * - * @param inputs All the inputs from the local worker. - * @param sizes Sizes of each input. - */ -template -inline SpecialAllgatherVResult SpecialAllgatherV(std::vector const &inputs, - std::vector const &sizes) { - // Gather the sizes across all workers. - auto const all_sizes = Allgather(sizes); - - // Calculate input offsets (std::exclusive_scan). - std::vector offsets(all_sizes.size()); - for (std::size_t i = 1; i < offsets.size(); i++) { - offsets[i] = offsets[i - 1] + all_sizes[i - 1]; - } - - // Gather all the inputs. - auto const all_inputs = AllgatherV(inputs); - - return {offsets, all_sizes, all_inputs}; -} } // namespace collective } // namespace xgboost diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index bc534d351f17..d25a41cb0b3c 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ @@ -26,6 +26,47 @@ #include "xgboost/linalg.h" // for Constants, Vector namespace xgboost::tree { +/** + * @brief Gather the expand entries from all the workers. + * @param entries Local expand entries on this worker. + * @return Global expand entries gathered from all workers. + */ +template +std::enable_if_t || + std::is_same_v, + std::vector> +AllgatherColumnSplit(std::vector const &entries) { + auto const n_entries = entries.size(); + + // First, gather all the primitive fields. + std::vector local_entries(n_entries); + + // Collect and serialize all entries + std::vector> serialized_entries; + for (std::size_t i = 0; i < n_entries; ++i) { + Json jentry{Object{}}; + entries[i].Save(&jentry); + + std::vector out; + Json::Dump(jentry, &out, std::ios::binary); + + serialized_entries.emplace_back(std::move(out)); + } + auto all_serialized = collective::VectorAllgatherV(serialized_entries); + CHECK_GE(all_serialized.size(), local_entries.size()); + + std::vector all_entries(all_serialized.size()); + std::transform(all_serialized.cbegin(), all_serialized.cend(), all_entries.begin(), + [](std::vector const &e) { + ExpandEntry entry; + auto je = Json::Load(StringView{e.data(), e.size()}, std::ios::binary); + entry.Load(je); + return entry; + }); + + return all_entries; +} + class HistEvaluator { private: struct NodeEntry { @@ -36,8 +77,8 @@ class HistEvaluator { }; private: - Context const* ctx_; - TrainParam const* param_; + Context const *ctx_; + TrainParam const *param_; std::shared_ptr column_sampler_; TreeEvaluator tree_evaluator_; bool is_col_split_{false}; @@ -202,7 +243,7 @@ class HistEvaluator { common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); - std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { + std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](std::size_t c) { auto cat = cut_val[c + f_begin]; cat_bits.Set(cat); }); @@ -285,57 +326,23 @@ class HistEvaluator { return left_sum; } - /** - * @brief Gather the expand entries from all the workers. - * @param entries Local expand entries on this worker. - * @return Global expand entries gathered from all workers. - */ - std::vector Allgather(std::vector const &entries) { - auto const world = collective::GetWorldSize(); - auto const num_entries = entries.size(); - - // First, gather all the primitive fields. - std::vector local_entries(num_entries); - std::vector cat_bits; - std::vector cat_bits_sizes; - for (std::size_t i = 0; i < num_entries; i++) { - local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); - } - auto all_entries = collective::Allgather(local_entries); - - // Gather all the cat_bits. - auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); - - common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) { - // Copy the cat_bits back into all expand entries. - all_entries[i].split.cat_bits.resize(gathered.sizes[i]); - std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i], - all_entries[i].split.cat_bits.begin()); - }); - - return all_entries; - } - public: void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut, common::Span feature_types, const RegTree &tree, std::vector *p_entries) { auto n_threads = ctx_->Threads(); - auto& entries = *p_entries; + auto &entries = *p_entries; // All nodes are on the same level, so we can store the shared ptr. - std::vector>> features( - entries.size()); + std::vector>> features(entries.size()); for (size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { auto nidx = entries[nidx_in_set].nid; - features[nidx_in_set] = - column_sampler_->GetFeatureSet(tree.GetDepth(nidx)); + features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx)); } CHECK(!features.empty()); - const size_t grain_size = - std::max(1, features.front()->Size() / n_threads); - common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) { - return features[nidx_in_set]->Size(); - }, grain_size); + const size_t grain_size = std::max(1, features.front()->Size() / n_threads); + common::BlockedSpace2d space( + entries.size(), [&](size_t nidx_in_set) { return features[nidx_in_set]->Size(); }, + grain_size); std::vector tloc_candidates(n_threads * entries.size()); for (size_t i = 0; i < entries.size(); ++i) { @@ -344,7 +351,7 @@ class HistEvaluator { } } auto evaluator = tree_evaluator_.GetEvaluator(); - auto const& cut_ptrs = cut.Ptrs(); + auto const &cut_ptrs = cut.Ptrs(); common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) { auto tidx = omp_get_thread_num(); @@ -385,18 +392,16 @@ class HistEvaluator { } }); - for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); - ++nidx_in_set) { + for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { for (auto tidx = 0; tidx < n_threads; ++tidx) { - entries[nidx_in_set].split.Update( - tloc_candidates[n_threads * nidx_in_set + tidx].split); + entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); } } if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. - auto all_entries = Allgather(entries); + auto all_entries = AllgatherColumnSplit(entries); for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { entries[nidx_in_set].split.Update( @@ -407,7 +412,7 @@ class HistEvaluator { } // Add splits to tree, handles all statistic - void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) { + void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); RegTree &tree = *p_tree; @@ -437,8 +442,7 @@ class HistEvaluator { auto left_child = tree[candidate.nid].LeftChild(); auto right_child = tree[candidate.nid].RightChild(); tree_evaluator_.AddSplit(candidate.nid, left_child, right_child, - tree[candidate.nid].SplitIndex(), left_weight, - right_weight); + tree[candidate.nid].SplitIndex(), left_weight, right_weight); evaluator = tree_evaluator_.GetEvaluator(); snode_.resize(tree.GetNodes().size()); @@ -449,8 +453,7 @@ class HistEvaluator { snode_.at(right_child).root_gain = evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum}); - interaction_constraints_.Split(candidate.nid, - tree[candidate.nid].SplitIndex(), left_child, + interaction_constraints_.Split(candidate.nid, tree[candidate.nid].SplitIndex(), left_child, right_child); } @@ -571,53 +574,6 @@ class HistMultiEvaluator { return false; } - /** - * @brief Gather the expand entries from all the workers. - * @param entries Local expand entries on this worker. - * @return Global expand entries gathered from all workers. - */ - std::vector Allgather(std::vector const &entries) { - auto const world = collective::GetWorldSize(); - auto const num_entries = entries.size(); - - // First, gather all the primitive fields. - std::vector local_entries(num_entries); - std::vector cat_bits; - std::vector cat_bits_sizes; - std::vector gradients; - for (std::size_t i = 0; i < num_entries; i++) { - local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients); - } - auto all_entries = collective::Allgather(local_entries); - - // Gather all the cat_bits. - auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); - - // Gather all the gradients. - auto const num_gradients = gradients.size(); - auto const all_gradients = collective::Allgather(gradients); - - auto const total_entries = num_entries * world; - auto const gradients_per_entry = num_gradients / num_entries; - auto const gradients_per_side = gradients_per_entry / 2; - common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) { - // Copy the cat_bits back into all expand entries. - all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]); - std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i], - gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin()); - - // Copy the gradients back into all expand entries. - all_entries[i].split.left_sum.resize(gradients_per_side); - std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side, - all_entries[i].split.left_sum.begin()); - all_entries[i].split.right_sum.resize(gradients_per_side); - std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side, - gradients_per_side, all_entries[i].split.right_sum.begin()); - }); - - return all_entries; - } - public: void EvaluateSplits(RegTree const &tree, common::Span hist, common::HistogramCuts const &cut, std::vector *p_entries) { @@ -676,7 +632,7 @@ class HistMultiEvaluator { if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. - auto all_entries = Allgather(entries); + auto all_entries = AllgatherColumnSplit(entries); for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { entries[nidx_in_set].split.Update( diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index d6315877d1e0..fd16397e1193 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -90,7 +90,6 @@ struct ExpandEntryImpl { } self->split.is_cat = get(split["is_cat"]); - self->LoadGrad(split); } }; @@ -106,8 +105,8 @@ struct CPUExpandEntry : public ExpandEntryImpl { void SaveGrad(Json* p_out) const { auto& out = *p_out; auto save = [&](std::string const& name, GradStats const& sum) { - out[name] = F32Array{2}; - auto& array = get(out[name]); + out[name] = F64Array{2}; + auto& array = get(out[name]); array[0] = sum.GetGrad(); array[1] = sum.GetHess(); }; @@ -115,9 +114,9 @@ struct CPUExpandEntry : public ExpandEntryImpl { save("right_sum", this->split.right_sum); } void LoadGrad(Json const& in) { - auto const& left_sum = get(in["left_sum"]); + auto const& left_sum = get(in["left_sum"]); this->split.left_sum = GradStats{left_sum[0], left_sum[1]}; - auto const& right_sum = get(in["right_sum"]); + auto const& right_sum = get(in["right_sum"]); this->split.right_sum = GradStats{right_sum[0], right_sum[1]}; } @@ -173,8 +172,8 @@ struct MultiExpandEntry : public ExpandEntryImpl { void SaveGrad(Json* p_out) const { auto& out = *p_out; auto save = [&](std::string const& name, std::vector const& sum) { - out[name] = F32Array{sum.size() * 2}; - auto& array = get(out[name]); + out[name] = F64Array{sum.size() * 2}; + auto& array = get(out[name]); for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) { array[j] = sum[i].GetGrad(); array[j + 1] = sum[i].GetHess(); @@ -185,7 +184,7 @@ struct MultiExpandEntry : public ExpandEntryImpl { } void LoadGrad(Json const& in) { auto load = [&](std::string const& name, std::vector* p_sum) { - auto const& array = get(in[name]); + auto const& array = get(in[name]); auto& sum = *p_sum; sum.resize(array.size() / 2); for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) { diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index c2aaedafac95..cd60e6602cf7 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023, XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file updater_quantile_hist.cc * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Checn, Egor Smirnov @@ -149,9 +149,6 @@ class MultiTargetHistBuilder { } void InitData(DMatrix *p_fmat, RegTree const *p_tree) { - if (collective::IsDistributed()) { - LOG(FATAL) << "Distributed training for vector-leaf is not yet supported."; - } monitor_->Start(__func__); p_last_fmat_ = p_fmat; diff --git a/tests/cpp/collective/test_rabit_communicator.cc b/tests/cpp/collective/test_rabit_communicator.cc index ba22d8fdb84f..9711e1aede71 100644 --- a/tests/cpp/collective/test_rabit_communicator.cc +++ b/tests/cpp/collective/test_rabit_communicator.cc @@ -1,13 +1,12 @@ -/*! - * Copyright 2022 XGBoost contributors +/** + * Copyright 2022-2024, XGBoost contributors */ #include #include "../../../src/collective/rabit_communicator.h" +#include "../helpers.h" -namespace xgboost { -namespace collective { - +namespace xgboost::collective { TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) { auto construct = []() { RabitCommunicator comm{0, 0}; }; EXPECT_THROW(construct(), dmlc::Error); @@ -35,5 +34,37 @@ TEST(RabitCommunicatorSimpleTest, IsNotDistributed) { EXPECT_FALSE(comm.IsDistributed()); } -} // namespace collective -} // namespace xgboost +namespace { +void VerifyVectorAllgatherV() { + auto n_workers = collective::GetWorldSize(); + ASSERT_EQ(n_workers, 3); + auto rank = collective::GetRank(); + // Construct input that has different length for each worker. + std::vector> inputs; + for (std::int32_t i = 0; i < rank + 1; ++i) { + std::vector in; + for (std::int32_t j = 0; j < rank + 1; ++j) { + in.push_back(static_cast(j)); + } + inputs.emplace_back(std::move(in)); + } + + auto outputs = VectorAllgatherV(inputs); + + ASSERT_EQ(outputs.size(), (1 + n_workers) * n_workers / 2); + auto const& res = outputs; + + for (std::int32_t i = 0; i < n_workers; ++i) { + std::int32_t k = 0; + for (auto v : res[i]) { + ASSERT_EQ(v, k++); + } + } +} +} // namespace + +TEST(VectorAllgatherV, Basic) { + std::int32_t n_workers{3}; + RunWithInMemoryCommunicator(n_workers, VerifyVectorAllgatherV); +} +} // namespace xgboost::collective diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 72163efd78cc..3ee041a339ed 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index cf806536a861..4021c9959440 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 by XGBoost Contributors + * Copyright 2018-2024, XGBoost Contributors */ #include #include @@ -18,7 +18,6 @@ #include "xgboost/data.h" namespace xgboost::tree { - namespace { template void TestPartitioner(bst_target_t n_targets) { @@ -253,5 +252,5 @@ void TestColumnSplit(bst_target_t n_targets) { TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); } -TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); } +TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); } } // namespace xgboost::tree From 761845f59402d61b575c3fa2f4fc60ec66804d1e Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Mon, 26 Feb 2024 14:07:36 +0100 Subject: [PATCH 3/5] [SYCL] Implement row set collection. (#10057) Co-authored-by: Dmitry Razdoburdin <> --- plugin/sycl/common/row_set.h | 123 ++++++++++++++++++ .../plugin/test_sycl_row_set_collection.cc | 78 +++++++++++ 2 files changed, 201 insertions(+) create mode 100644 plugin/sycl/common/row_set.h create mode 100644 tests/cpp/plugin/test_sycl_row_set_collection.cc diff --git a/plugin/sycl/common/row_set.h b/plugin/sycl/common/row_set.h new file mode 100644 index 000000000000..574adbf8d9b9 --- /dev/null +++ b/plugin/sycl/common/row_set.h @@ -0,0 +1,123 @@ +/*! + * Copyright 2017-2023 XGBoost contributors + */ +#ifndef PLUGIN_SYCL_COMMON_ROW_SET_H_ +#define PLUGIN_SYCL_COMMON_ROW_SET_H_ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop +#include +#include +#include + +#include "../data.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + + +/*! \brief Collection of rowsets stored on device in USM memory */ +class RowSetCollection { + public: + /*! \brief data structure to store an instance set, a subset of + * rows (instances) associated with a particular node in a decision + * tree. */ + struct Elem { + const size_t* begin{nullptr}; + const size_t* end{nullptr}; + bst_node_t node_id{-1}; // id of node associated with this instance set; -1 means uninitialized + Elem() + = default; + Elem(const size_t* begin, + const size_t* end, + bst_node_t node_id = -1) + : begin(begin), end(end), node_id(node_id) {} + + + inline size_t Size() const { + return end - begin; + } + }; + + inline size_t Size() const { + return elem_of_each_node_.size(); + } + + /*! \brief return corresponding element set given the node_id */ + inline const Elem& operator[](unsigned node_id) const { + const Elem& e = elem_of_each_node_[node_id]; + CHECK(e.begin != nullptr) + << "access element that is not in the set"; + return e; + } + + /*! \brief return corresponding element set given the node_id */ + inline Elem& operator[](unsigned node_id) { + Elem& e = elem_of_each_node_[node_id]; + return e; + } + + // clear up things + inline void Clear() { + elem_of_each_node_.clear(); + } + // initialize node id 0->everything + inline void Init() { + CHECK_EQ(elem_of_each_node_.size(), 0U); + + const size_t* begin = row_indices_.Begin(); + const size_t* end = row_indices_.End(); + elem_of_each_node_.emplace_back(Elem(begin, end, 0)); + } + + auto& Data() { return row_indices_; } + + // split rowset into two + inline void AddSplit(unsigned node_id, + unsigned left_node_id, + unsigned right_node_id, + size_t n_left, + size_t n_right) { + const Elem e = elem_of_each_node_[node_id]; + CHECK(e.begin != nullptr); + size_t* all_begin = row_indices_.Begin(); + size_t* begin = all_begin + (e.begin - all_begin); + + + CHECK_EQ(n_left + n_right, e.Size()); + CHECK_LE(begin + n_left, e.end); + CHECK_EQ(begin + n_left + n_right, e.end); + + + if (left_node_id >= elem_of_each_node_.size()) { + elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1)); + } + if (right_node_id >= elem_of_each_node_.size()) { + elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1)); + } + + + elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id); + elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id); + elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1); + } + + private: + // stores the row indexes in the set + USMVector row_indices_; + // vector: node_id -> elements + std::vector elem_of_each_node_; +}; + +} // namespace common +} // namespace sycl +} // namespace xgboost + + +#endif // PLUGIN_SYCL_COMMON_ROW_SET_H_ diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc new file mode 100644 index 000000000000..f527d9f16d1b --- /dev/null +++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc @@ -0,0 +1,78 @@ +/** + * Copyright 2020-2023 by XGBoost contributors + */ +#include + +#include +#include +#include + +#include "../../../plugin/sycl/common/row_set.h" +#include "../../../plugin/sycl/device_manager.h" +#include "../helpers.h" + +namespace xgboost::sycl::common { +TEST(SyclRowSetCollection, AddSplits) { + const size_t num_rows = 16; + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault()); + + RowSetCollection row_set_collection; + + auto& row_indices = row_set_collection.Data(); + row_indices.Resize(&qu, num_rows); + size_t* p_row_indices = row_indices.Data(); + + qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), + [p_row_indices](::sycl::item<1> pid) { + const size_t idx = pid.get_id(0); + p_row_indices[idx] = idx; + }); + }).wait_and_throw(); + row_set_collection.Init(); + + CHECK_EQ(row_set_collection.Size(), 1); + { + size_t nid_test = 0; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin()); + CHECK_EQ(elem.end, row_indices.End()); + CHECK_EQ(elem.node_id , 0); + } + + size_t nid = 0; + size_t nid_left = 1; + size_t nid_right = 2; + size_t n_left = 4; + size_t n_right = num_rows - n_left; + row_set_collection.AddSplit(nid, nid_left, nid_right, n_left, n_right); + CHECK_EQ(row_set_collection.Size(), 3); + + { + size_t nid_test = 0; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, nullptr); + CHECK_EQ(elem.end, nullptr); + CHECK_EQ(elem.node_id , -1); + } + + { + size_t nid_test = 1; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin()); + CHECK_EQ(elem.end, row_indices.Begin() + n_left); + CHECK_EQ(elem.node_id , nid_test); + } + + { + size_t nid_test = 2; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin() + n_left); + CHECK_EQ(elem.end, row_indices.End()); + CHECK_EQ(elem.node_id , nid_test); + } + +} +} // namespace xgboost::sycl::common From 5ac233280e1218fcf9de011fbbbe7841402d9866 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 28 Feb 2024 03:12:42 +0800 Subject: [PATCH 4/5] Require context in aggregators. (#10075) --- .clang-format | 2 +- include/xgboost/collective/result.h | 12 ++++++-- src/collective/aggregator.h | 46 ++++++++++++++++------------- src/common/quantile.cc | 34 +++++++++++++-------- src/learner.cc | 6 ++-- src/metric/auc.cc | 10 ++++--- src/metric/elementwise_metric.cu | 37 ++++++++++++----------- src/metric/metric_common.h | 7 ++--- src/metric/multiclass_metric.cu | 20 ++++++------- src/metric/rank_metric.cc | 11 +++---- src/metric/rank_metric.cu | 2 +- src/metric/survival_metric.cu | 28 ++++++++---------- src/objective/adaptive.cc | 8 ++--- src/objective/adaptive.cu | 9 +++--- src/objective/adaptive.h | 23 ++++++++------- src/objective/quantile_obj.cu | 6 ++-- src/objective/regression_obj.cu | 10 +++++-- src/tree/fit_stump.cc | 11 ++++--- src/tree/fit_stump.cu | 17 +++++------ src/tree/gpu_hist/histogram.cu | 11 ++++--- src/tree/updater_approx.cc | 7 +++-- src/tree/updater_gpu_hist.cu | 6 ++-- src/tree/updater_quantile_hist.cc | 11 +++++-- 23 files changed, 190 insertions(+), 144 deletions(-) diff --git a/.clang-format b/.clang-format index 0984d5a7bf30..737cf9006bae 100644 --- a/.clang-format +++ b/.clang-format @@ -17,7 +17,7 @@ AllowShortEnumsOnASingleLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All +AllowShortLambdasOnASingleLine: Inline AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h index 507171dd4ff1..919d3a902298 100644 --- a/include/xgboost/collective/result.h +++ b/include/xgboost/collective/result.h @@ -1,8 +1,10 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once +#include + #include // for unique_ptr #include // for stringstream #include // for stack @@ -160,10 +162,16 @@ struct Result { // We don't have monad, a simple helper would do. template -Result operator<<(Result&& r, Fn&& fn) { +[[nodiscard]] Result operator<<(Result&& r, Fn&& fn) { if (!r.OK()) { return std::forward(r); } return fn(); } + +inline void SafeColl(Result const& rc) { + if (!rc.OK()) { + LOG(FATAL) << rc.Report(); + } +} } // namespace xgboost::collective diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index f2a9ff528366..8a5b31c36546 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -1,22 +1,21 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023-2024, XGBoost contributors * * Higher level functions built on top the Communicator API, taking care of behavioral differences * between row-split vs column-split distributed training, and horizontal vs vertical federated * learning. */ #pragma once -#include - #include #include #include #include #include "communicator-inl.h" +#include "xgboost/collective/result.h" // for Result +#include "xgboost/data.h" // for MetaINfo -namespace xgboost { -namespace collective { +namespace xgboost::collective { /** * @brief Apply the given function where the labels are. @@ -31,15 +30,16 @@ namespace collective { * @param size The size of the buffer. * @param function The function used to calculate the results. */ -template -void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) { +template +void ApplyWithLabels(Context const*, MetaInfo const& info, void* buffer, std::size_t size, + FN&& function) { if (info.IsVerticalFederated()) { // We assume labels are only available on worker 0, so the calculation is done there and result // broadcast to other workers. std::string message; if (collective::GetRank() == 0) { try { - std::forward(function)(); + std::forward(function)(); } catch (dmlc::Error& e) { message = e.what(); } @@ -52,7 +52,7 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& LOG(FATAL) << &message[0]; } } else { - std::forward(function)(); + std::forward(function)(); } } @@ -70,7 +70,8 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& * @param function The function used to calculate the results. */ template -void ApplyWithLabels(MetaInfo const& info, HostDeviceVector* result, Function&& function) { +void ApplyWithLabels(Context const*, MetaInfo const& info, HostDeviceVector* result, + Function&& function) { if (info.IsVerticalFederated()) { // We assume labels are only available on worker 0, so the calculation is done there and result // broadcast to other workers. @@ -114,7 +115,9 @@ void ApplyWithLabels(MetaInfo const& info, HostDeviceVector* result, Function * @return The global max of the input. */ template -T GlobalMax(MetaInfo const& info, T value) { +std::enable_if_t, T> GlobalMax(Context const*, + MetaInfo const& info, + T value) { if (info.IsRowSplit()) { collective::Allreduce(&value, 1); } @@ -132,16 +135,18 @@ T GlobalMax(MetaInfo const& info, T value) { * @param values Pointer to the inputs to sum. * @param size Number of values to sum. */ -template -void GlobalSum(MetaInfo const& info, T* values, size_t size) { +template +[[nodiscard]] Result GlobalSum(Context const*, MetaInfo const& info, + linalg::TensorView values) { if (info.IsRowSplit()) { - collective::Allreduce(values, size); + collective::Allreduce(values.Values().data(), values.Size()); } + return Success(); } template -void GlobalSum(MetaInfo const& info, Container* values) { - GlobalSum(info, values->data(), values->size()); +[[nodiscard]] Result GlobalSum(Context const* ctx, MetaInfo const& info, Container* values) { + return GlobalSum(ctx, info, values->data(), values->size()); } /** @@ -157,9 +162,10 @@ void GlobalSum(MetaInfo const& info, Container* values) { * @return The global ratio of the two inputs. */ template -T GlobalRatio(MetaInfo const& info, T dividend, T divisor) { +T GlobalRatio(Context const* ctx, MetaInfo const& info, T dividend, T divisor) { std::array results{dividend, divisor}; - GlobalSum(info, &results); + auto rc = GlobalSum(ctx, info, linalg::MakeVec(results.data(), results.size())); + collective::SafeColl(rc); std::tie(dividend, divisor) = std::tuple_cat(results); if (divisor <= 0) { return std::numeric_limits::quiet_NaN(); @@ -167,6 +173,4 @@ T GlobalRatio(MetaInfo const& info, T dividend, T divisor) { return dividend / divisor; } } - -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/common/quantile.cc b/src/common/quantile.cc index c74db99e4006..e521fae69b1d 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -1,5 +1,5 @@ -/*! - * Copyright 2020-2022 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #include "quantile.h" @@ -145,7 +145,7 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( - Context const *, MetaInfo const &info, + Context const *ctx, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { @@ -171,7 +171,9 @@ void SketchContainerImpl::GatherSketchInfo( std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1); // Gather all column pointers - collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size()); + auto rc = + collective::GlobalSum(ctx, info, linalg::MakeVec(sketches_scan.data(), sketches_scan.size())); + collective::SafeColl(rc); for (int32_t i = 0; i < world; ++i) { size_t back = (i + 1) * (n_columns + 1) - 1; auto n_entries = sketches_scan.at(back); @@ -199,14 +201,15 @@ void SketchContainerImpl::GatherSketchInfo( static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float), "Unexpected size of sketch entry."); - collective::GlobalSum( - info, - reinterpret_cast(global_sketches.data()), - global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)); + rc = collective::GlobalSum( + ctx, info, + linalg::MakeVec(reinterpret_cast(global_sketches.data()), + global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float))); + collective::SafeColl(rc); } template -void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo const& info) { +void SketchContainerImpl::AllreduceCategories(Context const* ctx, MetaInfo const& info) { auto world_size = collective::GetWorldSize(); auto rank = collective::GetRank(); if (world_size == 1 || info.IsColumnSplit()) { @@ -226,7 +229,8 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo std::vector global_feat_ptrs(feature_ptr.size() * world_size, 0); size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin); - collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size()); + auto rc = collective::GlobalSum( + ctx, info, linalg::MakeVec(global_feat_ptrs.data(), global_feat_ptrs.size())); // move all categories into a flatten vector to prepare for allreduce size_t total = feature_ptr.back(); @@ -239,7 +243,8 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo // indptr for indexing workers std::vector global_worker_ptr(world_size + 1, 0); global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr - collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size()); + rc = collective::GlobalSum(ctx, info, + linalg::MakeVec(global_worker_ptr.data(), global_worker_ptr.size())); std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin()); // total number of categories in all workers with all features auto gtotal = global_worker_ptr.back(); @@ -251,7 +256,8 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo CHECK_EQ(rank_size, total); std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin); // gather values from all workers. - collective::GlobalSum(info, global_categories.data(), global_categories.size()); + rc = collective::GlobalSum(ctx, info, + linalg::MakeVec(global_categories.data(), global_categories.size())); QuantileAllreduce allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs, categories_.size()}; ParallelFor(categories_.size(), n_threads_, [&](auto fidx) { @@ -293,7 +299,9 @@ void SketchContainerImpl::AllReduce( // Prune the intermediate num cuts for synchronization. std::vector global_column_size(columns_size_); - collective::GlobalSum(info, &global_column_size); + auto rc = collective::GlobalSum( + ctx, info, linalg::MakeVec(global_column_size.data(), global_column_size.size())); + collective::SafeColl(rc); ParallelFor(sketches_.size(), n_threads_, [&](size_t i) { int32_t intermediate_num_cuts = static_cast( diff --git a/src/learner.cc b/src/learner.cc index db72f71644cb..eed9dd5cdcd7 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * \file learner.cc * \brief Implementation of learning algorithm. * \author Tianqi Chen @@ -846,7 +846,7 @@ class LearnerConfiguration : public Learner { void InitEstimation(MetaInfo const& info, linalg::Tensor* base_score) { base_score->Reshape(1); - collective::ApplyWithLabels(info, base_score->Data(), + collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(), [&] { UsePtr(obj_)->InitEstimation(info, base_score); }); } }; @@ -1472,7 +1472,7 @@ class LearnerImpl : public LearnerIO { void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, std::int32_t iter, linalg::Matrix* out_gpair) { out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); - collective::ApplyWithLabels(info, out_gpair->Data(), + collective::ApplyWithLabels(&ctx_, info, out_gpair->Data(), [&] { obj_->GetGradient(preds, info, iter, out_gpair); }); } diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 4a8aa8a4bbdc..212a3a027d35 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include "auc.h" @@ -112,7 +112,9 @@ double MultiClassOVR(Context const *ctx, common::Span predts, MetaI // we have 2 averages going in here, first is among workers, second is among // classes. allreduce sums up fp/tp auc for each class. - collective::GlobalSum(info, &results.Values()); + auto rc = collective::GlobalSum(ctx, info, results); + collective::SafeColl(rc); + double auc_sum{0}; double tp_sum{0}; for (size_t c = 0; c < n_classes; ++c) { @@ -286,7 +288,7 @@ class EvalAUC : public MetricNoCache { InvalidGroupAUC(); } - auc = collective::GlobalRatio(info, auc, static_cast(valid_groups)); + auc = collective::GlobalRatio(ctx_, info, auc, static_cast(valid_groups)); if (!std::isnan(auc)) { CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups << ", valid groups: " << valid_groups; @@ -307,7 +309,7 @@ class EvalAUC : public MetricNoCache { std::tie(fp, tp, auc) = static_cast(this)->EvalBinary(preds, info); } - auc = collective::GlobalRatio(info, auc, fp * tp); + auc = collective::GlobalRatio(ctx_, info, auc, fp * tp); if (!std::isnan(auc)) { CHECK_LE(auc, 1.0); } diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index f245f3e06306..9c26011aa99f 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file elementwise_metric.cu * \brief evaluation metrics for elementwise binary or regression. * \author Kailong Chen, Tianqi Chen @@ -12,13 +12,14 @@ #include #include "../collective/communicator-inl.h" -#include "../common/common.h" // MetricNoCache +#include "../common/common.h" // MetricNoCache #include "../common/math.h" #include "../common/optional_weight.h" // OptionalWeights #include "../common/pseudo_huber.h" #include "../common/quantile_loss_utils.h" // QuantileLossParam #include "../common/threading_utils.h" #include "metric_common.h" +#include "xgboost/collective/result.h" // for SafeColl #include "xgboost/metric.h" #if defined(XGBOOST_USE_CUDA) @@ -30,8 +31,7 @@ #include "../common/device_helpers.cuh" #endif // XGBOOST_USE_CUDA -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(elementwise_metric); @@ -199,7 +199,8 @@ class PseudoErrorLoss : public MetricNoCache { return std::make_tuple(v, wt); }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return EvalRowMAPE::GetFinal(dat[0], dat[1]); } }; @@ -243,11 +244,11 @@ struct EvalError { }; struct EvalPoissonNegLogLik { - const char *Name() const { + [[nodiscard]] const char *Name() const { return "poisson-nloglik"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { const bst_float eps = 1e-16f; if (py < eps) py = eps; return common::LogGamma(y + 1.0f) + py - std::log(py) * y; @@ -266,9 +267,9 @@ struct EvalPoissonNegLogLik { * predt >= 0 */ struct EvalGammaDeviance { - const char *Name() const { return "gamma-deviance"; } + [[nodiscard]] const char *Name() const { return "gamma-deviance"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const { predt += kRtEps; label += kRtEps; return std::log(predt / label) + label / predt - 1; @@ -287,7 +288,7 @@ struct EvalGammaNLogLik { return "gamma-nloglik"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { py = std::max(py, 1e-6f); // hardcoded dispersion. float constexpr kPsi = 1.0; @@ -313,7 +314,7 @@ struct EvalTweedieNLogLik { CHECK(rho_ < 2 && rho_ >= 1) << "tweedie variance power must be in interval [1, 2)"; } - const char *Name() const { + [[nodiscard]] const char *Name() const { static thread_local std::string name; std::ostringstream os; os << "tweedie-nloglik@" << rho_; @@ -321,7 +322,7 @@ struct EvalTweedieNLogLik { return name.c_str(); } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const { bst_float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_); bst_float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_); return -a + b; @@ -366,7 +367,8 @@ struct EvalEWiseBase : public MetricNoCache { }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Policy::GetFinal(dat[0], dat[1]); } @@ -438,7 +440,8 @@ class QuantileError : public MetricNoCache { if (info.num_row_ == 0) { // empty DMatrix on distributed env std::array dat{0.0, 0.0}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); CHECK_GT(dat[1], 0); return dat[0] / dat[1]; } @@ -476,7 +479,8 @@ class QuantileError : public MetricNoCache { return std::make_tuple(l, w); }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); CHECK_GT(dat[1], 0); return dat[0] / dat[1]; } @@ -501,5 +505,4 @@ class QuantileError : public MetricNoCache { XGBOOST_REGISTER_METRIC(QuantileError, "quantile") .describe("Quantile regression error.") .set_body([](const char*) { return new QuantileError{}; }); -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h index 1b148ab0f47c..53c38ff2a8c2 100644 --- a/src/metric/metric_common.h +++ b/src/metric/metric_common.h @@ -1,6 +1,5 @@ -/*! - * Copyright 2018-2022 by Contributors - * \file metric_common.h +/** + * Copyright 2018-2024, Contributors */ #ifndef XGBOOST_METRIC_METRIC_COMMON_H_ #define XGBOOST_METRIC_METRIC_COMMON_H_ @@ -24,7 +23,7 @@ class MetricNoCache : public Metric { double Evaluate(HostDeviceVector const &predts, std::shared_ptr p_fmat) final { double result{0.0}; auto const &info = p_fmat->Info(); - collective::ApplyWithLabels(info, &result, sizeof(double), + collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] { result = this->Eval(predts, info); }); return result; } diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index 897c91dabe96..acaef7cf7e84 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file multiclass_metric.cc * \brief evaluation metrics for multiclass classification. * \author Kailong Chen, Tianqi Chen @@ -24,8 +24,7 @@ #include "../common/device_helpers.cuh" #endif // XGBOOST_USE_CUDA -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(multiclass_metric); @@ -40,11 +39,10 @@ class MultiClassMetricsReduction { public: MultiClassMetricsReduction() = default; - PackedReduceResult - CpuReduceMetrics(const HostDeviceVector &weights, - const HostDeviceVector &labels, - const HostDeviceVector &preds, - const size_t n_class, int32_t n_threads) const { + [[nodiscard]] PackedReduceResult CpuReduceMetrics(const HostDeviceVector& weights, + const HostDeviceVector& labels, + const HostDeviceVector& preds, + const size_t n_class, int32_t n_threads) const { size_t ndata = labels.Size(); const auto& h_labels = labels.HostVector(); @@ -182,7 +180,8 @@ struct EvalMClassBase : public MetricNoCache { dat[0] = result.Residue(); dat[1] = result.Weights(); } - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Derived::GetFinal(dat[0], dat[1]); } /*! @@ -245,5 +244,4 @@ XGBOOST_REGISTER_METRIC(MatchError, "merror") XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss") .describe("Multiclass negative loglikelihood.") .set_body([](const char*) { return new EvalMultiLogLoss(); }); -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 6762aec32d46..53841c05135e 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -101,7 +101,7 @@ struct EvalAMS : public MetricNoCache { } } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return name_.c_str(); } @@ -159,7 +159,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { exc.Rethrow(); } - return collective::GlobalRatio(info, sum_metric, static_cast(ngroups)); + return collective::GlobalRatio(ctx_, info, sum_metric, static_cast(ngroups)); } [[nodiscard]] const char* Name() const override { @@ -274,7 +274,7 @@ class EvalRankWithCache : public Metric { double Evaluate(HostDeviceVector const& preds, std::shared_ptr p_fmat) override { double result{0.0}; auto const& info = p_fmat->Info(); - collective::ApplyWithLabels(info, &result, sizeof(double), [&] { + collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] { auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_); if (p_cache->Param() != param_) { p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_); @@ -294,9 +294,10 @@ class EvalRankWithCache : public Metric { }; namespace { -double Finalize(Context const*, MetaInfo const& info, double score, double sw) { +double Finalize(Context const* ctx, MetaInfo const& info, double score, double sw) { std::array dat{score, sw}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), 2)); + collective::SafeColl(rc); std::tie(score, sw) = std::tuple_cat(dat); if (sw > 0.0) { score = score / sw; diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index 372eb680531e..d43125dcbaf7 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include // for make_counting_iterator diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index c13702a1917f..c64fece6c1d3 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by Contributors + * Copyright 2019-2024, Contributors * \file survival_metric.cu * \brief Metrics for survival analysis * \author Avinash Barnwal, Hyunsu Cho and Toby Hocking @@ -30,8 +30,7 @@ using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType template using AFTLoss = xgboost::common::AFTLoss; -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(survival_metric); @@ -43,12 +42,11 @@ class ElementWiseSurvivalMetricsReduction { policy_ = policy; } - PackedReduceResult - CpuReduceMetrics(const HostDeviceVector &weights, - const HostDeviceVector &labels_lower_bound, - const HostDeviceVector &labels_upper_bound, - const HostDeviceVector &preds, - int32_t n_threads) const { + [[nodiscard]] PackedReduceResult CpuReduceMetrics( + const HostDeviceVector& weights, + const HostDeviceVector& labels_lower_bound, + const HostDeviceVector& labels_upper_bound, + const HostDeviceVector& preds, int32_t n_threads) const { size_t ndata = labels_lower_bound.Size(); CHECK_EQ(ndata, labels_upper_bound.Size()); @@ -155,7 +153,7 @@ class ElementWiseSurvivalMetricsReduction { struct EvalIntervalRegressionAccuracy { void Configure(const Args&) {} - const char* Name() const { + [[nodiscard]] const char* Name() const { return "interval-regression-accuracy"; } @@ -177,7 +175,7 @@ struct EvalAFTNLogLik { param_.UpdateAllowUnknown(args); } - const char* Name() const { + [[nodiscard]] const char* Name() const { return "aft-nloglik"; } @@ -213,7 +211,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { info.labels_upper_bound_, preds); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Policy::GetFinal(dat[0], dat[1]); } @@ -230,7 +229,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { // This class exists because we want to perform dispatch according to the distribution type at // configuration time, not at prediction time. struct AFTNLogLikDispatcher : public MetricNoCache { - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return "aft-nloglik"; } @@ -282,5 +281,4 @@ XGBOOST_REGISTER_METRIC(IntervalRegressionAccuracy, "interval-regression-accurac return new EvalEWiseSurvivalBase(); }); -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index b195dffd793a..e7778c464762 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include "adaptive.h" @@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit size_t n_leaf = nidx.size(); if (nptr.empty()) { std::vector quantiles; - UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree); return; } @@ -100,7 +100,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit predt.Size() / info.num_row_); collective::ApplyWithLabels( - info, static_cast(quantiles.data()), quantiles.size() * sizeof(float), [&] { + ctx, info, static_cast(quantiles.data()), quantiles.size() * sizeof(float), [&] { // loop over each leaf common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) { auto nidx = h_node_idx[k]; @@ -134,7 +134,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit }); }); - UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree); } #if !defined(XGBOOST_USE_CUDA) diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index 07644146b2ec..235e284198f3 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include @@ -150,7 +150,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos if (nptr.Empty()) { std::vector quantiles; - UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree); } predt.SetDevice(ctx->Device()); @@ -160,7 +160,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos auto t_predt = d_predt.Slice(linalg::All(), group_idx); HostDeviceVector quantiles; - collective::ApplyWithLabels(info, &quantiles, [&] { + collective::ApplyWithLabels(ctx, info, &quantiles, [&] { auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx)); auto d_row_index = dh::ToSpan(ridx); auto seg_beg = nptr.DevicePointer(); @@ -186,6 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos w_it + d_weights.size(), &quantiles); } }); - UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, + p_tree); } } // namespace xgboost::obj::detail diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index a64f37f63c83..cbe69e79a6cc 100644 --- a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #pragma once @@ -17,8 +17,7 @@ #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/tree_model.h" // RegTree -namespace xgboost { -namespace obj { +namespace xgboost::obj { namespace detail { inline void FillMissingLeaf(std::vector const& maybe_missing, std::vector* p_nidx, std::vector* p_nptr) { @@ -36,13 +35,14 @@ inline void FillMissingLeaf(std::vector const& maybe_missing, } } -inline void UpdateLeafValues(std::vector* p_quantiles, std::vector const& nidx, - MetaInfo const& info, float learning_rate, RegTree* p_tree) { +inline void UpdateLeafValues(Context const* ctx, std::vector* p_quantiles, + std::vector const& nidx, MetaInfo const& info, + float learning_rate, RegTree* p_tree) { auto& tree = *p_tree; auto& quantiles = *p_quantiles; auto const& h_node_idx = nidx; - size_t n_leaf = collective::GlobalMax(info, h_node_idx.size()); + size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size()); CHECK(quantiles.empty() || quantiles.size() == n_leaf); if (quantiles.empty()) { quantiles.resize(n_leaf, std::numeric_limits::quiet_NaN()); @@ -52,12 +52,16 @@ inline void UpdateLeafValues(std::vector* p_quantiles, std::vector n_valids(quantiles.size()); std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(), [](float q) { return static_cast(!std::isnan(q)); }); - collective::GlobalSum(info, &n_valids); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(n_valids.data(), n_valids.size())); + collective::SafeColl(rc); + // convert to 0 for all reduce std::replace_if( quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f); // use the mean value - collective::GlobalSum(info, &quantiles); + rc = collective::GlobalSum(ctx, info, linalg::MakeVec(quantiles.data(), quantiles.size())); + collective::SafeColl(rc); + for (size_t i = 0; i < n_leaf; ++i) { if (n_valids[i] > 0) { quantiles[i] /= static_cast(n_valids[i]); @@ -105,5 +109,4 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector cons predt, alpha, p_tree); } } -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 15ec72f95d91..7029a201ada8 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #include // std::array #include // std::size_t @@ -170,7 +170,9 @@ class QuantileRegression : public ObjFunction { double meanq = temp(0) * sw; std::array dat{meanq, sw}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); + std::tie(meanq, sw) = std::tuple_cat(dat); meanq /= (sw + kRtEps); base_score->Reshape(1); diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index df30b354b48a..3b60ff111b33 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file regression_obj.cu * \brief Definition of single-value regression and classification objectives. * \author Tianqi Chen, Kailong Chen @@ -672,8 +672,12 @@ class MeanAbsoluteError : public ObjFunction { std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out), [w](float v) { return v * w; }); - collective::GlobalSum(info, &out.Values()); - collective::GlobalSum(info, &w, 1); + auto rc = collective::Success() << [&] { + return collective::GlobalSum(ctx_, info, out); + } << [&] { + return collective::GlobalSum(ctx_, info, linalg::MakeVec(&w, 1)); + }; + collective::SafeColl(rc); if (common::CloseTo(w, 0.0)) { // Mostly for handling empty dataset test. diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 21a050536a2e..5e4d16e4e13e 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -1,7 +1,7 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * - * \brief Utilities for estimating initial score. + * @brief Utilities for estimating initial score. */ #include "fit_stump.h" @@ -44,8 +44,11 @@ void FitStump(Context const* ctx, MetaInfo const& info, } } CHECK(h_sum.CContiguous()); - - collective::GlobalSum(info, reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2); + auto as_double = linalg::MakeTensorView( + ctx, common::Span{reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2}, + h_sum.Size() * 2); + auto rc = collective::GlobalSum(ctx, info, as_double); + collective::SafeColl(rc); for (std::size_t i = 0; i < h_sum.Size(); ++i) { out(i) = static_cast(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess())); diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index 9fcacd081996..832d40754ec9 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -1,19 +1,18 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * - * \brief Utilities for estimating initial score. + * @brief Utilities for estimating initial score. */ #if !defined(NOMINMAX) && defined(_WIN32) #define NOMINMAX -#endif // !defined(NOMINMAX) -#include // cuda::par -#include // thrust::make_counting_iterator +#endif // !defined(NOMINMAX) +#include // cuda::par +#include // thrust::make_counting_iterator -#include // std::size_t +#include // std::size_t -#include "../collective/aggregator.cuh" -#include "../collective/communicator-inl.cuh" -#include "../common/device_helpers.cuh" // dh::MakeTransformIterator +#include "../collective/aggregator.cuh" // for GlobalSum +#include "../common/device_helpers.cuh" // dh::MakeTransformIterator #include "fit_stump.h" #include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE #include "xgboost/context.h" // Context diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index c473c9269580..90c151556566 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -52,7 +52,7 @@ struct Clip : public thrust::unary_function { * * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree. */ -GradientQuantiser::GradientQuantiser(Context const*, common::Span gpair, +GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; @@ -65,11 +65,14 @@ GradientQuantiser::GradientQuantiser(Context const*, common::Span(&p), 4); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(reinterpret_cast(&p), 4)); + collective::SafeColl(rc); + GradientPair positive_sum{p.first}, negative_sum{p.second}; std::size_t total_rows = gpair.size(); - collective::GlobalSum(info, &total_rows, 1); + rc = collective::GlobalSum(ctx, info, linalg::MakeVec(&total_rows, 1)); + collective::SafeColl(rc); auto histogram_rounding = GradientSumT{common::CreateRoundingFactor( diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 94e7547ee209..68317fc41a85 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost contributors + * Copyright 2021-2024, XGBoost contributors * * \brief Implementation for the approx tree method. */ @@ -107,7 +107,10 @@ class GloablApproxBuilder { for (auto const &g : gpair) { root_sum.Add(g); } - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&root_sum), 2); + auto rc = collective::GlobalSum(ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(&root_sum), 2)); + collective::SafeColl(rc); + std::vector nodes{best}; this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_, linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3c9c61f88847..4911cec093c8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -729,7 +729,9 @@ struct GPUHistMakerDevice { dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{}, thrust::plus{}); using ReduceT = typename decltype(root_sum_quantised)::ValueT; - collective::GlobalSum(info_, reinterpret_cast(&root_sum_quantised), 2); + auto rc = collective::GlobalSum( + ctx_, info_, linalg::MakeVec(reinterpret_cast(&root_sum_quantised), 2)); + collective::SafeColl(rc); hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index cd60e6602cf7..ced277773055 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -199,8 +199,10 @@ class MultiTargetHistBuilder { } } CHECK(root_sum.CContiguous()); - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(root_sum.Values().data()), - root_sum.Size() * 2); + auto rc = collective::GlobalSum( + ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(root_sum.Values().data()), root_sum.Size() * 2)); + collective::SafeColl(rc); histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_)); @@ -408,7 +410,9 @@ class HistUpdater { for (auto const &grad : gpair_h) { grad_stat.Add(grad.GetGrad(), grad.GetHess()); } - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&grad_stat), 2); + auto rc = collective::GlobalSum(ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(&grad_stat), 2)); + collective::SafeColl(rc); } auto weight = evaluator_->InitRoot(GradStats{grad_stat}); @@ -471,6 +475,7 @@ class QuantileHistMaker : public TreeUpdater { std::unique_ptr p_impl_{nullptr}; std::unique_ptr p_mtimpl_{nullptr}; std::shared_ptr column_sampler_; + common::Monitor monitor_; ObjInfo const *task_{nullptr}; HistMakerTrainParam hist_param_; From fe732944401ccb4244327b86ab134058c58a7b39 Mon Sep 17 00:00:00 2001 From: Ziyue Xu <71786575+ZiyueXu77@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:29:58 -0500 Subject: [PATCH 5/5] [secure boost] Vertical pipeline with hist sync (#10037) The first phase is to implement an alternative vertical pipeline that syncs the histograms from clients to the label owner. --- include/xgboost/data.h | 9 ++- src/common/quantile.cc | 39 +++++++++-- src/tree/hist/evaluate_splits.h | 94 +++++++++++++++------------ src/tree/hist/histogram.h | 22 ++++++- src/tree/updater_approx.cc | 2 +- src/tree/updater_quantile_hist.cc | 4 +- tests/cpp/tree/hist/test_histogram.cc | 59 +++++++++++------ 7 files changed, 153 insertions(+), 76 deletions(-) diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 08d3d119a8ff..c449164ca572 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -40,7 +40,7 @@ enum class DataType : uint8_t { enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 }; -enum class DataSplitMode : int { kRow = 0, kCol = 1 }; +enum class DataSplitMode : int { kRow = 0, kCol = 1, kColSecure = 2 }; /*! * \brief Meta information about dataset, always sit in memory. @@ -186,7 +186,12 @@ class MetaInfo { } /** @brief Whether the data is split column-wise. */ - bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; } + bool IsColumnSplit() const { return (data_split_mode == DataSplitMode::kCol) + || (data_split_mode == DataSplitMode::kColSecure); } + + /** @brief Whether the data is split column-wise with secure computation. */ + bool IsSecure() const { return data_split_mode == DataSplitMode::kColSecure; } + /** @brief Whether this is a learning to rank data. */ bool IsRanking() const { return !group_ptr_.empty(); } diff --git a/src/common/quantile.cc b/src/common/quantile.cc index e521fae69b1d..49a2594e4a52 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -362,14 +362,27 @@ void SketchContainerImpl::AllReduce( template void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_bin, - HistogramCuts *cuts) { + HistogramCuts *cuts, bool secure) { size_t required_cuts = std::min(summary.size, static_cast(max_bin)); + // make a copy of required_cuts for mode selection + size_t required_cuts_original = required_cuts; + if (secure) { + // Sync the required_cuts across all workers + collective::Allreduce(&required_cuts, 1); + } auto &cut_values = cuts->cut_values_.HostVector(); - // we use the min_value as the first (0th) element, hence starting from 1. - for (size_t i = 1; i < required_cuts; ++i) { - bst_float cpt = summary.data[i].value; - if (i == 1 || cpt > cut_values.back()) { - cut_values.push_back(cpt); + // if empty column, fill the cut values with 0 + if (secure && (required_cuts_original == 0)) { + for (size_t i = 1; i < required_cuts; ++i) { + cut_values.push_back(0.0); + } + } else { + // we use the min_value as the first (0th) element, hence starting from 1. + for (size_t i = 1; i < required_cuts; ++i) { + bst_float cpt = summary.data[i].value; + if (i == 1 || cpt > cut_values.back()) { + cut_values.push_back(cpt); + } } } } @@ -423,11 +436,16 @@ void SketchContainerImpl::MakeCuts(Context const *ctx, MetaInfo const float max_cat{-1.f}; for (size_t fid = 0; fid < reduced.size(); ++fid) { size_t max_num_bins = std::min(num_cuts[fid], max_bins_); + // If vertical and secure mode, we need to sync the max_num_bins aross workers + if (info.IsVerticalFederated() && info.IsSecure()) { + collective::Allreduce(&max_num_bins, 1); + } typename WQSketch::SummaryContainer const &a = final_summaries[fid]; if (IsCat(feature_types_, fid)) { max_cat = std::max(max_cat, AddCategories(categories_.at(fid), p_cuts)); } else { - AddCutPoint(a, max_num_bins, p_cuts); + // use special AddCutPoint scheme for secure vertical federated learning + AddCutPoint(a, max_num_bins, p_cuts, info.IsSecure()); // push a value that is greater than anything const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value : p_cuts->min_vals_.HostVector()[fid]; @@ -443,6 +461,13 @@ void SketchContainerImpl::MakeCuts(Context const *ctx, MetaInfo const p_cuts->cut_ptrs_.HostVector().push_back(cut_size); } + if (info.IsVerticalFederated() && info.IsSecure()) { + // cut values need to be synced across all workers via Allreduce + auto cut_val = p_cuts->cut_values_.HostVector().data(); + std::size_t n = p_cuts->cut_values_.HostVector().size(); + collective::Allreduce(cut_val, n); + } + p_cuts->SetCategorical(this->has_categorical_, max_cat); monitor_.Stop(__func__); } diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index d25a41cb0b3c..85cd5ab67176 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -82,6 +82,7 @@ class HistEvaluator { std::shared_ptr column_sampler_; TreeEvaluator tree_evaluator_; bool is_col_split_{false}; + bool is_secure_{false}; FeatureInteractionConstraintHost interaction_constraints_; std::vector snode_; @@ -321,7 +322,6 @@ class HistEvaluator { } } } - p_best->Update(best); return left_sum; } @@ -353,54 +353,63 @@ class HistEvaluator { auto evaluator = tree_evaluator_.GetEvaluator(); auto const &cut_ptrs = cut.Ptrs(); - common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) { - auto tidx = omp_get_thread_num(); - auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx]; - auto best = &entry->split; - auto nidx = entry->nid; - auto histogram = hist[nidx]; - auto features_set = features[nidx_in_set]->ConstHostSpan(); - for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) { - auto fidx = features_set[fidx_in_set]; - bool is_cat = common::IsCat(feature_types, fidx); - if (!interaction_constraints_.Query(nidx, fidx)) { - continue; - } - if (is_cat) { - auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx]; - if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) { - EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best); - } else { - std::vector sorted_idx(n_bins); - std::iota(sorted_idx.begin(), sorted_idx.end(), 0); - auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins); - // Sort the histogram to get contiguous partitions. - std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) { - auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) < - evaluator.CalcWeightCat(*param_, feat_hist[r]); - return ret; - }); - EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best); - EnumeratePart<-1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best); + // Under secure vertical setting, only the active party is able to evaluate the split + // based on global histogram. Other parties will receive the final best split information + // Hence the below computation is not performed by the passive parties + if ((!is_secure_) || (collective::GetRank() == 0)) { + // Evaluate the splits for each feature + common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) { + auto tidx = omp_get_thread_num(); + auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx]; + auto best = &entry->split; + auto nidx = entry->nid; + auto histogram = hist[nidx]; + auto features_set = features[nidx_in_set]->ConstHostSpan(); + for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) { + auto fidx = features_set[fidx_in_set]; + bool is_cat = common::IsCat(feature_types, fidx); + if (!interaction_constraints_.Query(nidx, fidx)) { + continue; } - } else { - auto grad_stats = EnumerateSplit<+1>(cut, histogram, fidx, nidx, evaluator, best); - if (SplitContainsMissingValues(grad_stats, snode_[nidx])) { - EnumerateSplit<-1>(cut, histogram, fidx, nidx, evaluator, best); + if (is_cat) { + auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx]; + if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) { + EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best); + } else { + std::vector sorted_idx(n_bins); + std::iota(sorted_idx.begin(), sorted_idx.end(), 0); + auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins); + // Sort the histogram to get contiguous partitions. + std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) { + auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) < + evaluator.CalcWeightCat(*param_, feat_hist[r]); + return ret; + }); + EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best); + EnumeratePart<-1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best); + } + } else { + auto grad_stats = EnumerateSplit<+1>(cut, histogram, fidx, nidx, evaluator, best); + if (SplitContainsMissingValues(grad_stats, snode_[nidx])) { + EnumerateSplit<-1>(cut, histogram, fidx, nidx, evaluator, best); + } } } - } - }); + }); - for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { - for (auto tidx = 0; tidx < n_threads; ++tidx) { - entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); + for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { + for (auto tidx = 0; tidx < n_threads; ++tidx) { + entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); + } } } if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. + // Note that under secure vertical setting, only the label owner is able to evaluate the split + // based on the global histogram. The other parties will receive the final best splits + // allgather is capable of performing this (0-gain entries for non-label owners), auto all_entries = AllgatherColumnSplit(entries); for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { @@ -480,7 +489,8 @@ class HistEvaluator { param_{param}, column_sampler_{std::move(sampler)}, tree_evaluator_{*param, static_cast(info.num_col_), DeviceOrd::CPU()}, - is_col_split_{info.IsColumnSplit()} { + is_col_split_{info.IsColumnSplit()}, + is_secure_{info.IsSecure()}{ interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), param_->colsample_bynode, param_->colsample_bylevel, @@ -496,6 +506,7 @@ class HistMultiEvaluator { std::shared_ptr column_sampler_; Context const *ctx_; bool is_col_split_{false}; + bool is_secure_{false}; private: static double MultiCalcSplitGain(TrainParam const ¶m, @@ -709,7 +720,8 @@ class HistMultiEvaluator { : param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx}, - is_col_split_{info.IsColumnSplit()} { + is_col_split_{info.IsColumnSplit()}, + is_secure_{info.IsSecure()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), param_->colsample_bynode, param_->colsample_bylevel, diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 033d2221ec89..d4cea58d0f72 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -50,6 +50,7 @@ class HistogramBuilder { // Whether XGBoost is running in distributed environment. bool is_distributed_{false}; bool is_col_split_{false}; + bool is_secure_{false}; public: /** @@ -60,13 +61,14 @@ class HistogramBuilder { * of using global rabit variable. */ void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed, - bool is_col_split, HistMakerTrainParam const *param) { + bool is_col_split, bool is_secure, HistMakerTrainParam const *param) { n_threads_ = ctx->Threads(); param_ = p; hist_.Reset(total_bins, param->max_cached_hist_node); buffer_.Init(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; + is_secure_ = is_secure; } template @@ -175,6 +177,7 @@ class HistogramBuilder { std::vector const &nodes_to_build, std::vector const &nodes_to_trick) { auto n_total_bins = buffer_.TotalBins(); + common::BlockedSpace2d space( nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024); common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) { @@ -190,6 +193,18 @@ class HistogramBuilder { reinterpret_cast(this->hist_[first_nidx].data()), n); } + if (is_distributed_ && is_col_split_ && is_secure_) { + // Under secure vertical mode, we perform allgather for all nodes + CHECK(!nodes_to_build.empty()); + // in theory the operation is AllGather, under current histogram setting of + // same length with 0s for empty slots, + // AllReduce is the most efficient way of achieving the global histogram + auto first_nidx = nodes_to_build.front(); + std::size_t n = n_total_bins * nodes_to_build.size() * 2; + collective::Allreduce( + reinterpret_cast(this->hist_[first_nidx].data()), n); + } + common::BlockedSpace2d const &subspace = nodes_to_trick.size() == nodes_to_build.size() ? space @@ -329,12 +344,13 @@ class MultiHistogramBuilder { [[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); } void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p, - bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) { + bool is_distributed, bool is_col_split, bool is_secure, + HistMakerTrainParam const *param) { ctx_ = ctx; target_builders_.resize(n_targets); CHECK_GE(n_targets, 1); for (auto &v : target_builders_) { - v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param); + v.Reset(ctx, total_bins, p, is_distributed, is_col_split, is_secure, param); } } }; diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 68317fc41a85..c1e3cc3d7c14 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -93,7 +93,7 @@ class GloablApproxBuilder { histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess), collective::IsDistributed(), p_fmat->Info().IsColumnSplit(), - hist_param_); + p_fmat->Info().IsSecure(), hist_param_); monitor_->Stop(__func__); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index ced277773055..b042f1631f2e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -167,7 +167,7 @@ class MultiTargetHistBuilder { histogram_builder_ = std::make_unique(); histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_), collective::IsDistributed(), p_fmat->Info().IsColumnSplit(), - hist_param_); + p_fmat->Info().IsSecure(), hist_param_); evaluator_ = std::make_unique(ctx_, p_fmat->Info(), param_, col_sampler_); p_last_tree_ = p_tree; @@ -357,7 +357,7 @@ class HistUpdater { fmat->Info().IsColumnSplit()); } histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(), - fmat->Info().IsColumnSplit(), hist_param_); + fmat->Info().IsColumnSplit(), fmat->Info().IsSecure(), hist_param_); evaluator_ = std::make_unique(ctx_, this->param_, fmat->Info(), col_sampler_); p_last_tree_ = p_tree; monitor_->Stop(__func__); diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 25a800367c49..76428d1d83b4 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -68,8 +68,8 @@ void TestAddHistRows(bool is_distributed) { HistMakerTrainParam hist_param; HistogramBuilder histogram_builder; - histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false, - &hist_param); + histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, + false, false, &hist_param); histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false); for (bst_node_t const &nidx : nodes_to_build) { @@ -102,7 +102,7 @@ void TestSyncHist(bool is_distributed) { HistogramBuilder histogram; uint32_t total_bins = gmat.cut.Ptrs().back(); HistMakerTrainParam hist_param; - histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param); + histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, false, &hist_param); common::RowSetCollection row_set_collection; { @@ -222,13 +222,13 @@ TEST(CPUHistogram, SyncHist) { TestSyncHist(false); } -void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) { +void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split, bool is_secure) { size_t constexpr kNRows = 8, kNCols = 16; int32_t constexpr kMaxBins = 4; Context ctx; auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix(); - if (is_col_split) { + if (is_col_split && !is_secure) { p_fmat = std::shared_ptr{ p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())}; } @@ -244,7 +244,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ bst_node_t nid = 0; HistogramBuilder histogram; HistMakerTrainParam hist_param; - histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param); + histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, is_secure, &hist_param); RegTree tree; @@ -286,22 +286,41 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ // Now validate the computed histogram returned by BuildHist for (size_t i = 0; i < histogram.Histogram()[nid].size(); ++i) { GradientPairPrecise sol = histogram_expected[i]; - ASSERT_NEAR(sol.GetGrad(), histogram.Histogram()[nid][i].GetGrad(), kEps); - ASSERT_NEAR(sol.GetHess(), histogram.Histogram()[nid][i].GetHess(), kEps); + double grad = sol.GetGrad(); + double hess = sol.GetHess(); + if (is_distributed && (!is_col_split || (is_secure && is_col_split))) { + // the solution also needs to be allreduce + collective::Allreduce(&grad, 1); + collective::Allreduce(&hess, 1); + } + ASSERT_NEAR(grad, histogram.Histogram()[nid][i].GetGrad(), kEps); + ASSERT_NEAR(hess, histogram.Histogram()[nid][i].GetHess(), kEps); } } TEST(CPUHistogram, BuildHist) { - TestBuildHistogram(true, false, false); - TestBuildHistogram(false, false, false); - TestBuildHistogram(true, true, false); - TestBuildHistogram(false, true, false); + TestBuildHistogram(true, false, false, false); + TestBuildHistogram(false, false, false, false); + TestBuildHistogram(true, true, false, false); + TestBuildHistogram(false, true, false, false); +} + +TEST(CPUHistogram, BuildHistDist) { + auto constexpr kWorkers = 4; + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, false, false); + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, false, false); +} + +TEST(CPUHistogram, BuildHistDistColSplit) { + auto constexpr kWorkers = 4; + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, false); + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, false); } -TEST(CPUHistogram, BuildHistColSplit) { +TEST(CPUHistogram, BuildHistDistColSplitSecure) { auto constexpr kWorkers = 4; - RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true); - RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true); + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, true); + RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, true); } namespace { @@ -360,7 +379,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { HistogramBuilder cat_hist; for (auto const &gidx : cat_m->GetBatches(&ctx, {kBins, 0.5})) { auto total_bins = gidx.cut.TotalBins(); - cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param); + cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param); cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), @@ -376,7 +395,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { HistogramBuilder onehot_hist; for (auto const &gidx : encode_m->GetBatches(&ctx, {kBins, 0.5})) { auto total_bins = gidx.cut.TotalBins(); - onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param); + onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param); onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), @@ -442,7 +461,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo } ASSERT_EQ(n_samples, m->Info().num_row_); - multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param); + multi_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param); multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false); std::size_t page_idx{0}; for (auto const &page : m->GetBatches(ctx, batch_param)) { @@ -465,7 +484,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo common::RowSetCollection row_set_collection; InitRowPartitionForTest(&row_set_collection, n_samples); - single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param); + single_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param); SparsePage concat; std::vector hess(m->Info().num_row_, 1.0f); for (auto const &page : m->GetBatches()) { @@ -542,7 +561,7 @@ class OverflowTest : public ::testing::TestWithParam> { CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split); hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed, - Xy->Info().IsColumnSplit(), &hist_param); + Xy->Info().IsColumnSplit(), Xy->Info().IsSecure(), &hist_param); std::vector partitioners; partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0,