From f7005d32c182e1c8751b2549cde9d16f327486a1 Mon Sep 17 00:00:00 2001
From: david-cortes <david.cortes.rivera@gmail.com>
Date: Fri, 23 Feb 2024 19:03:54 +0100
Subject: [PATCH 1/5] [R] Use inplace predict (#9829)

---------

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
---
 R-package/R/xgb.Booster.R               | 137 +++++++++++++++---
 R-package/man/predict.xgb.Booster.Rd    |  45 ++++--
 R-package/src/init.c                    |   6 +
 R-package/src/xgboost_R.cc              | 184 +++++++++++++++++++++---
 R-package/src/xgboost_R.h               |  44 ++++++
 R-package/tests/testthat/test_basic.R   |  49 ++++++-
 R-package/tests/testthat/test_dmatrix.R |  31 ++++
 7 files changed, 450 insertions(+), 46 deletions(-)

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index febefb757129..8a5d66198834 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -77,26 +77,45 @@ xgb.get.handle <- function(object) {
 
 #' Predict method for XGBoost model
 #'
-#' Predicted values based on either xgboost model or model handle object.
+#' Predict values on data based on xgboost model.
 #'
 #' @param object Object of class `xgb.Booster`.
-#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
+#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
 #'        local data file, or `xgb.DMatrix`.
-#'        For single-row predictions on sparse data, it is recommended to use the CSR format.
-#'        If passing a sparse vector, it will take it as a row vector.
-#' @param missing Only used when input is a dense matrix. Pick a float value that represents
-#'        missing values in data (e.g., 0 or some other extreme value).
+#'
+#'        For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+#'        a sparse vector, it will take it as a row vector.
+#'
+#'        Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+#'        pass here instead of passing R types like matrices or data frames, as predictions will be
+#'        faster on DMatrix.
+#'
+#'        If `newdata` is a `data.frame`, be aware that:\itemize{
+#'        \item Columns will be converted to numeric if they aren't already, which could potentially make
+#'              the operation slower than in an equivalent `matrix` object.
+#'        \item The order of the columns must match with that of the data from which the model was fitted
+#'              (i.e. columns will not be referenced by their names, just by their order in the data).
+#'        \item If the model was fitted to data with categorical columns, these columns must be of
+#'              `factor` type here, and must use the same encoding (i.e. have the same levels).
+#'        \item If `newdata` contains any `factor` columns, they will be converted to base-0
+#'              encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+#'              under a column which during training had a different type.
+#'        }
+#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value).
+#'
+#'        This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass
+#'        this as an argument to the DMatrix constructor instead.
 #' @param outputmargin Whether the prediction should be returned in the form of original untransformed
 #'        sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for
 #'        logistic regression would return log-odds instead of probabilities.
-#' @param predleaf Whether to predict pre-tree leaf indices.
+#' @param predleaf Whether to predict per-tree leaf indices.
 #' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
 #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape Whether to reshape the vector of predictions to matrix form when there are several
 #'        prediction outputs per case. No effect if `predleaf`, `predcontrib`,
 #'        or `predinteraction` is `TRUE`.
-#' @param training Whether the predictions are used for training. For dart booster,
+#' @param training Whether the prediction result is used for training. For dart booster,
 #'        training predicting will perform dropout.
 #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
 #'        a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
@@ -111,6 +130,12 @@ xgb.get.handle <- function(object) {
 #'        If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
 #' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
 #'        type and shape of predictions are invariant to the model type.
+#' @param base_margin Base margin used for boosting from existing model.
+#'
+#'        Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
+#'        be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
+#'        an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+#'
 #' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
 #'        match (only applicable when both `object` and `newdata` have feature names).
 #'
@@ -287,16 +312,80 @@ xgb.get.handle <- function(object) {
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
                                 reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
-                                validate_features = FALSE, ...) {
+                                validate_features = FALSE, base_margin = NULL, ...) {
   if (validate_features) {
     newdata <- validate.features(object, newdata)
   }
-  if (!inherits(newdata, "xgb.DMatrix")) {
+  is_dmatrix <- inherits(newdata, "xgb.DMatrix")
+  if (is_dmatrix && !is.null(base_margin)) {
+    stop(
+      "'base_margin' is not supported when passing 'xgb.DMatrix' as input.",
+      " Should be passed as argument to 'xgb.DMatrix' constructor."
+    )
+  }
+
+  use_as_df <- FALSE
+  use_as_dense_matrix <- FALSE
+  use_as_csr_matrix <- FALSE
+  n_row <- NULL
+  if (!is_dmatrix) {
+
+    inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf
+    if (inplace_predict_supported) {
+      booster_type <- xgb.booster_type(object)
+      if (booster_type == "gblinear" || (booster_type == "dart" && training)) {
+        inplace_predict_supported <- FALSE
+      }
+    }
+    if (inplace_predict_supported) {
+
+      if (is.matrix(newdata)) {
+        use_as_dense_matrix <- TRUE
+      } else if (is.data.frame(newdata)) {
+        # note: since here it turns it into a non-data-frame list,
+        # needs to keep track of the number of rows it had for later
+        n_row <- nrow(newdata)
+        newdata <- lapply(
+          newdata,
+          function(x) {
+            if (is.factor(x)) {
+              return(as.numeric(x) - 1)
+            } else {
+              return(as.numeric(x))
+            }
+          }
+        )
+        use_as_df <- TRUE
+      } else if (inherits(newdata, "dgRMatrix")) {
+        use_as_csr_matrix <- TRUE
+        csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata))
+      } else if (inherits(newdata, "dsparseVector")) {
+        use_as_csr_matrix <- TRUE
+        n_row <- 1L
+        i <- newdata@i - 1L
+        if (storage.mode(i) != "integer") {
+          storage.mode(i) <- "integer"
+        }
+        csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata))
+      }
+
+    }
+
+  } # if (!is_dmatrix)
+
+  if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) {
     nthread <- xgb.nthread(object)
     newdata <- xgb.DMatrix(
       newdata,
-      missing = missing, nthread = NVL(nthread, -1)
+      missing = missing,
+      base_margin = base_margin,
+      nthread = NVL(nthread, -1)
     )
+    is_dmatrix <- TRUE
+  }
+
+  if (is.null(n_row)) {
+    n_row <- nrow(newdata)
   }
 
 
@@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
     args$type <- set_type(6)
   }
 
-  predts <- .Call(
-    XGBoosterPredictFromDMatrix_R,
-    xgb.get.handle(object),
-    newdata,
-    jsonlite::toJSON(args, auto_unbox = TRUE)
-  )
+  json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE)
+  if (is_dmatrix) {
+    predts <- .Call(
+      XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf
+    )
+  } else if (use_as_dense_matrix) {
+    predts <- .Call(
+      XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+    )
+  } else if (use_as_csr_matrix) {
+    predts <- .Call(
+      XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin
+    )
+  } else if (use_as_df) {
+    predts <- .Call(
+      XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+    )
+  }
+
   names(predts) <- c("shape", "results")
   shape <- predts$shape
   arr <- predts$results
 
   n_ret <- length(arr)
-  n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
   }
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 95e7a51fd043..88a2f203efcd 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -18,25 +18,47 @@
   iterationrange = NULL,
   strict_shape = FALSE,
   validate_features = FALSE,
+  base_margin = NULL,
   ...
 )
 }
 \arguments{
 \item{object}{Object of class \code{xgb.Booster}.}
 
-\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
+\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
 local data file, or \code{xgb.DMatrix}.
-For single-row predictions on sparse data, it is recommended to use the CSR format.
-If passing a sparse vector, it will take it as a row vector.}
 
-\item{missing}{Only used when input is a dense matrix. Pick a float value that represents
-missing values in data (e.g., 0 or some other extreme value).}
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+   a sparse vector, it will take it as a row vector.
+
+   Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+   pass here instead of passing R types like matrices or data frames, as predictions will be
+   faster on DMatrix.
+
+   If `newdata` is a `data.frame`, be aware that:\\itemize\{
+   \\item Columns will be converted to numeric if they aren't already, which could potentially make
+         the operation slower than in an equivalent `matrix` object.
+   \\item The order of the columns must match with that of the data from which the model was fitted
+         (i.e. columns will not be referenced by their names, just by their order in the data).
+   \\item If the model was fitted to data with categorical columns, these columns must be of
+         `factor` type here, and must use the same encoding (i.e. have the same levels).
+   \\item If `newdata` contains any `factor` columns, they will be converted to base-0
+         encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+         under a column which during training had a different type.
+   \}
+}\if{html}{\out{</div>}}}
+
+\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value).
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass
+   this as an argument to the DMatrix constructor instead.
+}\if{html}{\out{</div>}}}
 
 \item{outputmargin}{Whether the prediction should be returned in the form of original untransformed
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would return log-odds instead of probabilities.}
 
-\item{predleaf}{Whether to predict pre-tree leaf indices.}
+\item{predleaf}{Whether to predict per-tree leaf indices.}
 
 \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}
 
@@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.}
 prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib},
 or \code{predinteraction} is \code{TRUE}.}
 
-\item{training}{Whether the predictions are used for training. For dart booster,
+\item{training}{Whether the prediction result is used for training. For dart booster,
 training predicting will perform dropout.}
 
 \item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
@@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n
    recommended to disable it for performance-sensitive applications.
 }\if{html}{\out{</div>}}}
 
+\item{base_margin}{Base margin used for boosting from existing model.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
+   be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
+   an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+}\if{html}{\out{</div>}}}
+
 \item{...}{Not used.}
 }
 \value{
@@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array:
 }
 }
 \description{
-Predicted values based on either xgboost model or model handle object.
+Predict values on data based on xgboost model.
 }
 \details{
 Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
diff --git a/R-package/src/init.c b/R-package/src/init.c
index 36f3e8953639..f2635742ebd7 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value);
 extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle);
 extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw);
 extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
 extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
@@ -96,6 +99,9 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterSerializeToBuffer_R",     (DL_FUNC) &XGBoosterSerializeToBuffer_R,     1},
   {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2},
   {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3},
+  {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5},
+  {"XGBoosterPredictFromCSR_R",   (DL_FUNC) &XGBoosterPredictFromCSR_R,   5},
+  {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5},
   {"XGBoosterSaveModel_R",        (DL_FUNC) &XGBoosterSaveModel_R,        2},
   {"XGBoosterSetAttr_R",          (DL_FUNC) &XGBoosterSetAttr_R,          3},
   {"XGBoosterSetParam_R",         (DL_FUNC) &XGBoosterSetParam_R,         3},
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 4192f82fbaa6..5baf8d41282e 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
+#include <memory>
 #include <limits>
 #include <sstream>
 #include <string>
@@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
   return xgboost::Json::Dump(jinterface);
 }
 
-[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
-  using namespace ::xgboost;  // NOLINT
-  Json jconfig{Object{}};
-
-  const SEXPTYPE missing_type = TYPEOF(missing);
-  if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
-      (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
-      (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
+void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) {
+  if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) {
     // missing is not specified
     if (arr_type == REALSXP) {
-      jconfig["missing"] = std::numeric_limits<double>::quiet_NaN();
+      (*jconfig)["missing"] = std::numeric_limits<double>::quiet_NaN();
     } else {
-      jconfig["missing"] = R_NaInt;
+      (*jconfig)["missing"] = R_NaInt;
     }
   } else {
     // missing specified
-    jconfig["missing"] = Rf_asReal(missing);
+    (*jconfig)["missing"] = Rf_asReal(missing);
   }
+}
 
+[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
+  using namespace ::xgboost;  // NOLINT
+  Json jconfig{Object{}};
+  AddMissingToJson(&jconfig, missing, arr_type);
   jconfig["nthread"] = Rf_asInteger(n_threads);
   return Json::Dump(jconfig);
 }
@@ -411,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
   DMatrixHandle handle;
   std::int32_t rc{0};
   {
-    std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
+    const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
     xgboost::Json jconfig{xgboost::Object{}};
     jconfig["missing"] = asReal(missing);
     jconfig["nthread"] = asInteger(n_threads);
@@ -463,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP
     Json jconfig{Object{}};
     // Construct configuration
     jconfig["nthread"] = Integer{threads};
-    jconfig["missing"] = xgboost::Number{asReal(missing)};
+    AddMissingToJson(&jconfig, missing, TYPEOF(data));
     std::string config;
     Json::Dump(jconfig, &config);
     res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
@@ -498,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
     Json jconfig{Object{}};
     // Construct configuration
     jconfig["nthread"] = Integer{threads};
-    jconfig["missing"] = xgboost::Number{asReal(missing)};
+    AddMissingToJson(&jconfig, missing, TYPEOF(data));
     std::string config;
     Json::Dump(jconfig, &config);
     res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
@@ -1247,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
   return mkString(ret);
 }
 
-XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config)  {
+namespace {
+
+struct ProxyDmatrixError : public std::exception {};
+
+struct ProxyDmatrixWrapper {
+  DMatrixHandle proxy_dmat_handle;
+
+  ProxyDmatrixWrapper() {
+    int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle);
+    if (res_code != 0) {
+      throw ProxyDmatrixError();
+    }
+  }
+
+  ~ProxyDmatrixWrapper() {
+    if (this->proxy_dmat_handle) {
+      XGDMatrixFree(this->proxy_dmat_handle);
+      this->proxy_dmat_handle = nullptr;
+    }
+  }
+
+  DMatrixHandle get_handle() {
+    return this->proxy_dmat_handle;
+  }
+};
+
+std::unique_ptr<ProxyDmatrixWrapper> GetProxyDMatrixWithBaseMargin(SEXP base_margin) {
+  if (Rf_isNull(base_margin)) {
+    return std::unique_ptr<ProxyDmatrixWrapper>(nullptr);
+  }
+
+  SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol);
+  int res_code;
+  try {
+    const std::string array_str = Rf_isNull(base_margin_dim)?
+      MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin);
+    std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat(new ProxyDmatrixWrapper());
+    res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(),
+                                             "base_margin",
+                                             array_str.c_str());
+    if (res_code != 0) {
+      throw ProxyDmatrixError();
+    }
+    return proxy_dmat;
+  } catch(ProxyDmatrixError &err) {
+    Rf_error("%s", XGBGetLastError());
+  }
+}
+
+enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame};
+
+SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config,
+                                    PredictionInputType input_type, SEXP missing,
+                                    SEXP base_margin) {
   SEXP r_out_shape;
   SEXP r_out_result;
   SEXP r_out = PROTECT(allocVector(VECSXP, 2));
@@ -1259,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
   bst_ulong out_dim;
   bst_ulong const *out_shape;
   float const *out_result;
-  CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
-                                         R_ExternalPtrAddr(dmat), c_json_config,
-                                         &out_shape, &out_dim, &out_result));
+
+  int res_code;
+  {
+    switch (input_type) {
+      case PredictionInputType::DMatrix: {
+        res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
+                                               R_ExternalPtrAddr(input_data), c_json_config,
+                                               &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::CSRMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
+        SEXP indptr = VECTOR_ELT(input_data, 0);
+        SEXP indices = VECTOR_ELT(input_data, 1);
+        SEXP data = VECTOR_ELT(input_data, 2);
+        const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3));
+        const SEXPTYPE type_data = TYPEOF(data);
+        CHECK_EQ(type_data, REALSXP);
+        std::string sindptr, sindices, sdata;
+        CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, type_data);
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromCSR(
+          R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(),
+          ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::DenseMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+        const std::string array_str = MakeArrayInterfaceFromRMat(input_data);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, TYPEOF(input_data));
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromDense(
+          R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(),
+          proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::DataFrame: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
+        const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, REALSXP);
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromColumnar(
+          R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(),
+          proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+    }
+  }
+  CHECK_CALL(res_code);
 
   r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
   size_t len = 1;
@@ -1282,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
   return r_out;
 }
 
+}  // namespace
+
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config)  {
+  return XGBoosterPredictGeneric(handle, dmat, json_config,
+                                 PredictionInputType::DMatrix, R_NilValue, R_NilValue);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+                                         SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, R_mat, json_config,
+                                 PredictionInputType::DenseMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+                                       SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, lst, json_config,
+                                 PredictionInputType::CSRMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+                                            SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, R_df, json_config,
+                                 PredictionInputType::DataFrame, missing, base_margin);
+}
+
 XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 652345e52b64..70fd885e7f12 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -371,6 +371,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
  * \return A list containing 2 vectors, first one for shape while second one for prediction result.
  */
 XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
+
+/*!
+ * \brief Run prediction on R dense matrix
+ * \param handle handle
+ * \param R_mat R matrix
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+                                         SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R CSR matrix
+ * \param handle handle
+ * \param lst An R list, containing, in this order:
+ *              (a) 'p' array (a.k.a. indptr)
+ *              (b) 'j' array (a.k.a. indices)
+ *              (c) 'x' array (a.k.a. data / values)
+ *              (d) number of columns
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+                                       SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R data.frame
+ * \param handle handle
+ * \param R_df R data.frame
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+                                            SEXP json_config, SEXP base_margin);
+
 /*!
  * \brief load model from existing file
  * \param handle handle
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index fb3162e423ce..5438c8bb2235 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -139,8 +139,8 @@ test_that("dart prediction works", {
   pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds))
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
 
-  expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
-  expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
+  expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6)
+  expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6)
   expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
 })
 
@@ -651,6 +651,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", {
   expect_equal(pred_qid, pred_gr)
 })
 
+test_that("Can predict on data.frame objects", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x_df <- mtcars[, -1]
+  x_mat <- as.matrix(x_df)
+  dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads)
+  model <- xgb.train(
+    params = list(
+      tree_method = "hist",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    data = dm,
+    nrounds = 5
+  )
+
+  pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads)
+  pred_df <- predict(model, x_df, nthread = n_threads)
+  expect_equal(pred_mat, pred_df)
+})
+
+test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1])
+  dm <- xgb.DMatrix(x, label = y, nthread = n_threads)
+  model <- xgb.train(
+    params = list(
+      tree_method = "hist",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    data = dm,
+    nrounds = 5
+  )
+
+  set.seed(123)
+  base_margin <- rnorm(nrow(x))
+  dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin)
+  pred_from_dm <- predict(model, dm_w_base)
+  pred_from_mat <- predict(model, x, base_margin = base_margin)
+
+  expect_equal(pred_from_dm, pred_from_mat)
+})
+
 test_that("Coefficients from gblinear have the expected shape and names", {
   # Single-column coefficients
   data(mtcars)
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 45bcac08d479..0612406444ae 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", {
   file.remove(fname_nan)
 })
 
+test_that("xgb.DMatrix: missing in CSR", {
+  x_dense <- matrix(as.numeric(1:10), nrow = 5)
+  x_dense[2, 1] <- NA_real_
+
+  x_csr <- as(x_dense, "RsparseMatrix")
+
+  m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_)
+  xgb.DMatrix.save(m_dense, "dense.dmatrix")
+
+  m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA)
+  xgb.DMatrix.save(m_csr, "csr.dmatrix")
+
+  denseconn <- file("dense.dmatrix", "rb")
+  csrconn <- file("csr.dmatrix", "rb")
+
+  expect_equal(file.size("dense.dmatrix"), file.size("csr.dmatrix"))
+
+  bytes <- file.size("dense.dmatrix")
+  densedmatrix <- readBin(denseconn, "raw", n = bytes)
+  csrmatrix <- readBin(csrconn, "raw", n = bytes)
+
+  expect_equal(length(densedmatrix), length(csrmatrix))
+  expect_equal(densedmatrix, csrmatrix)
+
+  close(denseconn)
+  close(csrconn)
+
+  file.remove("dense.dmatrix")
+  file.remove("csr.dmatrix")
+})
+
 test_that("xgb.DMatrix: error on three-dimensional array", {
   set.seed(123)
   x <- matrix(rnorm(500), nrow = 50)

From 0ce4372bd49ffd79bbbafef6438af62d3cf8a342 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sun, 25 Feb 2024 00:18:23 +0800
Subject: [PATCH 2/5] Use UBJSON for serializing splits for vertical data
 split. (#10059)

---
 .github/workflows/main.yml                    |   2 +-
 .github/workflows/python_tests.yml            |   2 +-
 .github/workflows/python_wheels.yml           |   2 +-
 .github/workflows/r_tests.yml                 |   2 +-
 R-package/src/Makevars.in                     |   1 +
 R-package/src/Makevars.win                    |   1 +
 src/collective/communicator-inl.cc            |  34 ++++
 src/collective/communicator-inl.h             |  47 ++---
 src/tree/hist/evaluate_splits.h               | 164 +++++++-----------
 src/tree/hist/expand_entry.h                  |  15 +-
 src/tree/updater_quantile_hist.cc             |   5 +-
 .../cpp/collective/test_rabit_communicator.cc |  45 ++++-
 tests/cpp/common/test_json.cc                 |   2 +-
 tests/cpp/tree/test_quantile_hist.cc          |   5 +-
 14 files changed, 162 insertions(+), 165 deletions(-)
 create mode 100644 src/collective/communicator-inl.cc

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 20e91a5d93f6..133e151e5e4f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -174,7 +174,7 @@ jobs:
     - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
       with:
         submodules: 'true'
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 0fca76673962..3fbcc7a01acf 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -310,7 +310,7 @@ jobs:
           submodules: 'true'
 
       - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
         with:
           python-version: 3.8
 
diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
index f46b772950c9..12ae8a244e4f 100644
--- a/.github/workflows/python_wheels.yml
+++ b/.github/workflows/python_wheels.yml
@@ -21,7 +21,7 @@ jobs:
       with:
         submodules: 'true'
     - name: Setup Python
-      uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
     - name: Build wheels
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index d004ab15ca15..ad6853281c1f 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -74,7 +74,7 @@ jobs:
         key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
         restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
 
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index dd13983f5b59..0f4b3ac6f6a7 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 46a862711dc6..0c2084de940c 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
diff --git a/src/collective/communicator-inl.cc b/src/collective/communicator-inl.cc
new file mode 100644
index 000000000000..4164855f1cef
--- /dev/null
+++ b/src/collective/communicator-inl.cc
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#include "communicator-inl.h"
+
+namespace xgboost::collective {
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input) {
+  auto n_inputs = input.size();
+  std::vector<std::int64_t> sizes(n_inputs);
+  std::transform(input.cbegin(), input.cend(), sizes.begin(),
+                 [](auto const &vec) { return vec.size(); });
+
+  std::vector<std::int64_t> global_sizes = AllgatherV(sizes);
+  std::vector<std::int64_t> offset(global_sizes.size() + 1);
+  offset[0] = 0;
+  for (std::size_t i = 1; i < offset.size(); i++) {
+    offset[i] = offset[i - 1] + global_sizes[i - 1];
+  }
+
+  std::vector<char> collected;
+  for (auto const &vec : input) {
+    collected.insert(collected.end(), vec.cbegin(), vec.cend());
+  }
+  auto out = AllgatherV(collected);
+
+  std::vector<std::vector<char>> result;
+  for (std::size_t i = 1; i < offset.size(); ++i) {
+    std::vector<char> local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]);
+    result.emplace_back(std::move(local));
+  }
+  return result;
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index 34212def2377..991e19f2c65a 100644
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost contributors
+ * Copyright 2022-2024, XGBoost contributors
  */
 #pragma once
 #include <string>
@@ -192,6 +192,18 @@ inline std::vector<T> AllgatherV(std::vector<T> const &input) {
   return result;
 }
 
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ *
+ * @param inputs All the inputs from the local worker. The number of inputs can vary
+ *               across different workers. Along with which, the size of each vector in
+ *               the input can also vary.
+ *
+ * @return The AllgatherV result, containing vectors from all workers.
+ */
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input);
+
 /**
  * @brief Gathers variable-length strings from all processes and distributes them to all processes.
  * @param input Variable-length list of variable-length strings.
@@ -294,38 +306,5 @@ template <Operation op>
 inline void Allreduce(double *send_receive_buffer, size_t count) {
   Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }
-
-template <typename T>
-struct SpecialAllgatherVResult {
-  std::vector<std::size_t> offsets;
-  std::vector<std::size_t> sizes;
-  std::vector<T> result;
-};
-
-/**
- * @brief Gathers variable-length data from all processes and distributes it to all processes.
- *
- * We assume each worker has the same number of inputs, but each input may be of a different size.
- *
- * @param inputs All the inputs from the local worker.
- * @param sizes  Sizes of each input.
- */
-template <typename T>
-inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
-                                                    std::vector<std::size_t> const &sizes) {
-  // Gather the sizes across all workers.
-  auto const all_sizes = Allgather(sizes);
-
-  // Calculate input offsets (std::exclusive_scan).
-  std::vector<std::size_t> offsets(all_sizes.size());
-  for (std::size_t i = 1; i < offsets.size(); i++) {
-    offsets[i] = offsets[i - 1] + all_sizes[i - 1];
-  }
-
-  // Gather all the inputs.
-  auto const all_inputs = AllgatherV(inputs);
-
-  return {offsets, all_sizes, all_inputs};
-}
 }  // namespace collective
 }  // namespace xgboost
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index bc534d351f17..d25a41cb0b3c 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
@@ -26,6 +26,47 @@
 #include "xgboost/linalg.h"            // for Constants, Vector
 
 namespace xgboost::tree {
+/**
+ * @brief Gather the expand entries from all the workers.
+ * @param entries Local expand entries on this worker.
+ * @return Global expand entries gathered from all workers.
+ */
+template <typename ExpandEntry>
+std::enable_if_t<std::is_same_v<ExpandEntry, CPUExpandEntry> ||
+                     std::is_same_v<ExpandEntry, MultiExpandEntry>,
+                 std::vector<ExpandEntry>>
+AllgatherColumnSplit(std::vector<ExpandEntry> const &entries) {
+  auto const n_entries = entries.size();
+
+  // First, gather all the primitive fields.
+  std::vector<ExpandEntry> local_entries(n_entries);
+
+  // Collect and serialize all entries
+  std::vector<std::vector<char>> serialized_entries;
+  for (std::size_t i = 0; i < n_entries; ++i) {
+    Json jentry{Object{}};
+    entries[i].Save(&jentry);
+
+    std::vector<char> out;
+    Json::Dump(jentry, &out, std::ios::binary);
+
+    serialized_entries.emplace_back(std::move(out));
+  }
+  auto all_serialized = collective::VectorAllgatherV(serialized_entries);
+  CHECK_GE(all_serialized.size(), local_entries.size());
+
+  std::vector<ExpandEntry> all_entries(all_serialized.size());
+  std::transform(all_serialized.cbegin(), all_serialized.cend(), all_entries.begin(),
+                 [](std::vector<char> const &e) {
+                   ExpandEntry entry;
+                   auto je = Json::Load(StringView{e.data(), e.size()}, std::ios::binary);
+                   entry.Load(je);
+                   return entry;
+                 });
+
+  return all_entries;
+}
+
 class HistEvaluator {
  private:
   struct NodeEntry {
@@ -36,8 +77,8 @@ class HistEvaluator {
   };
 
  private:
-  Context const* ctx_;
-  TrainParam const* param_;
+  Context const *ctx_;
+  TrainParam const *param_;
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   TreeEvaluator tree_evaluator_;
   bool is_col_split_{false};
@@ -202,7 +243,7 @@ class HistEvaluator {
       common::CatBitField cat_bits{best.cat_bits};
       bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin);
       CHECK_GT(partition, 0);
-      std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) {
+      std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](std::size_t c) {
         auto cat = cut_val[c + f_begin];
         cat_bits.Set(cat);
       });
@@ -285,57 +326,23 @@ class HistEvaluator {
     return left_sum;
   }
 
-  /**
-   * @brief Gather the expand entries from all the workers.
-   * @param entries Local expand entries on this worker.
-   * @return Global expand entries gathered from all workers.
-   */
-  std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
-    auto const world = collective::GetWorldSize();
-    auto const num_entries = entries.size();
-
-    // First, gather all the primitive fields.
-    std::vector<CPUExpandEntry> local_entries(num_entries);
-    std::vector<uint32_t> cat_bits;
-    std::vector<std::size_t> cat_bits_sizes;
-    for (std::size_t i = 0; i < num_entries; i++) {
-      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
-    }
-    auto all_entries = collective::Allgather(local_entries);
-
-    // Gather all the cat_bits.
-    auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
-
-    common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
-      // Copy the cat_bits back into all expand entries.
-      all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
-      std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
-                  all_entries[i].split.cat_bits.begin());
-    });
-
-    return all_entries;
-  }
-
  public:
   void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
                       common::Span<FeatureType const> feature_types, const RegTree &tree,
                       std::vector<CPUExpandEntry> *p_entries) {
     auto n_threads = ctx_->Threads();
-    auto& entries = *p_entries;
+    auto &entries = *p_entries;
     // All nodes are on the same level, so we can store the shared ptr.
-    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
-        entries.size());
+    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
     for (size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
       auto nidx = entries[nidx_in_set].nid;
-      features[nidx_in_set] =
-          column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
+      features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
     }
     CHECK(!features.empty());
-    const size_t grain_size =
-        std::max<size_t>(1, features.front()->Size() / n_threads);
-    common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
-      return features[nidx_in_set]->Size();
-    }, grain_size);
+    const size_t grain_size = std::max<size_t>(1, features.front()->Size() / n_threads);
+    common::BlockedSpace2d space(
+        entries.size(), [&](size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
+        grain_size);
 
     std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
     for (size_t i = 0; i < entries.size(); ++i) {
@@ -344,7 +351,7 @@ class HistEvaluator {
       }
     }
     auto evaluator = tree_evaluator_.GetEvaluator();
-    auto const& cut_ptrs = cut.Ptrs();
+    auto const &cut_ptrs = cut.Ptrs();
 
     common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
       auto tidx = omp_get_thread_num();
@@ -385,18 +392,16 @@ class HistEvaluator {
       }
     });
 
-    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
-         ++nidx_in_set) {
+    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
       for (auto tidx = 0; tidx < n_threads; ++tidx) {
-        entries[nidx_in_set].split.Update(
-            tloc_candidates[n_threads * nidx_in_set + tidx].split);
+        entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
       }
     }
 
     if (is_col_split_) {
       // With column-wise data split, we gather the best splits from all the workers and update the
       // expand entries accordingly.
-      auto all_entries = Allgather(entries);
+      auto all_entries = AllgatherColumnSplit(entries);
       for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
         for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
           entries[nidx_in_set].split.Update(
@@ -407,7 +412,7 @@ class HistEvaluator {
   }
 
   // Add splits to tree, handles all statistic
-  void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
+  void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
     auto evaluator = tree_evaluator_.GetEvaluator();
     RegTree &tree = *p_tree;
 
@@ -437,8 +442,7 @@ class HistEvaluator {
     auto left_child = tree[candidate.nid].LeftChild();
     auto right_child = tree[candidate.nid].RightChild();
     tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
-                             tree[candidate.nid].SplitIndex(), left_weight,
-                             right_weight);
+                             tree[candidate.nid].SplitIndex(), left_weight, right_weight);
     evaluator = tree_evaluator_.GetEvaluator();
 
     snode_.resize(tree.GetNodes().size());
@@ -449,8 +453,7 @@ class HistEvaluator {
     snode_.at(right_child).root_gain =
         evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
 
-    interaction_constraints_.Split(candidate.nid,
-                                   tree[candidate.nid].SplitIndex(), left_child,
+    interaction_constraints_.Split(candidate.nid, tree[candidate.nid].SplitIndex(), left_child,
                                    right_child);
   }
 
@@ -571,53 +574,6 @@ class HistMultiEvaluator {
     return false;
   }
 
-  /**
-   * @brief Gather the expand entries from all the workers.
-   * @param entries Local expand entries on this worker.
-   * @return Global expand entries gathered from all workers.
-   */
-  std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
-    auto const world = collective::GetWorldSize();
-    auto const num_entries = entries.size();
-
-    // First, gather all the primitive fields.
-    std::vector<MultiExpandEntry> local_entries(num_entries);
-    std::vector<uint32_t> cat_bits;
-    std::vector<std::size_t> cat_bits_sizes;
-    std::vector<GradientPairPrecise> gradients;
-    for (std::size_t i = 0; i < num_entries; i++) {
-      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients);
-    }
-    auto all_entries = collective::Allgather(local_entries);
-
-    // Gather all the cat_bits.
-    auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
-
-    // Gather all the gradients.
-    auto const num_gradients = gradients.size();
-    auto const all_gradients = collective::Allgather(gradients);
-
-    auto const total_entries = num_entries * world;
-    auto const gradients_per_entry = num_gradients / num_entries;
-    auto const gradients_per_side = gradients_per_entry / 2;
-    common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
-      // Copy the cat_bits back into all expand entries.
-      all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
-      std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
-                  gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());
-
-      // Copy the gradients back into all expand entries.
-      all_entries[i].split.left_sum.resize(gradients_per_side);
-      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
-                  all_entries[i].split.left_sum.begin());
-      all_entries[i].split.right_sum.resize(gradients_per_side);
-      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
-                  gradients_per_side, all_entries[i].split.right_sum.begin());
-    });
-
-    return all_entries;
-  }
-
  public:
   void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
                       common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -676,7 +632,7 @@ class HistMultiEvaluator {
     if (is_col_split_) {
       // With column-wise data split, we gather the best splits from all the workers and update the
       // expand entries accordingly.
-      auto all_entries = Allgather(entries);
+      auto all_entries = AllgatherColumnSplit(entries);
       for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
         for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
           entries[nidx_in_set].split.Update(
diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h
index d6315877d1e0..fd16397e1193 100644
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -90,7 +90,6 @@ struct ExpandEntryImpl {
     }
 
     self->split.is_cat = get<Boolean const>(split["is_cat"]);
-
     self->LoadGrad(split);
   }
 };
@@ -106,8 +105,8 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
   void SaveGrad(Json* p_out) const {
     auto& out = *p_out;
     auto save = [&](std::string const& name, GradStats const& sum) {
-      out[name] = F32Array{2};
-      auto& array = get<F32Array>(out[name]);
+      out[name] = F64Array{2};
+      auto& array = get<F64Array>(out[name]);
       array[0] = sum.GetGrad();
       array[1] = sum.GetHess();
     };
@@ -115,9 +114,9 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
     save("right_sum", this->split.right_sum);
   }
   void LoadGrad(Json const& in) {
-    auto const& left_sum = get<F32Array const>(in["left_sum"]);
+    auto const& left_sum = get<F64Array const>(in["left_sum"]);
     this->split.left_sum = GradStats{left_sum[0], left_sum[1]};
-    auto const& right_sum = get<F32Array const>(in["right_sum"]);
+    auto const& right_sum = get<F64Array const>(in["right_sum"]);
     this->split.right_sum = GradStats{right_sum[0], right_sum[1]};
   }
 
@@ -173,8 +172,8 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
   void SaveGrad(Json* p_out) const {
     auto& out = *p_out;
     auto save = [&](std::string const& name, std::vector<GradientPairPrecise> const& sum) {
-      out[name] = F32Array{sum.size() * 2};
-      auto& array = get<F32Array>(out[name]);
+      out[name] = F64Array{sum.size() * 2};
+      auto& array = get<F64Array>(out[name]);
       for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) {
         array[j] = sum[i].GetGrad();
         array[j + 1] = sum[i].GetHess();
@@ -185,7 +184,7 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
   }
   void LoadGrad(Json const& in) {
     auto load = [&](std::string const& name, std::vector<GradientPairPrecise>* p_sum) {
-      auto const& array = get<F32Array const>(in[name]);
+      auto const& array = get<F64Array const>(in[name]);
       auto& sum = *p_sum;
       sum.resize(array.size() / 2);
       for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) {
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index c2aaedafac95..cd60e6602cf7 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
  * \file updater_quantile_hist.cc
  * \brief use quantized feature values to construct a tree
  * \author Philip Cho, Tianqi Checn, Egor Smirnov
@@ -149,9 +149,6 @@ class MultiTargetHistBuilder {
   }
 
   void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
-    if (collective::IsDistributed()) {
-      LOG(FATAL) << "Distributed training for vector-leaf is not yet supported.";
-    }
     monitor_->Start(__func__);
 
     p_last_fmat_ = p_fmat;
diff --git a/tests/cpp/collective/test_rabit_communicator.cc b/tests/cpp/collective/test_rabit_communicator.cc
index ba22d8fdb84f..9711e1aede71 100644
--- a/tests/cpp/collective/test_rabit_communicator.cc
+++ b/tests/cpp/collective/test_rabit_communicator.cc
@@ -1,13 +1,12 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../src/collective/rabit_communicator.h"
+#include "../helpers.h"
 
-namespace xgboost {
-namespace collective {
-
+namespace xgboost::collective {
 TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
   auto construct = []() { RabitCommunicator comm{0, 0}; };
   EXPECT_THROW(construct(), dmlc::Error);
@@ -35,5 +34,37 @@ TEST(RabitCommunicatorSimpleTest, IsNotDistributed) {
   EXPECT_FALSE(comm.IsDistributed());
 }
 
-}  // namespace collective
-}  // namespace xgboost
+namespace {
+void VerifyVectorAllgatherV() {
+  auto n_workers = collective::GetWorldSize();
+  ASSERT_EQ(n_workers, 3);
+  auto rank = collective::GetRank();
+  // Construct input that has different length for each worker.
+  std::vector<std::vector<char>> inputs;
+  for (std::int32_t i = 0; i < rank + 1; ++i) {
+    std::vector<char> in;
+    for (std::int32_t j = 0; j < rank + 1; ++j) {
+      in.push_back(static_cast<char>(j));
+    }
+    inputs.emplace_back(std::move(in));
+  }
+
+  auto outputs = VectorAllgatherV(inputs);
+
+  ASSERT_EQ(outputs.size(), (1 + n_workers) * n_workers / 2);
+  auto const& res = outputs;
+
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    std::int32_t k = 0;
+    for (auto v : res[i]) {
+      ASSERT_EQ(v, k++);
+    }
+  }
+}
+}  // namespace
+
+TEST(VectorAllgatherV, Basic) {
+  std::int32_t n_workers{3};
+  RunWithInMemoryCommunicator(n_workers, VerifyVectorAllgatherV);
+}
+}  // namespace xgboost::collective
diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc
index 72163efd78cc..3ee041a339ed 100644
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index cf806536a861..4021c9959440 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 by XGBoost Contributors
+ * Copyright 2018-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
@@ -18,7 +18,6 @@
 #include "xgboost/data.h"
 
 namespace xgboost::tree {
-
 namespace {
 template <typename ExpandEntry>
 void TestPartitioner(bst_target_t n_targets) {
@@ -253,5 +252,5 @@ void TestColumnSplit(bst_target_t n_targets) {
 
 TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
 
-TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); }
+TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
 }  // namespace xgboost::tree

From 761845f59402d61b575c3fa2f4fc60ec66804d1e Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <dmitry.razdoburdin@intel.com>
Date: Mon, 26 Feb 2024 14:07:36 +0100
Subject: [PATCH 3/5] [SYCL] Implement row set collection. (#10057)

Co-authored-by: Dmitry Razdoburdin <>
---
 plugin/sycl/common/row_set.h                  | 123 ++++++++++++++++++
 .../plugin/test_sycl_row_set_collection.cc    |  78 +++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 plugin/sycl/common/row_set.h
 create mode 100644 tests/cpp/plugin/test_sycl_row_set_collection.cc

diff --git a/plugin/sycl/common/row_set.h b/plugin/sycl/common/row_set.h
new file mode 100644
index 000000000000..574adbf8d9b9
--- /dev/null
+++ b/plugin/sycl/common/row_set.h
@@ -0,0 +1,123 @@
+/*!
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#ifndef PLUGIN_SYCL_COMMON_ROW_SET_H_
+#define PLUGIN_SYCL_COMMON_ROW_SET_H_
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include <xgboost/data.h>
+#pragma GCC diagnostic pop
+#include <algorithm>
+#include <vector>
+#include <utility>
+
+#include "../data.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+
+/*! \brief Collection of rowsets stored on device in USM memory */
+class RowSetCollection {
+ public:
+  /*! \brief data structure to store an instance set, a subset of
+   *  rows (instances) associated with a particular node in a decision
+   *  tree. */
+  struct Elem {
+    const size_t* begin{nullptr};
+    const size_t* end{nullptr};
+    bst_node_t node_id{-1};  // id of node associated with this instance set; -1 means uninitialized
+    Elem()
+         = default;
+    Elem(const size_t* begin,
+         const size_t* end,
+         bst_node_t node_id = -1)
+        : begin(begin), end(end), node_id(node_id) {}
+
+
+    inline size_t Size() const {
+      return end - begin;
+    }
+  };
+
+  inline size_t Size() const {
+    return elem_of_each_node_.size();
+  }
+
+  /*! \brief return corresponding element set given the node_id */
+  inline const Elem& operator[](unsigned node_id) const {
+    const Elem& e = elem_of_each_node_[node_id];
+    CHECK(e.begin != nullptr)
+        << "access element that is not in the set";
+    return e;
+  }
+
+  /*! \brief return corresponding element set given the node_id */
+  inline Elem& operator[](unsigned node_id) {
+    Elem& e = elem_of_each_node_[node_id];
+    return e;
+  }
+
+  // clear up things
+  inline void Clear() {
+    elem_of_each_node_.clear();
+  }
+  // initialize node id 0->everything
+  inline void Init() {
+    CHECK_EQ(elem_of_each_node_.size(), 0U);
+
+    const size_t* begin = row_indices_.Begin();
+    const size_t* end = row_indices_.End();
+    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
+  }
+
+  auto& Data() { return row_indices_; }
+
+  // split rowset into two
+  inline void AddSplit(unsigned node_id,
+                       unsigned left_node_id,
+                       unsigned right_node_id,
+                       size_t n_left,
+                       size_t n_right) {
+    const Elem e = elem_of_each_node_[node_id];
+    CHECK(e.begin != nullptr);
+    size_t* all_begin = row_indices_.Begin();
+    size_t* begin = all_begin + (e.begin - all_begin);
+
+
+    CHECK_EQ(n_left + n_right, e.Size());
+    CHECK_LE(begin + n_left, e.end);
+    CHECK_EQ(begin + n_left + n_right, e.end);
+
+
+    if (left_node_id >= elem_of_each_node_.size()) {
+      elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
+    }
+    if (right_node_id >= elem_of_each_node_.size()) {
+      elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
+    }
+
+
+    elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
+    elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
+    elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
+  }
+
+ private:
+  // stores the row indexes in the set
+  USMVector<size_t, MemoryType::on_device> row_indices_;
+  // vector: node_id -> elements
+  std::vector<Elem> elem_of_each_node_;
+};
+
+}  // namespace common
+}  // namespace sycl
+}  // namespace xgboost
+
+
+#endif  // PLUGIN_SYCL_COMMON_ROW_SET_H_
diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc
new file mode 100644
index 000000000000..f527d9f16d1b
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2020-2023 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "../../../plugin/sycl/common/row_set.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::common {
+TEST(SyclRowSetCollection, AddSplits) {
+  const size_t num_rows = 16;
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
+
+  RowSetCollection row_set_collection;
+
+  auto& row_indices = row_set_collection.Data();
+  row_indices.Resize(&qu, num_rows);
+  size_t* p_row_indices = row_indices.Data();
+
+  qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows),
+                       [p_row_indices](::sycl::item<1> pid) {
+      const size_t idx = pid.get_id(0);
+      p_row_indices[idx] = idx;
+    });
+  }).wait_and_throw();
+  row_set_collection.Init();
+
+  CHECK_EQ(row_set_collection.Size(), 1);
+  {
+    size_t nid_test = 0;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin());
+    CHECK_EQ(elem.end, row_indices.End());
+    CHECK_EQ(elem.node_id , 0);
+  }
+
+  size_t nid = 0;
+  size_t nid_left = 1;
+  size_t nid_right = 2;
+  size_t n_left = 4;
+  size_t n_right = num_rows - n_left;
+  row_set_collection.AddSplit(nid, nid_left, nid_right, n_left, n_right);
+  CHECK_EQ(row_set_collection.Size(), 3);
+
+  {
+    size_t nid_test = 0;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, nullptr);
+    CHECK_EQ(elem.end, nullptr);
+    CHECK_EQ(elem.node_id , -1);
+  }
+
+  {
+    size_t nid_test = 1;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin());
+    CHECK_EQ(elem.end, row_indices.Begin() + n_left);
+    CHECK_EQ(elem.node_id , nid_test);
+  }
+
+  {
+    size_t nid_test = 2;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin() + n_left);
+    CHECK_EQ(elem.end, row_indices.End());
+    CHECK_EQ(elem.node_id , nid_test);
+  }
+
+}
+}  // namespace xgboost::sycl::common

From 5ac233280e1218fcf9de011fbbbe7841402d9866 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 28 Feb 2024 03:12:42 +0800
Subject: [PATCH 4/5] Require context in aggregators. (#10075)

---
 .clang-format                       |  2 +-
 include/xgboost/collective/result.h | 12 ++++++--
 src/collective/aggregator.h         | 46 ++++++++++++++++-------------
 src/common/quantile.cc              | 34 +++++++++++++--------
 src/learner.cc                      |  6 ++--
 src/metric/auc.cc                   | 10 ++++---
 src/metric/elementwise_metric.cu    | 37 ++++++++++++-----------
 src/metric/metric_common.h          |  7 ++---
 src/metric/multiclass_metric.cu     | 20 ++++++-------
 src/metric/rank_metric.cc           | 11 +++----
 src/metric/rank_metric.cu           |  2 +-
 src/metric/survival_metric.cu       | 28 ++++++++----------
 src/objective/adaptive.cc           |  8 ++---
 src/objective/adaptive.cu           |  9 +++---
 src/objective/adaptive.h            | 23 ++++++++-------
 src/objective/quantile_obj.cu       |  6 ++--
 src/objective/regression_obj.cu     | 10 +++++--
 src/tree/fit_stump.cc               | 11 ++++---
 src/tree/fit_stump.cu               | 17 +++++------
 src/tree/gpu_hist/histogram.cu      | 11 ++++---
 src/tree/updater_approx.cc          |  7 +++--
 src/tree/updater_gpu_hist.cu        |  6 ++--
 src/tree/updater_quantile_hist.cc   | 11 +++++--
 23 files changed, 190 insertions(+), 144 deletions(-)

diff --git a/.clang-format b/.clang-format
index 0984d5a7bf30..737cf9006bae 100644
--- a/.clang-format
+++ b/.clang-format
@@ -17,7 +17,7 @@ AllowShortEnumsOnASingleLine: true
 AllowShortBlocksOnASingleLine: Never
 AllowShortCaseLabelsOnASingleLine: false
 AllowShortFunctionsOnASingleLine: All
-AllowShortLambdasOnASingleLine: All
+AllowShortLambdasOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: WithoutElse
 AllowShortLoopsOnASingleLine: true
 AlwaysBreakAfterDefinitionReturnType: None
diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h
index 507171dd4ff1..919d3a902298 100644
--- a/include/xgboost/collective/result.h
+++ b/include/xgboost/collective/result.h
@@ -1,8 +1,10 @@
 /**
- *  Copyright 2023, XGBoost Contributors
+ *  Copyright 2023-2024, XGBoost Contributors
  */
 #pragma once
 
+#include <xgboost/logging.h>
+
 #include <memory>   // for unique_ptr
 #include <sstream>  // for stringstream
 #include <stack>    // for stack
@@ -160,10 +162,16 @@ struct Result {
 
 // We don't have monad, a simple helper would do.
 template <typename Fn>
-Result operator<<(Result&& r, Fn&& fn) {
+[[nodiscard]] Result operator<<(Result&& r, Fn&& fn) {
   if (!r.OK()) {
     return std::forward<Result>(r);
   }
   return fn();
 }
+
+inline void SafeColl(Result const& rc) {
+  if (!rc.OK()) {
+    LOG(FATAL) << rc.Report();
+  }
+}
 }  // namespace xgboost::collective
diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
index f2a9ff528366..8a5b31c36546 100644
--- a/src/collective/aggregator.h
+++ b/src/collective/aggregator.h
@@ -1,22 +1,21 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023-2024, XGBoost contributors
  *
  * Higher level functions built on top the Communicator API, taking care of behavioral differences
  * between row-split vs column-split distributed training, and horizontal vs vertical federated
  * learning.
  */
 #pragma once
-#include <xgboost/data.h>
-
 #include <limits>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "communicator-inl.h"
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/data.h"               // for MetaINfo
 
-namespace xgboost {
-namespace collective {
+namespace xgboost::collective {
 
 /**
  * @brief Apply the given function where the labels are.
@@ -31,15 +30,16 @@ namespace collective {
  * @param size The size of the buffer.
  * @param function The function used to calculate the results.
  */
-template <typename Function>
-void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) {
+template <typename FN>
+void ApplyWithLabels(Context const*, MetaInfo const& info, void* buffer, std::size_t size,
+                     FN&& function) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
     std::string message;
     if (collective::GetRank() == 0) {
       try {
-        std::forward<Function>(function)();
+        std::forward<FN>(function)();
       } catch (dmlc::Error& e) {
         message = e.what();
       }
@@ -52,7 +52,7 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
       LOG(FATAL) << &message[0];
     }
   } else {
-    std::forward<Function>(function)();
+    std::forward<FN>(function)();
   }
 }
 
@@ -70,7 +70,8 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
  * @param function The function used to calculate the results.
  */
 template <typename T, typename Function>
-void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function&& function) {
+void ApplyWithLabels(Context const*, MetaInfo const& info, HostDeviceVector<T>* result,
+                     Function&& function) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
@@ -114,7 +115,9 @@ void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function
  * @return The global max of the input.
  */
 template <typename T>
-T GlobalMax(MetaInfo const& info, T value) {
+std::enable_if_t<std::is_trivially_copy_assignable_v<T>, T> GlobalMax(Context const*,
+                                                                      MetaInfo const& info,
+                                                                      T value) {
   if (info.IsRowSplit()) {
     collective::Allreduce<collective::Operation::kMax>(&value, 1);
   }
@@ -132,16 +135,18 @@ T GlobalMax(MetaInfo const& info, T value) {
  * @param values Pointer to the inputs to sum.
  * @param size Number of values to sum.
  */
-template <typename T>
-void GlobalSum(MetaInfo const& info, T* values, size_t size) {
+template <typename T, std::int32_t kDim>
+[[nodiscard]] Result GlobalSum(Context const*, MetaInfo const& info,
+                               linalg::TensorView<T, kDim> values) {
   if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(values, size);
+    collective::Allreduce<collective::Operation::kSum>(values.Values().data(), values.Size());
   }
+  return Success();
 }
 
 template <typename Container>
-void GlobalSum(MetaInfo const& info, Container* values) {
-  GlobalSum(info, values->data(), values->size());
+[[nodiscard]] Result GlobalSum(Context const* ctx, MetaInfo const& info, Container* values) {
+  return GlobalSum(ctx, info, values->data(), values->size());
 }
 
 /**
@@ -157,9 +162,10 @@ void GlobalSum(MetaInfo const& info, Container* values) {
  * @return The global ratio of the two inputs.
  */
 template <typename T>
-T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
+T GlobalRatio(Context const* ctx, MetaInfo const& info, T dividend, T divisor) {
   std::array<T, 2> results{dividend, divisor};
-  GlobalSum(info, &results);
+  auto rc = GlobalSum(ctx, info, linalg::MakeVec(results.data(), results.size()));
+  collective::SafeColl(rc);
   std::tie(dividend, divisor) = std::tuple_cat(results);
   if (divisor <= 0) {
     return std::numeric_limits<T>::quiet_NaN();
@@ -167,6 +173,4 @@ T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
     return dividend / divisor;
   }
 }
-
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index c74db99e4006..e521fae69b1d 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include "quantile.h"
 
@@ -145,7 +145,7 @@ struct QuantileAllreduce {
 
 template <typename WQSketch>
 void SketchContainerImpl<WQSketch>::GatherSketchInfo(
-    Context const *, MetaInfo const &info,
+    Context const *ctx, MetaInfo const &info,
     std::vector<typename WQSketch::SummaryContainer> const &reduced,
     std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
     std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -171,7 +171,9 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
   std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);
 
   // Gather all column pointers
-  collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
+  auto rc =
+      collective::GlobalSum(ctx, info, linalg::MakeVec(sketches_scan.data(), sketches_scan.size()));
+  collective::SafeColl(rc);
   for (int32_t i = 0; i < world; ++i) {
     size_t back = (i + 1) * (n_columns + 1) - 1;
     auto n_entries = sketches_scan.at(back);
@@ -199,14 +201,15 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
 
   static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
                 "Unexpected size of sketch entry.");
-  collective::GlobalSum(
-      info,
-      reinterpret_cast<float *>(global_sketches.data()),
-      global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
+  rc = collective::GlobalSum(
+      ctx, info,
+      linalg::MakeVec(reinterpret_cast<float *>(global_sketches.data()),
+                      global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)));
+  collective::SafeColl(rc);
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo const& info) {
+void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const* ctx, MetaInfo const& info) {
   auto world_size = collective::GetWorldSize();
   auto rank = collective::GetRank();
   if (world_size == 1 || info.IsColumnSplit()) {
@@ -226,7 +229,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
   size_t feat_begin = rank * feature_ptr.size();  // pointer to current worker
   std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
-  collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());
+  auto rc = collective::GlobalSum(
+      ctx, info, linalg::MakeVec(global_feat_ptrs.data(), global_feat_ptrs.size()));
 
   // move all categories into a flatten vector to prepare for allreduce
   size_t total = feature_ptr.back();
@@ -239,7 +243,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   // indptr for indexing workers
   std::vector<size_t> global_worker_ptr(world_size + 1, 0);
   global_worker_ptr[rank + 1] = total;  // shift 1 to right for constructing the indptr
-  collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
+  rc = collective::GlobalSum(ctx, info,
+                             linalg::MakeVec(global_worker_ptr.data(), global_worker_ptr.size()));
   std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
   // total number of categories in all workers with all features
   auto gtotal = global_worker_ptr.back();
@@ -251,7 +256,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   CHECK_EQ(rank_size, total);
   std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
   // gather values from all workers.
-  collective::GlobalSum(info, global_categories.data(), global_categories.size());
+  rc = collective::GlobalSum(ctx, info,
+                             linalg::MakeVec(global_categories.data(), global_categories.size()));
   QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
                                             categories_.size()};
   ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@@ -293,7 +299,9 @@ void SketchContainerImpl<WQSketch>::AllReduce(
 
   // Prune the intermediate num cuts for synchronization.
   std::vector<bst_row_t> global_column_size(columns_size_);
-  collective::GlobalSum(info, &global_column_size);
+  auto rc = collective::GlobalSum(
+      ctx, info, linalg::MakeVec(global_column_size.data(), global_column_size.size()));
+  collective::SafeColl(rc);
 
   ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
     int32_t intermediate_num_cuts = static_cast<int32_t>(
diff --git a/src/learner.cc b/src/learner.cc
index db72f71644cb..eed9dd5cdcd7 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  * \file learner.cc
  * \brief Implementation of learning algorithm.
  * \author Tianqi Chen
@@ -846,7 +846,7 @@ class LearnerConfiguration : public Learner {
 
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
     base_score->Reshape(1);
-    collective::ApplyWithLabels(info, base_score->Data(),
+    collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(),
                                 [&] { UsePtr(obj_)->InitEstimation(info, base_score); });
   }
 };
@@ -1472,7 +1472,7 @@ class LearnerImpl : public LearnerIO {
   void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
                    std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
     out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
-    collective::ApplyWithLabels(info, out_gpair->Data(),
+    collective::ApplyWithLabels(&ctx_, info, out_gpair->Data(),
                                 [&] { obj_->GetGradient(preds, info, iter, out_gpair); });
   }
 
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 4a8aa8a4bbdc..212a3a027d35 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #include "auc.h"
 
@@ -112,7 +112,9 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
 
   // we have 2 averages going in here, first is among workers, second is among
   // classes. allreduce sums up fp/tp auc for each class.
-  collective::GlobalSum(info, &results.Values());
+  auto rc = collective::GlobalSum(ctx, info, results);
+  collective::SafeColl(rc);
+
   double auc_sum{0};
   double tp_sum{0};
   for (size_t c = 0; c < n_classes; ++c) {
@@ -286,7 +288,7 @@ class EvalAUC : public MetricNoCache {
         InvalidGroupAUC();
       }
 
-      auc = collective::GlobalRatio(info, auc, static_cast<double>(valid_groups));
+      auc = collective::GlobalRatio(ctx_, info, auc, static_cast<double>(valid_groups));
       if (!std::isnan(auc)) {
         CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
                          << ", valid groups: " << valid_groups;
@@ -307,7 +309,7 @@ class EvalAUC : public MetricNoCache {
         std::tie(fp, tp, auc) =
             static_cast<Curve *>(this)->EvalBinary(preds, info);
       }
-      auc = collective::GlobalRatio(info, auc, fp * tp);
+      auc = collective::GlobalRatio(ctx_, info, auc, fp * tp);
       if (!std::isnan(auc)) {
         CHECK_LE(auc, 1.0);
       }
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index f245f3e06306..9c26011aa99f 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file elementwise_metric.cu
  * \brief evaluation metrics for elementwise binary or regression.
  * \author Kailong Chen, Tianqi Chen
@@ -12,13 +12,14 @@
 #include <cmath>
 
 #include "../collective/communicator-inl.h"
-#include "../common/common.h"           // MetricNoCache
+#include "../common/common.h"  // MetricNoCache
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "../common/pseudo_huber.h"
 #include "../common/quantile_loss_utils.h"  // QuantileLossParam
 #include "../common/threading_utils.h"
 #include "metric_common.h"
+#include "xgboost/collective/result.h"  // for SafeColl
 #include "xgboost/metric.h"
 
 #if defined(XGBOOST_USE_CUDA)
@@ -30,8 +31,7 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(elementwise_metric);
 
@@ -199,7 +199,8 @@ class PseudoErrorLoss : public MetricNoCache {
           return std::make_tuple(v, wt);
         });
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return EvalRowMAPE::GetFinal(dat[0], dat[1]);
   }
 };
@@ -243,11 +244,11 @@ struct EvalError {
 };
 
 struct EvalPoissonNegLogLik {
-  const char *Name() const {
+  [[nodiscard]] const char *Name() const {
     return "poisson-nloglik";
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
     const bst_float eps = 1e-16f;
     if (py < eps) py = eps;
     return common::LogGamma(y + 1.0f) + py - std::log(py) * y;
@@ -266,9 +267,9 @@ struct EvalPoissonNegLogLik {
  *   predt >= 0
  */
 struct EvalGammaDeviance {
-  const char *Name() const { return "gamma-deviance"; }
+  [[nodiscard]] const char *Name() const { return "gamma-deviance"; }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
     predt += kRtEps;
     label += kRtEps;
     return std::log(predt / label) + label / predt - 1;
@@ -287,7 +288,7 @@ struct EvalGammaNLogLik {
     return "gamma-nloglik";
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
     py = std::max(py, 1e-6f);
     // hardcoded dispersion.
     float constexpr kPsi = 1.0;
@@ -313,7 +314,7 @@ struct EvalTweedieNLogLik {
     CHECK(rho_ < 2 && rho_ >= 1)
         << "tweedie variance power must be in interval [1, 2)";
   }
-  const char *Name() const {
+  [[nodiscard]] const char *Name() const {
     static thread_local std::string name;
     std::ostringstream os;
     os << "tweedie-nloglik@" << rho_;
@@ -321,7 +322,7 @@ struct EvalTweedieNLogLik {
     return name.c_str();
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
     bst_float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_);
     bst_float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_);
     return -a + b;
@@ -366,7 +367,8 @@ struct EvalEWiseBase : public MetricNoCache {
         });
 
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -438,7 +440,8 @@ class QuantileError : public MetricNoCache {
     if (info.num_row_ == 0) {
       // empty DMatrix on distributed env
       std::array<double, 2> dat{0.0, 0.0};
-      collective::GlobalSum(info, &dat);
+      auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+      collective::SafeColl(rc);
       CHECK_GT(dat[1], 0);
       return dat[0] / dat[1];
     }
@@ -476,7 +479,8 @@ class QuantileError : public MetricNoCache {
           return std::make_tuple(l, w);
         });
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     CHECK_GT(dat[1], 0);
     return dat[0] / dat[1];
   }
@@ -501,5 +505,4 @@ class QuantileError : public MetricNoCache {
 XGBOOST_REGISTER_METRIC(QuantileError, "quantile")
     .describe("Quantile regression error.")
     .set_body([](const char*) { return new QuantileError{}; });
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 1b148ab0f47c..53c38ff2a8c2 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -1,6 +1,5 @@
-/*!
- * Copyright 2018-2022 by Contributors
- * \file metric_common.h
+/**
+ * Copyright 2018-2024, Contributors
  */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
 #define XGBOOST_METRIC_METRIC_COMMON_H_
@@ -24,7 +23,7 @@ class MetricNoCache : public Metric {
   double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
     double result{0.0};
     auto const &info = p_fmat->Info();
-    collective::ApplyWithLabels(info, &result, sizeof(double),
+    collective::ApplyWithLabels(ctx_, info, &result, sizeof(double),
                                 [&] { result = this->Eval(predts, info); });
     return result;
   }
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index 897c91dabe96..acaef7cf7e84 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file multiclass_metric.cc
  * \brief evaluation metrics for multiclass classification.
  * \author Kailong Chen, Tianqi Chen
@@ -24,8 +24,7 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(multiclass_metric);
 
@@ -40,11 +39,10 @@ class MultiClassMetricsReduction {
  public:
   MultiClassMetricsReduction() = default;
 
-  PackedReduceResult
-  CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
-                   const HostDeviceVector<bst_float> &labels,
-                   const HostDeviceVector<bst_float> &preds,
-                   const size_t n_class, int32_t n_threads) const {
+  [[nodiscard]] PackedReduceResult CpuReduceMetrics(const HostDeviceVector<bst_float>& weights,
+                                                    const HostDeviceVector<bst_float>& labels,
+                                                    const HostDeviceVector<bst_float>& preds,
+                                                    const size_t n_class, int32_t n_threads) const {
     size_t ndata = labels.Size();
 
     const auto& h_labels = labels.HostVector();
@@ -182,7 +180,8 @@ struct EvalMClassBase : public MetricNoCache {
       dat[0] = result.Residue();
       dat[1] = result.Weights();
     }
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Derived::GetFinal(dat[0], dat[1]);
   }
   /*!
@@ -245,5 +244,4 @@ XGBOOST_REGISTER_METRIC(MatchError, "merror")
 XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss")
     .describe("Multiclass negative loglikelihood.")
     .set_body([](const char*) { return new EvalMultiLogLoss(); });
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 6762aec32d46..53841c05135e 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -101,7 +101,7 @@ struct EvalAMS : public MetricNoCache {
     }
   }
 
-  const char* Name() const override {
+  [[nodiscard]] const char* Name() const override {
     return name_.c_str();
   }
 
@@ -159,7 +159,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
       exc.Rethrow();
     }
 
-    return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
+    return collective::GlobalRatio(ctx_, info, sum_metric, static_cast<double>(ngroups));
   }
 
   [[nodiscard]] const char* Name() const override {
@@ -274,7 +274,7 @@ class EvalRankWithCache : public Metric {
   double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
     double result{0.0};
     auto const& info = p_fmat->Info();
-    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+    collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] {
       auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
       if (p_cache->Param() != param_) {
         p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
@@ -294,9 +294,10 @@ class EvalRankWithCache : public Metric {
 };
 
 namespace {
-double Finalize(Context const*, MetaInfo const& info, double score, double sw) {
+double Finalize(Context const* ctx, MetaInfo const& info, double score, double sw) {
   std::array<double, 2> dat{score, sw};
-  collective::GlobalSum(info, &dat);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), 2));
+  collective::SafeColl(rc);
   std::tie(score, sw) = std::tuple_cat(dat);
   if (sw > 0.0) {
     score = score / sw;
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index 372eb680531e..d43125dcbaf7 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023 by XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <dmlc/registry.h>
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index c13702a1917f..c64fece6c1d3 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2023 by Contributors
+ * Copyright 2019-2024, Contributors
  * \file survival_metric.cu
  * \brief Metrics for survival analysis
  * \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
@@ -30,8 +30,7 @@ using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType
 template <typename Distribution>
 using AFTLoss = xgboost::common::AFTLoss<Distribution>;
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(survival_metric);
 
@@ -43,12 +42,11 @@ class ElementWiseSurvivalMetricsReduction {
     policy_ = policy;
   }
 
-  PackedReduceResult
-  CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
-                   const HostDeviceVector<bst_float> &labels_lower_bound,
-                   const HostDeviceVector<bst_float> &labels_upper_bound,
-                   const HostDeviceVector<bst_float> &preds,
-                   int32_t n_threads) const {
+  [[nodiscard]] PackedReduceResult CpuReduceMetrics(
+      const HostDeviceVector<bst_float>& weights,
+      const HostDeviceVector<bst_float>& labels_lower_bound,
+      const HostDeviceVector<bst_float>& labels_upper_bound,
+      const HostDeviceVector<bst_float>& preds, int32_t n_threads) const {
     size_t ndata = labels_lower_bound.Size();
     CHECK_EQ(ndata, labels_upper_bound.Size());
 
@@ -155,7 +153,7 @@ class ElementWiseSurvivalMetricsReduction {
 struct EvalIntervalRegressionAccuracy {
   void Configure(const Args&) {}
 
-  const char* Name() const {
+  [[nodiscard]] const char* Name() const {
     return "interval-regression-accuracy";
   }
 
@@ -177,7 +175,7 @@ struct EvalAFTNLogLik {
     param_.UpdateAllowUnknown(args);
   }
 
-  const char* Name() const {
+  [[nodiscard]] const char* Name() const {
     return "aft-nloglik";
   }
 
@@ -213,7 +211,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
                                   info.labels_upper_bound_, preds);
 
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -230,7 +229,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
 // This class exists because we want to perform dispatch according to the distribution type at
 // configuration time, not at prediction time.
 struct AFTNLogLikDispatcher : public MetricNoCache {
-  const char* Name() const override {
+  [[nodiscard]] const char* Name() const override {
     return "aft-nloglik";
   }
 
@@ -282,5 +281,4 @@ XGBOOST_REGISTER_METRIC(IntervalRegressionAccuracy, "interval-regression-accurac
       return new EvalEWiseSurvivalBase<EvalIntervalRegressionAccuracy>();
     });
 
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index b195dffd793a..e7778c464762 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #include "adaptive.h"
 
@@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   size_t n_leaf = nidx.size();
   if (nptr.empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
+    UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
     return;
   }
 
@@ -100,7 +100,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
                                         predt.Size() / info.num_row_);
 
   collective::ApplyWithLabels(
-      info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
+      ctx, info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
         // loop over each leaf
         common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
           auto nidx = h_node_idx[k];
@@ -134,7 +134,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
         });
       });
 
-  UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
+  UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
 }
 
 #if !defined(XGBOOST_USE_CUDA)
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index 07644146b2ec..235e284198f3 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #include <thrust/sort.h>
 
@@ -150,7 +150,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
 
   if (nptr.Empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
+    UpdateLeafValues(ctx, &quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
   }
 
   predt.SetDevice(ctx->Device());
@@ -160,7 +160,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
   auto t_predt = d_predt.Slice(linalg::All(), group_idx);
 
   HostDeviceVector<float> quantiles;
-  collective::ApplyWithLabels(info, &quantiles, [&] {
+  collective::ApplyWithLabels(ctx, info, &quantiles, [&] {
     auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx));
     auto d_row_index = dh::ToSpan(ridx);
     auto seg_beg = nptr.DevicePointer();
@@ -186,6 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                                         w_it + d_weights.size(), &quantiles);
     }
   });
-  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
+  UpdateLeafValues(ctx, &quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate,
+                   p_tree);
 }
 }  // namespace xgboost::obj::detail
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index a64f37f63c83..cbe69e79a6cc 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #pragma once
 
@@ -17,8 +17,7 @@
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/tree_model.h"          // RegTree
 
-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {
 namespace detail {
 inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
                             std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_nptr) {
@@ -36,13 +35,14 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
   }
 }
 
-inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
-                             MetaInfo const& info, float learning_rate, RegTree* p_tree) {
+inline void UpdateLeafValues(Context const* ctx, std::vector<float>* p_quantiles,
+                             std::vector<bst_node_t> const& nidx, MetaInfo const& info,
+                             float learning_rate, RegTree* p_tree) {
   auto& tree = *p_tree;
   auto& quantiles = *p_quantiles;
   auto const& h_node_idx = nidx;
 
-  size_t n_leaf = collective::GlobalMax(info, h_node_idx.size());
+  size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size());
   CHECK(quantiles.empty() || quantiles.size() == n_leaf);
   if (quantiles.empty()) {
     quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -52,12 +52,16 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   std::vector<int32_t> n_valids(quantiles.size());
   std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
                  [](float q) { return static_cast<int32_t>(!std::isnan(q)); });
-  collective::GlobalSum(info, &n_valids);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(n_valids.data(), n_valids.size()));
+  collective::SafeColl(rc);
+
   // convert to 0 for all reduce
   std::replace_if(
       quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
   // use the mean value
-  collective::GlobalSum(info, &quantiles);
+  rc = collective::GlobalSum(ctx, info, linalg::MakeVec(quantiles.data(), quantiles.size()));
+  collective::SafeColl(rc);
+
   for (size_t i = 0; i < n_leaf; ++i) {
     if (n_valids[i] > 0) {
       quantiles[i] /= static_cast<float>(n_valids[i]);
@@ -105,5 +109,4 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
                                predt, alpha, p_tree);
   }
 }
-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index 15ec72f95d91..7029a201ada8 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023-2024, XGBoost contributors
  */
 #include <array>                            // std::array
 #include <cstddef>                          // std::size_t
@@ -170,7 +170,9 @@ class QuantileRegression : public ObjFunction {
     double meanq = temp(0) * sw;
 
     std::array<double, 2> dat{meanq, sw};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
+
     std::tie(meanq, sw) = std::tuple_cat(dat);
     meanq /= (sw + kRtEps);
     base_score->Reshape(1);
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index df30b354b48a..3b60ff111b33 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file regression_obj.cu
  * \brief Definition of single-value regression and classification objectives.
  * \author Tianqi Chen, Kailong Chen
@@ -672,8 +672,12 @@ class MeanAbsoluteError : public ObjFunction {
     std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
                    [w](float v) { return v * w; });
 
-    collective::GlobalSum(info, &out.Values());
-    collective::GlobalSum(info, &w, 1);
+    auto rc = collective::Success() << [&] {
+      return collective::GlobalSum(ctx_, info, out);
+    } << [&] {
+      return collective::GlobalSum(ctx_, info, linalg::MakeVec(&w, 1));
+    };
+    collective::SafeColl(rc);
 
     if (common::CloseTo(w, 0.0)) {
       // Mostly for handling empty dataset test.
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index 21a050536a2e..5e4d16e4e13e 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -1,7 +1,7 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  *
- * \brief Utilities for estimating initial score.
+ * @brief Utilities for estimating initial score.
  */
 #include "fit_stump.h"
 
@@ -44,8 +44,11 @@ void FitStump(Context const* ctx, MetaInfo const& info,
     }
   }
   CHECK(h_sum.CContiguous());
-
-  collective::GlobalSum(info, reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
+  auto as_double = linalg::MakeTensorView(
+      ctx, common::Span{reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2},
+      h_sum.Size() * 2);
+  auto rc = collective::GlobalSum(ctx, info, as_double);
+  collective::SafeColl(rc);
 
   for (std::size_t i = 0; i < h_sum.Size(); ++i) {
     out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
index 9fcacd081996..832d40754ec9 100644
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -1,19 +1,18 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  *
- * \brief Utilities for estimating initial score.
+ * @brief Utilities for estimating initial score.
  */
 #if !defined(NOMINMAX) && defined(_WIN32)
 #define NOMINMAX
-#endif                                            // !defined(NOMINMAX)
-#include <thrust/execution_policy.h>              // cuda::par
-#include <thrust/iterator/counting_iterator.h>    // thrust::make_counting_iterator
+#endif                                          // !defined(NOMINMAX)
+#include <thrust/execution_policy.h>            // cuda::par
+#include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator
 
-#include <cstddef>                                // std::size_t
+#include <cstddef>  // std::size_t
 
-#include "../collective/aggregator.cuh"
-#include "../collective/communicator-inl.cuh"
-#include "../common/device_helpers.cuh"           // dh::MakeTransformIterator
+#include "../collective/aggregator.cuh"  // for GlobalSum
+#include "../common/device_helpers.cuh"  // dh::MakeTransformIterator
 #include "fit_stump.h"
 #include "xgboost/base.h"     // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
 #include "xgboost/context.h"  // Context
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index c473c9269580..90c151556566 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023 by XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
@@ -52,7 +52,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
  *
  * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
  */
-GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
+GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair,
                                      MetaInfo const& info) {
   using GradientSumT = GradientPairPrecise;
   using T = typename GradientSumT::ValueT;
@@ -65,11 +65,14 @@ GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair c
   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
   static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
-  collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(reinterpret_cast<ReduceT*>(&p), 4));
+  collective::SafeColl(rc);
+
   GradientPair positive_sum{p.first}, negative_sum{p.second};
 
   std::size_t total_rows = gpair.size();
-  collective::GlobalSum(info, &total_rows, 1);
+  rc = collective::GlobalSum(ctx, info, linalg::MakeVec(&total_rows, 1));
+  collective::SafeColl(rc);
 
   auto histogram_rounding =
       GradientSumT{common::CreateRoundingFactor<T>(
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 94e7547ee209..68317fc41a85 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost contributors
+ * Copyright 2021-2024, XGBoost contributors
  *
  * \brief Implementation for the approx tree method.
  */
@@ -107,7 +107,10 @@ class GloablApproxBuilder {
     for (auto const &g : gpair) {
       root_sum.Add(g);
     }
-    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
+    auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
+                                    linalg::MakeVec(reinterpret_cast<double *>(&root_sum), 2));
+    collective::SafeColl(rc);
+
     std::vector<CPUExpandEntry> nodes{best};
     this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
                                            linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 3c9c61f88847..4911cec093c8 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/copy.h>
 #include <thrust/reduce.h>
@@ -729,7 +729,9 @@ struct GPUHistMakerDevice {
         dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
                    GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
     using ReduceT = typename decltype(root_sum_quantised)::ValueT;
-    collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);
+    auto rc = collective::GlobalSum(
+        ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
+    collective::SafeColl(rc);
 
     hist.AllocateHistograms({kRootNIdx});
     this->BuildHist(kRootNIdx);
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index cd60e6602cf7..ced277773055 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -199,8 +199,10 @@ class MultiTargetHistBuilder {
       }
     }
     CHECK(root_sum.CContiguous());
-    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
-                          root_sum.Size() * 2);
+    auto rc = collective::GlobalSum(
+        ctx_, p_fmat->Info(),
+        linalg::MakeVec(reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2));
+    collective::SafeColl(rc);
 
     histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
 
@@ -408,7 +410,9 @@ class HistUpdater {
         for (auto const &grad : gpair_h) {
           grad_stat.Add(grad.GetGrad(), grad.GetHess());
         }
-        collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
+        auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
+                                        linalg::MakeVec(reinterpret_cast<double *>(&grad_stat), 2));
+        collective::SafeColl(rc);
       }
 
       auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -471,6 +475,7 @@ class QuantileHistMaker : public TreeUpdater {
   std::unique_ptr<HistUpdater> p_impl_{nullptr};
   std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
   std::shared_ptr<common::ColumnSampler> column_sampler_;
+
   common::Monitor monitor_;
   ObjInfo const *task_{nullptr};
   HistMakerTrainParam hist_param_;

From fe732944401ccb4244327b86ab134058c58a7b39 Mon Sep 17 00:00:00 2001
From: Ziyue Xu <71786575+ZiyueXu77@users.noreply.github.com>
Date: Fri, 1 Mar 2024 14:29:58 -0500
Subject: [PATCH 5/5] [secure boost] Vertical pipeline with hist sync (#10037)

The first phase is to implement an alternative vertical pipeline that syncs the histograms from clients to the label owner.
---
 include/xgboost/data.h                |  9 ++-
 src/common/quantile.cc                | 39 +++++++++--
 src/tree/hist/evaluate_splits.h       | 94 +++++++++++++++------------
 src/tree/hist/histogram.h             | 22 ++++++-
 src/tree/updater_approx.cc            |  2 +-
 src/tree/updater_quantile_hist.cc     |  4 +-
 tests/cpp/tree/hist/test_histogram.cc | 59 +++++++++++------
 7 files changed, 153 insertions(+), 76 deletions(-)

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 08d3d119a8ff..c449164ca572 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -40,7 +40,7 @@ enum class DataType : uint8_t {
 
 enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
 
-enum class DataSplitMode : int { kRow = 0, kCol = 1 };
+enum class DataSplitMode : int { kRow = 0, kCol = 1, kColSecure = 2 };
 
 /*!
  * \brief Meta information about dataset, always sit in memory.
@@ -186,7 +186,12 @@ class MetaInfo {
   }
 
   /** @brief Whether the data is split column-wise. */
-  bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; }
+  bool IsColumnSplit() const { return (data_split_mode == DataSplitMode::kCol)
+  || (data_split_mode == DataSplitMode::kColSecure); }
+
+  /** @brief Whether the data is split column-wise with secure computation. */
+  bool IsSecure() const { return data_split_mode == DataSplitMode::kColSecure; }
+
   /** @brief Whether this is a learning to rank data. */
   bool IsRanking() const { return !group_ptr_.empty(); }
 
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index e521fae69b1d..49a2594e4a52 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -362,14 +362,27 @@ void SketchContainerImpl<WQSketch>::AllReduce(
 
 template <typename SketchType>
 void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_bin,
-                 HistogramCuts *cuts) {
+                 HistogramCuts *cuts, bool secure) {
   size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
+  // make a copy of required_cuts for mode selection
+  size_t required_cuts_original = required_cuts;
+  if (secure) {
+    // Sync the required_cuts across all workers
+    collective::Allreduce<collective::Operation::kMax>(&required_cuts, 1);
+  }
   auto &cut_values = cuts->cut_values_.HostVector();
-  // we use the min_value as the first (0th) element, hence starting from 1.
-  for (size_t i = 1; i < required_cuts; ++i) {
-    bst_float cpt = summary.data[i].value;
-    if (i == 1 || cpt > cut_values.back()) {
-      cut_values.push_back(cpt);
+  // if empty column, fill the cut values with 0
+  if (secure && (required_cuts_original == 0)) {
+    for (size_t i = 1; i < required_cuts; ++i) {
+      cut_values.push_back(0.0);
+    }
+  } else {
+    // we use the min_value as the first (0th) element, hence starting from 1.
+    for (size_t i = 1; i < required_cuts; ++i) {
+      bst_float cpt = summary.data[i].value;
+      if (i == 1 || cpt > cut_values.back()) {
+        cut_values.push_back(cpt);
+      }
     }
   }
 }
@@ -423,11 +436,16 @@ void SketchContainerImpl<WQSketch>::MakeCuts(Context const *ctx, MetaInfo const
   float max_cat{-1.f};
   for (size_t fid = 0; fid < reduced.size(); ++fid) {
     size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
+    // If vertical and secure mode, we need to sync the max_num_bins aross workers
+    if (info.IsVerticalFederated() && info.IsSecure()) {
+      collective::Allreduce<collective::Operation::kMax>(&max_num_bins, 1);
+    }
     typename WQSketch::SummaryContainer const &a = final_summaries[fid];
     if (IsCat(feature_types_, fid)) {
       max_cat = std::max(max_cat, AddCategories(categories_.at(fid), p_cuts));
     } else {
-      AddCutPoint<WQSketch>(a, max_num_bins, p_cuts);
+      // use special AddCutPoint scheme for secure vertical federated learning
+      AddCutPoint<WQSketch>(a, max_num_bins, p_cuts, info.IsSecure());
       // push a value that is greater than anything
       const bst_float cpt =
           (a.size > 0) ? a.data[a.size - 1].value : p_cuts->min_vals_.HostVector()[fid];
@@ -443,6 +461,13 @@ void SketchContainerImpl<WQSketch>::MakeCuts(Context const *ctx, MetaInfo const
     p_cuts->cut_ptrs_.HostVector().push_back(cut_size);
   }
 
+  if (info.IsVerticalFederated() && info.IsSecure()) {
+      // cut values need to be synced across all workers via Allreduce
+      auto cut_val = p_cuts->cut_values_.HostVector().data();
+      std::size_t n = p_cuts->cut_values_.HostVector().size();
+      collective::Allreduce<collective::Operation::kSum>(cut_val, n);
+  }
+
   p_cuts->SetCategorical(this->has_categorical_, max_cat);
   monitor_.Stop(__func__);
 }
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index d25a41cb0b3c..85cd5ab67176 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -82,6 +82,7 @@ class HistEvaluator {
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   TreeEvaluator tree_evaluator_;
   bool is_col_split_{false};
+  bool is_secure_{false};
   FeatureInteractionConstraintHost interaction_constraints_;
   std::vector<NodeEntry> snode_;
 
@@ -321,7 +322,6 @@ class HistEvaluator {
         }
       }
     }
-
     p_best->Update(best);
     return left_sum;
   }
@@ -353,54 +353,63 @@ class HistEvaluator {
     auto evaluator = tree_evaluator_.GetEvaluator();
     auto const &cut_ptrs = cut.Ptrs();
 
-    common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
-      auto tidx = omp_get_thread_num();
-      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
-      auto best = &entry->split;
-      auto nidx = entry->nid;
-      auto histogram = hist[nidx];
-      auto features_set = features[nidx_in_set]->ConstHostSpan();
-      for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) {
-        auto fidx = features_set[fidx_in_set];
-        bool is_cat = common::IsCat(feature_types, fidx);
-        if (!interaction_constraints_.Query(nidx, fidx)) {
-          continue;
-        }
-        if (is_cat) {
-          auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
-          if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) {
-            EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
-          } else {
-            std::vector<size_t> sorted_idx(n_bins);
-            std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
-            auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
-            // Sort the histogram to get contiguous partitions.
-            std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
-              auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) <
-                         evaluator.CalcWeightCat(*param_, feat_hist[r]);
-              return ret;
-            });
-            EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
-            EnumeratePart<-1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
+    // Under secure vertical setting, only the active party is able to evaluate the split
+    // based on global histogram. Other parties will receive the final best split information
+    // Hence the below computation is not performed by the passive parties
+    if ((!is_secure_) || (collective::GetRank() == 0)) {
+      // Evaluate the splits for each feature
+      common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
+        auto tidx = omp_get_thread_num();
+        auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
+        auto best = &entry->split;
+        auto nidx = entry->nid;
+        auto histogram = hist[nidx];
+        auto features_set = features[nidx_in_set]->ConstHostSpan();
+        for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) {
+          auto fidx = features_set[fidx_in_set];
+          bool is_cat = common::IsCat(feature_types, fidx);
+          if (!interaction_constraints_.Query(nidx, fidx)) {
+            continue;
           }
-        } else {
-          auto grad_stats = EnumerateSplit<+1>(cut, histogram, fidx, nidx, evaluator, best);
-          if (SplitContainsMissingValues(grad_stats, snode_[nidx])) {
-            EnumerateSplit<-1>(cut, histogram, fidx, nidx, evaluator, best);
+          if (is_cat) {
+            auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
+            if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) {
+              EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
+            } else {
+              std::vector<size_t> sorted_idx(n_bins);
+              std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
+              auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
+              // Sort the histogram to get contiguous partitions.
+              std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
+                auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) <
+                           evaluator.CalcWeightCat(*param_, feat_hist[r]);
+                return ret;
+              });
+              EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
+              EnumeratePart<-1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
+            }
+          } else {
+            auto grad_stats = EnumerateSplit<+1>(cut, histogram, fidx, nidx, evaluator, best);
+            if (SplitContainsMissingValues(grad_stats, snode_[nidx])) {
+              EnumerateSplit<-1>(cut, histogram, fidx, nidx, evaluator, best);
+            }
           }
         }
-      }
-    });
+      });
 
-    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
-      for (auto tidx = 0; tidx < n_threads; ++tidx) {
-        entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
+      for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
+        for (auto tidx = 0; tidx < n_threads; ++tidx) {
+          entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
+        }
       }
     }
 
     if (is_col_split_) {
       // With column-wise data split, we gather the best splits from all the workers and update the
       // expand entries accordingly.
+      // Note that under secure vertical setting, only the label owner is able to evaluate the split
+      // based on the global histogram. The other parties will receive the final best splits
+      // allgather is capable of performing this (0-gain entries for non-label owners),
       auto all_entries = AllgatherColumnSplit(entries);
       for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
         for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
@@ -480,7 +489,8 @@ class HistEvaluator {
         param_{param},
         column_sampler_{std::move(sampler)},
         tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
-        is_col_split_{info.IsColumnSplit()} {
+        is_col_split_{info.IsColumnSplit()},
+        is_secure_{info.IsSecure()}{
     interaction_constraints_.Configure(*param, info.num_col_);
     column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
                           param_->colsample_bynode, param_->colsample_bylevel,
@@ -496,6 +506,7 @@ class HistMultiEvaluator {
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   Context const *ctx_;
   bool is_col_split_{false};
+  bool is_secure_{false};
 
  private:
   static double MultiCalcSplitGain(TrainParam const &param,
@@ -709,7 +720,8 @@ class HistMultiEvaluator {
       : param_{param},
         column_sampler_{std::move(sampler)},
         ctx_{ctx},
-        is_col_split_{info.IsColumnSplit()} {
+        is_col_split_{info.IsColumnSplit()},
+        is_secure_{info.IsSecure()} {
     interaction_constraints_.Configure(*param, info.num_col_);
     column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
                           param_->colsample_bynode, param_->colsample_bylevel,
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index 033d2221ec89..d4cea58d0f72 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -50,6 +50,7 @@ class HistogramBuilder {
   // Whether XGBoost is running in distributed environment.
   bool is_distributed_{false};
   bool is_col_split_{false};
+  bool is_secure_{false};
 
  public:
   /**
@@ -60,13 +61,14 @@ class HistogramBuilder {
    *                         of using global rabit variable.
    */
   void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
-             bool is_col_split, HistMakerTrainParam const *param) {
+             bool is_col_split, bool is_secure, HistMakerTrainParam const *param) {
     n_threads_ = ctx->Threads();
     param_ = p;
     hist_.Reset(total_bins, param->max_cached_hist_node);
     buffer_.Init(total_bins);
     is_distributed_ = is_distributed;
     is_col_split_ = is_col_split;
+    is_secure_ = is_secure;
   }
 
   template <bool any_missing>
@@ -175,6 +177,7 @@ class HistogramBuilder {
                      std::vector<bst_node_t> const &nodes_to_build,
                      std::vector<bst_node_t> const &nodes_to_trick) {
     auto n_total_bins = buffer_.TotalBins();
+
     common::BlockedSpace2d space(
         nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
     common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
@@ -190,6 +193,18 @@ class HistogramBuilder {
           reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
     }
 
+    if (is_distributed_ && is_col_split_ && is_secure_) {
+      // Under secure vertical mode, we perform allgather for all nodes
+      CHECK(!nodes_to_build.empty());
+      // in theory the operation is AllGather, under current histogram setting of
+      // same length with 0s for empty slots,
+      // AllReduce is the most efficient way of achieving the global histogram
+      auto first_nidx = nodes_to_build.front();
+      std::size_t n = n_total_bins * nodes_to_build.size() * 2;
+      collective::Allreduce<collective::Operation::kSum>(
+              reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
+    }
+
     common::BlockedSpace2d const &subspace =
         nodes_to_trick.size() == nodes_to_build.size()
             ? space
@@ -329,12 +344,13 @@ class MultiHistogramBuilder {
   [[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }
 
   void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
-             bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
+             bool is_distributed, bool is_col_split, bool is_secure,
+             HistMakerTrainParam const *param) {
     ctx_ = ctx;
     target_builders_.resize(n_targets);
     CHECK_GE(n_targets, 1);
     for (auto &v : target_builders_) {
-      v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
+      v.Reset(ctx, total_bins, p, is_distributed, is_col_split, is_secure, param);
     }
   }
 };
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 68317fc41a85..c1e3cc3d7c14 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -93,7 +93,7 @@ class GloablApproxBuilder {
 
     histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess),
                              collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
-                             hist_param_);
+                             p_fmat->Info().IsSecure(), hist_param_);
     monitor_->Stop(__func__);
   }
 
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index ced277773055..b042f1631f2e 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -167,7 +167,7 @@ class MultiTargetHistBuilder {
     histogram_builder_ = std::make_unique<MultiHistogramBuilder>();
     histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_),
                               collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
-                              hist_param_);
+                              p_fmat->Info().IsSecure(), hist_param_);
 
     evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
     p_last_tree_ = p_tree;
@@ -357,7 +357,7 @@ class HistUpdater {
                                 fmat->Info().IsColumnSplit());
     }
     histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(),
-                              fmat->Info().IsColumnSplit(), hist_param_);
+                              fmat->Info().IsColumnSplit(), fmat->Info().IsSecure(), hist_param_);
     evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
     p_last_tree_ = p_tree;
     monitor_->Stop(__func__);
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 25a800367c49..76428d1d83b4 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -68,8 +68,8 @@ void TestAddHistRows(bool is_distributed) {
 
   HistMakerTrainParam hist_param;
   HistogramBuilder histogram_builder;
-  histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false,
-                          &hist_param);
+  histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed,
+                          false, false, &hist_param);
   histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false);
 
   for (bst_node_t const &nidx : nodes_to_build) {
@@ -102,7 +102,7 @@ void TestSyncHist(bool is_distributed) {
   HistogramBuilder histogram;
   uint32_t total_bins = gmat.cut.Ptrs().back();
   HistMakerTrainParam hist_param;
-  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param);
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, false, &hist_param);
 
   common::RowSetCollection row_set_collection;
   {
@@ -222,13 +222,13 @@ TEST(CPUHistogram, SyncHist) {
   TestSyncHist(false);
 }
 
-void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
+void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split, bool is_secure) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
   Context ctx;
   auto p_fmat =
       RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
-  if (is_col_split) {
+  if (is_col_split && !is_secure) {
     p_fmat = std::shared_ptr<DMatrix>{
         p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
   }
@@ -244,7 +244,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
   bst_node_t nid = 0;
   HistogramBuilder histogram;
   HistMakerTrainParam hist_param;
-  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param);
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, is_secure, &hist_param);
 
   RegTree tree;
 
@@ -286,22 +286,41 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
   // Now validate the computed histogram returned by BuildHist
   for (size_t i = 0; i < histogram.Histogram()[nid].size(); ++i) {
     GradientPairPrecise sol = histogram_expected[i];
-    ASSERT_NEAR(sol.GetGrad(), histogram.Histogram()[nid][i].GetGrad(), kEps);
-    ASSERT_NEAR(sol.GetHess(), histogram.Histogram()[nid][i].GetHess(), kEps);
+    double grad = sol.GetGrad();
+    double hess = sol.GetHess();
+    if (is_distributed && (!is_col_split || (is_secure && is_col_split))) {
+      // the solution also needs to be allreduce
+      collective::Allreduce<collective::Operation::kSum>(&grad, 1);
+      collective::Allreduce<collective::Operation::kSum>(&hess, 1);
+    }
+    ASSERT_NEAR(grad, histogram.Histogram()[nid][i].GetGrad(), kEps);
+    ASSERT_NEAR(hess, histogram.Histogram()[nid][i].GetHess(), kEps);
   }
 }
 
 TEST(CPUHistogram, BuildHist) {
-  TestBuildHistogram(true, false, false);
-  TestBuildHistogram(false, false, false);
-  TestBuildHistogram(true, true, false);
-  TestBuildHistogram(false, true, false);
+  TestBuildHistogram(true, false, false, false);
+  TestBuildHistogram(false, false, false, false);
+  TestBuildHistogram(true, true, false, false);
+  TestBuildHistogram(false, true, false, false);
+}
+
+TEST(CPUHistogram, BuildHistDist) {
+  auto constexpr kWorkers = 4;
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, false, false);
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, false, false);
+}
+
+TEST(CPUHistogram, BuildHistDistColSplit) {
+  auto constexpr kWorkers = 4;
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, false);
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, false);
 }
 
-TEST(CPUHistogram, BuildHistColSplit) {
+TEST(CPUHistogram, BuildHistDistColSplitSecure) {
   auto constexpr kWorkers = 4;
-  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true);
-  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true);
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, true);
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, true);
 }
 
 namespace {
@@ -360,7 +379,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   HistogramBuilder cat_hist;
   for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
-    cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param);
     cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
     cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
                        linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
@@ -376,7 +395,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   HistogramBuilder onehot_hist;
   for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
-    onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param);
     onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
     onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
                           linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
@@ -442,7 +461,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
     }
     ASSERT_EQ(n_samples, m->Info().num_row_);
 
-    multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+    multi_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param);
     multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
     std::size_t page_idx{0};
     for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
@@ -465,7 +484,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
     common::RowSetCollection row_set_collection;
     InitRowPartitionForTest(&row_set_collection, n_samples);
 
-    single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+    single_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param);
     SparsePage concat;
     std::vector<float> hess(m->Info().num_row_, 1.0f);
     for (auto const &page : m->GetBatches<SparsePage>()) {
@@ -542,7 +561,7 @@ class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
     CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split);
 
     hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed,
-                       Xy->Info().IsColumnSplit(), &hist_param);
+                       Xy->Info().IsColumnSplit(), Xy->Info().IsSecure(), &hist_param);
 
     std::vector<CommonRowPartitioner> partitioners;
     partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0,