From c1221d982934e674f7c3fa8c627a5359cc2720b4 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 10 Dec 2023 20:11:24 +0100 Subject: [PATCH 1/2] add QuantileDMatrix creation from R dense matrices --- R-package/R/xgb.DMatrix.R | 54 +++++++++++++-- R-package/man/xgb.DMatrix.Rd | 27 +++++++- R-package/src/init.c | 2 + R-package/src/xgboost_R.cc | 91 +++++++++++++++++++++++++ R-package/src/xgboost_R.h | 13 ++++ R-package/tests/testthat/test_dmatrix.R | 64 +++++++++++++++++ 6 files changed, 243 insertions(+), 8 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 9be990670e46..c5e37487110e 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -26,6 +26,25 @@ #' @param label_lower_bound Lower bound for survival training. #' @param label_upper_bound Upper bound for survival training. #' @param feature_weights Set feature weights for column sampling. +#' @param as_quantile_dmatrix Whether to generate a QuantileDMatrix instead of a regular DMatrix. +#' +#' A QuantileDMatrix generates quantilized data directly from input for the \code{hist} tree method. +#' This DMatrix variant is primarily designed to save memory in training by avoiding intermediate storage. +#' +#' Currently, QuantileDMatrix creation is only supported from dense matrices (class \code{matrix} from base R). +#' +#' When the resulting object is generated as a QuantileDMatrix, it will have an additional class +#' \code{xgb.QuantileDMatrix} in addition to inheriting from regular \code{xgb.DMatrix}. +#' @param ref The training dataset that provides quantile information, needed when creating a validation/test dataset +#' as a QuantileDMatrix. Must be an \code{xgb.QuantileDMatrix} object. Supplying the training DMatrix as a reference means that the same quantisation applied to +#' the training data is applied to the validation/test data. +#' +#' This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not +#' possible from the supplied inputs. 
+#' @param max_bin The number of histogram bins; should be consistent with the training parameter \code{max_bin}. +#' +#' This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not +#' possible from the supplied inputs. #' #' @details #' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}. @@ -58,11 +77,16 @@ xgb.DMatrix <- function( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + as_quantile_dmatrix = FALSE, + ref = NULL, + max_bin = NULL ) { if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") } + is_quantile_dmatrix <- FALSE + nthread <- as.integer(NVL(nthread, -1)) if (typeof(data) == "character") { if (length(data) > 1) stop("'data' has class 'character' and length ", length(data), @@ -70,7 +94,18 @@ xgb.DMatrix <- function( data <- path.expand(data) handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) } else if (is.matrix(data)) { - handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))) + if (!as_quantile_dmatrix) { + handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, nthread) + } else { + if (!is.null(ref)) { + if (!inherits(ref, "xgb.QuantileDMatrix")) { + stop("'ref' must be an xgb.QuantileDMatrix object.") + } + } + handle <- .Call(XGQuantileDMatrixFromMat_R, data, missing, + nthread, max_bin, ref) + is_quantile_dmatrix <- TRUE + } } else if (inherits(data, "dgCMatrix")) { handle <- .Call( XGDMatrixCreateFromCSC_R, @@ -79,7 +114,7 @@ xgb.DMatrix <- function( data@x, nrow(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else if (inherits(data, "dgRMatrix")) { handle <- .Call( @@ -89,7 +124,7 @@ xgb.DMatrix <- function( data@x, ncol(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else if (inherits(data, "dsparseVector")) { indptr <- c(0L, as.integer(length(data@i))) 
@@ -101,14 +136,18 @@ xgb.DMatrix <- function( data@x, length(data), missing, - as.integer(NVL(nthread, -1)) + nthread ) } else { stop("xgb.DMatrix does not support construction from ", typeof(data)) } dmat <- handle - attributes(dmat) <- list(class = "xgb.DMatrix") + dmat_class <- "xgb.DMatrix" + if (is_quantile_dmatrix) { + dmat_class <- c(dmat_class, "xgb.QuantileDMatrix") + } + attributes(dmat) <- list(class = dmat_class) if (!is.null(label)) { setinfo(dmat, "label", label) @@ -512,7 +551,8 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { #' @method print xgb.DMatrix #' @export print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { - cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') + class_print <- ifelse(inherits(x, "xgb.QuantileDMatrix"), "xgb.QuantileDMatrix", "xgb.DMatrix") + cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ') infos <- character(0) if (length(getinfo(x, 'label')) > 0) infos <- 'label' if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 13dc3d9f56d8..ad3a3a6efe56 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -17,7 +17,10 @@ xgb.DMatrix( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + as_quantile_dmatrix = FALSE, + ref = NULL, + max_bin = NULL ) } \arguments{ @@ -55,6 +58,28 @@ It is useful when a 0 or some other extreme value represents missing values in d \item{label_upper_bound}{Upper bound for survival training.} \item{feature_weights}{Set feature weights for column sampling.} + +\item{as_quantile_dmatrix}{Whether to generate a QuantileDMatrix instead of a regular DMatrix. + +A QuantileDMatrix generates quantilized data directly from input for the \code{hist} tree method. +This DMatrix variant is primarily designed to save memory in training by avoiding intermediate storage. 
+ +Currently, QuantileDMatrix creation is only supported from dense matrices (class \code{matrix} from base R). + +When the resulting object is generated as a QuantileDMatrix, it will have an additional class +\code{xgb.QuantileDMatrix} in addition to inheriting from regular \code{xgb.DMatrix}.} + +\item{ref}{The training dataset that provides quantile information, needed when creating a validation/test dataset +as a QuantileDMatrix. Must be an \code{xgb.QuantileDMatrix} object. Supplying the training DMatrix as a reference means that the same quantisation applied to +the training data is applied to the validation/test data. + +This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not +possible from the supplied inputs.} + +\item{max_bin}{The number of histogram bins; should be consistent with the training parameter \code{max_bin}. + +This is ignored when passing \code{as_quantile_dmatrix = FALSE} or when construction of a QuantileDMatrix is not +possible from the supplied inputs.} } \description{ Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file. 
diff --git a/R-package/src/init.c b/R-package/src/init.c index c35e0ecf5cac..75e5849d9a94 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -48,6 +48,7 @@ extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP); +extern SEXP XGQuantileDMatrixFromMat_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBSetGlobalConfig_R(SEXP); extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); @@ -86,6 +87,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3}, {"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3}, {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2}, + {"XGQuantileDMatrixFromMat_R", (DL_FUNC) &XGQuantileDMatrixFromMat_R, 5}, {"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1}, {"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0}, {"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 9bebc5851299..31a5ba855d2d 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -455,6 +455,97 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) { return ScalarInteger(static_cast<int>(ncol)); } +struct IteratorError : public std::exception {}; + +struct _RMatrixSingleIterator { + int iter; + DMatrixHandle proxy_dmat_handle; + const char *array_str; + + _RMatrixSingleIterator( + DMatrixHandle proxy_dmat_handle, + const char *array_str) : iter(0), proxy_dmat_handle(proxy_dmat_handle), array_str(array_str) {} + + void reset() { + this->iter = 0; + } + + int next() { + if (this->iter >= 1) { + return 0; + } + + int res_code = XGProxyDMatrixSetDataDense(this->proxy_dmat_handle, this->array_str); + if (res_code != 0) { + throw IteratorError(); + } + this->iter++; + return 1; + } +}; + 
+void _reset_RMatrixSingleIterator(DataIterHandle iter) { + static_cast<_RMatrixSingleIterator*>(iter)->reset(); +} + +int _next_RMatrixSingleIterator(DataIterHandle iter) { + return static_cast<_RMatrixSingleIterator*>(iter)->next(); +} + +XGB_DLL SEXP XGQuantileDMatrixFromMat_R(SEXP R_mat, SEXP missing, SEXP n_threads, + SEXP max_bin, SEXP ref_dmat) { + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + R_API_BEGIN(); + DMatrixHandle proxy_dmat_handle; + CHECK_CALL(XGProxyDMatrixCreate(&proxy_dmat_handle)); + DMatrixHandle out_dmat; + int res_code1, res_code2; + + try { + xgboost::Json jconfig{xgboost::Object{}}; + /* FIXME: this 'missing' field should have R_NaInt when the input is an integer matrix. */ + jconfig["missing"] = Rf_asReal(missing); + if (!Rf_isNull(n_threads)) { + jconfig["nthread"] = Rf_asInteger(n_threads); + } + if (!Rf_isNull(max_bin)) { + jconfig["max_bin"] = Rf_asInteger(max_bin); + } + std::string json_str = xgboost::Json::Dump(jconfig); + + DMatrixHandle ref_dmat_handle = nullptr; + if (!Rf_isNull(ref_dmat)) { + ref_dmat_handle = R_ExternalPtrAddr(ref_dmat); + } + + std::string array_str = MakeArrayInterfaceFromRMat(R_mat); + _RMatrixSingleIterator single_iterator(proxy_dmat_handle, array_str.c_str()); + + res_code1 = XGQuantileDMatrixCreateFromCallback( + &single_iterator, + proxy_dmat_handle, + ref_dmat_handle, + _reset_RMatrixSingleIterator, + _next_RMatrixSingleIterator, + json_str.c_str(), + &out_dmat); + res_code2 = XGDMatrixFree(proxy_dmat_handle); + } catch(IteratorError &err) { + XGDMatrixFree(proxy_dmat_handle); + Rf_error(XGBGetLastError()); + } + + CHECK_CALL(res_code2); + CHECK_CALL(res_code1); + + R_SetExternalPtrAddr(ret, out_dmat); + R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); + R_API_END(); + + UNPROTECT(1); + return ret; +} + // functions related to booster void _BoosterFinalizer(SEXP ext) { if (R_ExternalPtrAddr(ext) == NULL) return; diff --git a/R-package/src/xgboost_R.h 
b/R-package/src/xgboost_R.h index 4ec80b5ffb41..dd448adee686 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -133,6 +133,19 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle); */ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle); +/*! + * \brief create quantile dmatrix from dense matrix + * This assumes the matrix is stored in column major format + * \param R_mat R Matrix object + * \param missing which value to represent missing value + * \param n_threads Number of threads used to construct DMatrix from dense matrix. + * \param max_bin Maximum number of bins for building histogram. + * \param ref_dmat Optional reference DMatrix for providing quantile information + * \return created dmatrix + */ +XGB_DLL SEXP XGQuantileDMatrixFromMat_R(SEXP R_mat, SEXP missing, SEXP n_threads, + SEXP max_bin, SEXP ref_dmat); + /*! * \brief create xgboost learner * \param dmats a list of dmatrix handles that will be cached diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index f1eaa9d80f74..dbb028c07f9b 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -322,3 +322,67 @@ test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors", expected_gr <- c(0, 20, 40, 100) expect_equal(info_gr, expected_gr) }) + +test_that("xgb.DMatrix: QuantileDMatrix produces same result as DMatrix", { + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + qdm <- xgb.DMatrix( + data = x, + label = y, + as_quantile_dmatrix = TRUE, + nthread = n_threads, + max_bin = 5 + ) + params <- list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads, + max_bin = 5 + ) + model_qdm <- xgb.train( + params = params, + data = qdm, + nrounds = 2 + ) + pred_qdm <- predict(model_qdm, x) + + dm <- xgb.DMatrix( + data = x, + label = y, + as_quantile_dmatrix = FALSE, + nthread = n_threads + ) + model_dm <- xgb.train( + params = params, + data = dm, + nrounds = 
2 + ) + pred_dm <- predict(model_dm, x) + + expect_equal(pred_qdm, pred_dm) +}) + +test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", { + data(mtcars) + y <- mtcars[, 1] + x <- as.matrix(mtcars[, -1]) + qdm <- xgb.DMatrix( + data = x, + label = y, + as_quantile_dmatrix = TRUE, + nthread = n_threads + ) + params <- list( + tree_method = "exact", + objective = "reg:squarederror", + nthread = n_threads + ) + expect_error({ + xgb.train( + params = params, + data = qdm, + nrounds = 2 + ) + }) +}) From c0d22128d7c6e185cdd4532f2ff41295f742361c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 10 Dec 2023 20:34:30 +0100 Subject: [PATCH 2/2] add static qualifier for non-exported functions --- R-package/src/xgboost_R.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 31a5ba855d2d..5d85553a373e 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -484,11 +484,11 @@ struct _RMatrixSingleIterator { } }; -void _reset_RMatrixSingleIterator(DataIterHandle iter) { +static void _reset_RMatrixSingleIterator(DataIterHandle iter) { static_cast<_RMatrixSingleIterator*>(iter)->reset(); } -int _next_RMatrixSingleIterator(DataIterHandle iter) { +static int _next_RMatrixSingleIterator(DataIterHandle iter) { return static_cast<_RMatrixSingleIterator*>(iter)->next(); }