From e6f9a686db6c416bf8e382e2000c8866cf21cb5d Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 25 Jan 2022 23:04:04 -0300 Subject: [PATCH 1/6] add missing prediction functions to R interface --- R-package/NAMESPACE | 8 + R-package/R/lgb.Booster.R | 191 +++++++++++- R-package/R/lgb.Predictor.R | 323 +++++++++++++++++++- R-package/R/lgb.restore_handle.R | 3 + R-package/R/utils.R | 14 + R-package/man/lgb.configure_fast_predict.Rd | 112 +++++++ R-package/man/lgb.restore_handle.Rd | 4 + R-package/man/predict.lgb.Booster.Rd | 27 +- R-package/src/lightgbm_R.cpp | 254 +++++++++++++++ R-package/src/lightgbm_R.h | 188 ++++++++++++ R-package/tests/testthat/test_Predictor.R | 123 ++++++++ 11 files changed, 1219 insertions(+), 28 deletions(-) create mode 100644 R-package/man/lgb.configure_fast_predict.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 02e886bbcbac..e18ec12f0c68 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -16,6 +16,7 @@ export(lgb.Dataset.create.valid) export(lgb.Dataset.save) export(lgb.Dataset.set.categorical) export(lgb.Dataset.set.reference) +export(lgb.configure_fast_predict) export(lgb.convert_with_rules) export(lgb.cv) export(lgb.drop_serialized) @@ -38,6 +39,12 @@ export(saveRDS.lgb.Booster) export(set_field) export(slice) import(methods) +importClassesFrom(Matrix,CsparseMatrix) +importClassesFrom(Matrix,RsparseMatrix) +importClassesFrom(Matrix,dgCMatrix) +importClassesFrom(Matrix,dgRMatrix) +importClassesFrom(Matrix,dsparseMatrix) +importClassesFrom(Matrix,dsparseVector) importFrom(Matrix,Matrix) importFrom(R6,R6Class) importFrom(data.table,":=") @@ -52,6 +59,7 @@ importFrom(graphics,barplot) importFrom(graphics,par) importFrom(jsonlite,fromJSON) importFrom(methods,is) +importFrom(methods,new) importFrom(stats,quantile) importFrom(utils,modifyList) importFrom(utils,read.delim) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 311d3f2b910c..8755a043b963 100644 --- a/R-package/R/lgb.Booster.R +++ 
b/R-package/R/lgb.Booster.R @@ -10,6 +10,8 @@ Booster <- R6::R6Class( params = list(), record_evals = list(), + fast_predict_config = list(), + # Finalize will free up the handles finalize = function() { .Call( @@ -509,6 +511,7 @@ Booster <- R6::R6Class( predictor <- Predictor$new( modelfile = private$handle , params = params + , fast_predict_config = self$fast_predict_config ) return( predictor$predict( @@ -530,6 +533,57 @@ Booster <- R6::R6Class( return(Predictor$new(modelfile = private$handle)) }, + configure_fast_predict = function(csr = FALSE, + start_iteration = NULL, + num_iteration = NULL, + rawscore = FALSE, + predleaf = FALSE, + predcontrib = FALSE, + params = list()) { + + self$restore_handle() + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + + if (is.null(num_iteration)) { + num_iteration <- -1L + } + if (is.null(start_iteration)) { + start_iteration <- 0L + } + + if (!csr) { + fun <- LGBM_BoosterPredictForMatSingleRowFastInit_R + } else { + fun <- LGBM_BoosterPredictForCSRSingleRowFastInit_R + } + + fast_handle <- .Call( + fun + , private$handle + , ncols + , rawscore + , predleaf + , predcontrib + , start_iteration + , num_iteration + , lgb.params2str(params = params) + ) + + self$fast_predict_config <- list( + handle = fast_handle + , csr = as.logical(csr) + , ncols = ncols + , start_iteration = start_iteration + , num_iteration = num_iteration + , rawscore = as.logical(rawscore) + , predleaf = as.logical(predleaf) + , predcontrib = as.logical(predcontrib) + , params = params + ) + + return(invisible(NULL)) + }, + # Used for serialization raw = NULL, @@ -727,12 +781,8 @@ Booster <- R6::R6Class( ) ) -#' @name predict.lgb.Booster -#' @title Predict method for LightGBM model -#' @description Predicted values based on class \code{lgb.Booster} -#' @param object Object of class \code{lgb.Booster} -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or -#' a character representing a path to a text file (CSV, TSV, or LibSVM) + +#' 
@name lgb_predict_shared_params #' @param start_iteration int or None, optional (default=None) #' Start index of the iteration to predict. #' If None or <= 0, starts from the first iteration. @@ -746,13 +796,38 @@ Booster <- R6::R6Class( #' for logistic regression would result in predictions for log-odds instead of probabilities. #' @param predleaf whether predict leaf index instead. #' @param predcontrib return per-feature contributions for each record. -#' @param header only used for prediction for text file. True if text file has header -#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several -#' prediction outputs per case. #' @param params a list of additional named parameters. See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ #' the "Predict Parameters" section of the documentation} for a list of parameters and #' valid values. +NULL + +#' @name predict.lgb.Booster +#' @title Predict method for LightGBM model +#' @description Predicted values based on class \code{lgb.Booster} +#' @details If the model object has been configured for fast single-row CSR predictions through +#' \link{lgb.configure_fast_predict}, this function will use the prediction parameters +#' that were configured for it - as such, extra prediction parameters should not be passed +#' here, otherwise the configuration will be ignored and the slow route will be taken. +#' @inheritParams lgb_predict_shared_params +#' @param object Object of class \code{lgb.Booster} +#' @param data a \code{matrix} object, a \code{dgCMatrix}, a \code{dgRMatrix} object, a \code{dsparseVector} object, +#' or a character representing a path to a text file (CSV, TSV, or LibSVM). 
+#' +#' For sparse inputs, if predictions are only going to be made for a single row, it will be faster to +#' use CSR format, in which case the data may be passed as either a single-row CSR matrix (class +#' `dgRMatrix` from package `Matrix`) or as a sparse numeric vector (class `dsparseVector` from +#' package `Matrix`). +#' +#' If single-row predictions are going to be performed frequently, it is recommended to +#' pre-configure the model object for fast single-row sparse predictions through function +#' \link{lgb.configure_fast_predict}. +#' @param header only used for prediction for text file. True if text file has header +#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several +#' prediction outputs per case. +#' +#' If passing `predcontrib=TRUE` and the input data is sparse, this parameter will be forced +#' to `TRUE`, outputting a sparse matrix or vector of the same class as the input data. #' @param ... ignored #' @return For regression or binary classification, it returns a vector of length \code{nrows(data)}. #' For multiclass classification, either a \code{num_class * nrows(data)} vector or @@ -762,6 +837,9 @@ Booster <- R6::R6Class( #' When \code{predleaf = TRUE}, the output is a matrix object with the #' number of columns corresponding to the number of trees. #' +#' If using `predcontrib=TRUE` and the input data is a sparse matrix or sparse vector, +#' the output will also be a sparse matrix or vector of the same class. 
+#' #' @examples #' \donttest{ #' data(agaricus.train, package = "lightgbm") @@ -821,6 +899,10 @@ predict.lgb.Booster <- function(object, )) } + if (!reshape && predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) { + reshape <- TRUE + } + return( object$predict( data = data @@ -836,6 +918,97 @@ predict.lgb.Booster <- function(object, ) } +#' @title Configure Fast Single-Row Predictions +#' @description Pre-configures a LightGBM model object to produce fast single-row predictions +#' for a given input data type, prediction type, and parameters. +#' @details Calling this function multiple times with different parameters might not override +#' the previous configuration and might trigger undefined behavior. +#' +#' Any saved configuration for fast predictions might be lost after making a single-row +#' prediction of a different type than what was configured. +#' +#' In some situations, setting a fast prediction configuration for one type of prediction +#' might cause the prediction function to keep using that configuration for single-row +#' predictions even if the requested type of prediction is different from what was configured. +#' +#' The configuration does not survive de-serializations, so it has to be generated +#' anew in every R process that is going to use it (e.g. if loading a model object +#' through `readRDS`, whatever configuration was there previously will be lost). +#' +#' Requesting a different prediction type or passing parameters to \link{predict.lgb.Booster} +#' will cause it to ignore the fast-predict configuration and take the slow route instead +#' (but be aware that an existing configuration might not always be overridden by supplying +#' different parameters or prediction type, so make sure to check that the output is what +#' was expected when a prediction is to be made on a single row for something different than +#' what is configured). 
+#' +#' Note that, if configuring a non-default prediction type (such as leaf indices), +#' then that type must also be passed in the call to \link{predict.lgb.Booster} in +#' order for it to use the configuration. This also applies for `start_iteration` +#' and `num_iteration`, but \bold{the `params` list must be empty} in the call to `predict`. +#' +#' Predictions about feature contributions do not allow a fast route for CSR inputs, +#' and as such, this function will produce an error if passing `csr=TRUE` and +#' `predcontrib=TRUE` together. +#' @inheritParams lgb_predict_shared_params +#' @param model LightGBM model object (class \code{lgb.Booster}). +#' +#' \bold{The object will be modified in-place}. +#' @param csr Whether the prediction function is going to be called on sparse CSR inputs. +#' If `FALSE`, will be assumed that predictions are going to be called on single-row +#' regular R matrices. +#' @return The same `model` that was passed as input, as invisible, with the desired +#' configuration stored inside it and available to be used in future calls to +#' \link{predict.lgb.Booster}. 
+#' @examples +#' library(lightgbm) +#' data(mtcars) +#' X <- as.matrix(mtcars[, -1L]) +#' y <- mtcars[, 1L] +#' dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) +#' params <- list(min_data_in_leaf = 2L) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , obj = "regression" +#' , nrounds = 5L +#' , verbose = -1L +#' ) +#' lgb.configure_fast_predict(model) +#' +#' x_single <- X[11L, , drop = FALSE] +#' predict(model, x_single) +#' +#' # Will not use it if the prediction to be made +#' # is different from what was configured +#' predict(model, x_single, predleaf = TRUE) +#' @export +lgb.configure_fast_predict <- function(model, + csr = FALSE, + start_iteration = NULL, + num_iteration = NULL, + rawscore = FALSE, + predleaf = FALSE, + predcontrib = FALSE, + params = list()) { + if (!lgb.is.Booster(x = model)) { + stop("lgb.configure_fast_predict: model should be an ", sQuote("lgb.Booster")) + } + if (csr && predcontrib) { + stop("'lgb.configure_fast_predict' does not support feature contributions for CSR data.") + } + model$configure_fast_predict( + csr + , start_iteration + , num_iteration + , rawscore + , predleaf + , predcontrib + , params + ) + return(invisible(model)) +} + #' @name print.lgb.Booster #' @title Print method for LightGBM model #' @description Show summary information about a LightGBM model object (same as \code{summary}). 
diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 3ca8ea98348e..4d24453e04dc 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -1,12 +1,15 @@ -#' @importFrom methods is +#' @importFrom methods is new #' @importFrom R6 R6Class #' @importFrom utils read.delim +#' @importClassesFrom Matrix dsparseMatrix dsparseVector dgCMatrix dgRMatrix CsparseMatrix RsparseMatrix Predictor <- R6::R6Class( classname = "lgb.Predictor", cloneable = FALSE, public = list( + fast_predict_config = list(), + # Finalize will free up the handles finalize = function() { @@ -26,7 +29,7 @@ Predictor <- R6::R6Class( }, # Initialize will create a starter model - initialize = function(modelfile, params = list()) { + initialize = function(modelfile, params = list(), fast_predict_config = list()) { private$params <- lgb.params2str(params = params) handle <- NULL @@ -56,6 +59,8 @@ Predictor <- R6::R6Class( } + self$fast_predict_config <- fast_predict_config + # Override class and store it class(handle) <- "lgb.Booster.handle" private$handle <- handle @@ -127,10 +132,113 @@ Predictor <- R6::R6Class( num_row <- nrow(preds) preds <- as.vector(t(preds)) + } else if (predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) { + + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + ncols_out <- integer(1L) + .Call(LGBM_BoosterGetNumClasses_R, private$handle, ncols_out) + ncols_out <- (ncols + 1L) * max(ncols_out, 1L) + if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) { + stop("Resulting matrix of feature contributions is too large for R to handle.") + } + + if (inherits(data, "dsparseVector")) { + + if (length(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." 
+ , ncols, length(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , c(0L, as.integer(length(data@x))) + , data@i - 1L + , data@x + , TRUE + , 1L + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dsparseVector") + out@i <- res$indices + 1L + out@x <- res$data + out@length <- ncols_out + return(out) + + } else if (inherits(data, "dgRMatrix")) { + + if (ncol(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols, ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@j + , data@x + , TRUE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dgRMatrix") + out@p <- res$indptr + out@j <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), ncols_out)) + if (NROW(data@Dimnames[[1L]])) { + out@Dimnames <- list(data@Dimnames[[1L]], NULL) + } + return(out) + + } else if (inherits(data, "dgCMatrix")) { + + if (ncol(data) != ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." 
+ , ncols, ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@i + , data@x + , FALSE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dgCMatrix") + out@p <- res$indptr + out@i <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L)) + if (NROW(data@Dimnames[[1L]])) { + out@Dimnames <- list(data@Dimnames[[1L]], NULL) + } + return(out) + + } else { + + stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s" + , "dsparseVector", "dgRMatrix", "dgCMatrix", paste(class(data), collapse = ", "))) + + } + } else { # Not a file, we need to predict from R object num_row <- nrow(data) + if (is.null(num_row)) { + num_row <- 1L + } npred <- 0L @@ -157,20 +265,173 @@ Predictor <- R6::R6Class( if (storage.mode(data) != "double") { storage.mode(data) <- "double" } - .Call( - LGBM_BoosterPredictForMat_R - , private$handle - , data - , as.integer(nrow(data)) - , as.integer(ncol(data)) - , as.integer(rawscore) - , as.integer(predleaf) - , as.integer(predcontrib) - , as.integer(start_iteration) - , as.integer(num_iteration) - , private$params - , preds - ) + + if (nrow(data) == 1L) { + + use_fast_config <- self$check_can_use_fast_predict_config( + FALSE + , rawscore + , predleaf + , predcontrib + , start_iteration + , num_iteration + ) + + if (use_fast_config) { + .Call( + LGBM_BoosterPredictForMatSingleRowFast_R + , self$fast_predict_config$handle + , data + , preds + ) + } else { + .Call( + LGBM_BoosterPredictForMatSingleRow_R + , private$handle + , data + , rawscore + , predleaf + , predcontrib + , start_iteration + , num_iteration + , private$params + , preds + ) + } + + } else { + .Call( + LGBM_BoosterPredictForMat_R + , private$handle + , data + , as.integer(nrow(data)) + , as.integer(ncol(data)) + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , 
as.integer(start_iteration) + , as.integer(num_iteration) + , private$params + , preds + ) + } + + } else if (inherits(data, "dsparseVector")) { + + if (length(self$fast_predict_config)) { + ncols <- self$fast_predict_config$ncols + use_fast_config <- self$check_can_use_fast_predict_config( + TRUE + , rawscore + , predleaf + , predcontrib + , start_iteration + , num_iteration + ) + } else { + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + use_fast_config <- FALSE + } + + if (length(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols, length(data))) + } + + if (use_fast_config) { + .Call( + LGBM_BoosterPredictForCSRSingleRowFast_R + , self$fast_predict_config$handle + , data@i - 1L + , data@x + , preds + ) + } else { + .Call( + LGBM_BoosterPredictForCSRSingleRow_R + , private$handle + , data@i - 1L + , data@x + , ncols + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , start_iteration + , num_iteration + , private$params + , preds + ) + } + + } else if (inherits(data, "dgRMatrix")) { + + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + if (ncol(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." 
+ , ncols, ncol(data))) + } + + if (nrow(data) == 1L) { + + if (length(self$fast_predict_config)) { + ncols <- self$fast_predict_config$ncols + use_fast_config <- self$check_can_use_fast_predict_config( + TRUE + ,rawscore + , predleaf + , predcontrib + , start_iteration + , num_iteration + ) + } else { + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + use_fast_config <- FALSE + } + + if (use_fast_config) { + .Call( + LGBM_BoosterPredictForCSRSingleRowFast_R + , self$fast_predict_config$handle + , data@j + , data@x + , preds + ) + } else { + .Call( + LGBM_BoosterPredictForCSRSingleRow_R + , private$handle + , data@j + , data@x + , ncols + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , start_iteration + , num_iteration + , private$params + , preds + ) + } + + } else { + + .Call( + LGBM_BoosterPredictForCSR_R + , private$handle + , data@p + , data@j + , data@x + , ncols + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , start_iteration + , num_iteration + , private$params + , preds + ) + + } } else if (methods::is(data, "dgCMatrix")) { if (length(data@p) > 2147483647L) { @@ -232,6 +493,36 @@ Predictor <- R6::R6Class( return(preds) + }, + + check_can_use_fast_predict_config = function(csr, + rawscore, + predleaf, + predcontrib, + start_iteration, + num_iteration) { + + if (!NROW(self$fast_predict_config)) { + return(FALSE) + } + + if (lgb.is.null.handle(self$fast_predict_config$handle)) { + warning("Model had fast CSR predict configuration, but it is inactive.") + return(FALSE) + } + + if (as.logical(csr) != self$fast_predict_config$csr) { + return(FALSE) + } + + return( + private$params == "" && + self$fast_predict_config$rawscore == rawscore && + self$fast_predict_config$predleaf == predleaf && + self$fast_predict_config$predcontrib == predcontrib && + lgb.equal.or.both.null(self$fast_predict_config$start_iteration, start_iteration) && + 
lgb.equal.or.both.null(self$fast_predict_config$num_iteration, num_iteration) + ) } ), diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index be3036a52986..07c68349740f 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -4,6 +4,9 @@ #' \code{saveRDS}, its underlying C++ object will be blank and needs to be restored to able to use it. Such #' object is restored automatically when calling functions such as \code{predict}, but this function can be #' used to forcibly restore it beforehand. Note that the object will be modified in-place. +#' +#' @details Be aware that fast single-row CSR prediction configurations are not restored through this +#' function - instead, they have to be generated anew with the desired parameters. #' @param model \code{lgb.Booster} object which was de-serialized and whose underlying C++ object and R handle #' need to be restored. #' diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 3cdab4dcfd08..f438cb95e3bf 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -282,3 +282,17 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v params[[main_param_name]] <- alternative_kwarg_value return(params) } + +lgb.equal.or.both.null <- function(a, b) { + if (is.null(a)) { + if (!is.null(b)) { + return(FALSE) + } + return(TRUE) + } else { + if (is.null(b)) { + return(FALSE) + } + return(a == b) + } +} diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd new file mode 100644 index 000000000000..44d7ad75d296 --- /dev/null +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lgb.Booster.R +\name{lgb.configure_fast_predict} +\alias{lgb.configure_fast_predict} +\title{Configure Fast Single-Row Predictions} +\usage{ +lgb.configure_fast_predict( + model, + csr = FALSE, + start_iteration = NULL, + 
num_iteration = NULL, + rawscore = FALSE, + predleaf = FALSE, + predcontrib = FALSE, + params = list() +) +} +\arguments{ +\item{model}{LightGBM model object (class \code{lgb.Booster}). + + \bold{The object will be modified in-place}.} + +\item{csr}{Whether the prediction function is going to be called on sparse CSR inputs. +If `FALSE`, will be assumed that predictions are going to be called on single-row +regular R matrices.} + +\item{start_iteration}{int or None, optional (default=None) +Start index of the iteration to predict. +If None or <= 0, starts from the first iteration.} + +\item{num_iteration}{int or None, optional (default=None) +Limit number of iterations in the prediction. +If None, if the best iteration exists and start_iteration is None or <= 0, the +best iteration is used; otherwise, all iterations from start_iteration are used. +If <= 0, all iterations from start_iteration are used (no limits).} + +\item{rawscore}{whether the prediction should be returned in the form of original untransformed +sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} +for logistic regression would result in predictions for log-odds instead of probabilities.} + +\item{predleaf}{whether predict leaf index instead.} + +\item{predcontrib}{return per-feature contributions for each record.} + +\item{params}{a list of additional named parameters. See +\href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ +the "Predict Parameters" section of the documentation} for a list of parameters and +valid values.} +} +\value{ +The same `model` that was passed as input, as invisible, with the desired + configuration stored inside it and available to be used in future calls to + \link{predict.lgb.Booster}. +} +\description{ +Pre-configures a LightGBM model object to produce fast single-row predictions + for a given input data type, prediction type, and parameters. 
+} +\details{ +Calling this function multiple times with different parameters might not override + the previous configuration and might trigger undefined behavior. + + Any saved configuration for fast predictions might be lost after making a single-row + prediction of a different type than what was configured. + + In some situations, setting a fast prediction configuration for one type of prediction + might cause the prediction function to keep using that configuration for single-row + predictions even if the requested type of prediction is different from what was configured. + + The configuration does not survive de-serializations, so it has to be generated + anew in every R process that is going to use it (e.g. if loading a model object + through `readRDS`, whatever configuration was there previously will be lost). + + Requesting a different prediction type or passing parameters to \link{predict.lgb.Booster} + will cause it to ignore the fast-predict configuration and take the slow route instead + (but be aware that an existing configuration might not always be overridden by supplying + different parameters or prediction type, so make sure to check that the output is what + was expected when a prediction is to be made on a single row for something different than + what is configured). + + Note that, if configuring a non-default prediction type (such as leaf indices), + then that type must also be passed in the call to \link{predict.lgb.Booster} in + order for it to use the configuration. This also applies for `start_iteration` + and `num_iteration`, but \bold{the `params` list must be empty} in the call to `predict`. + + Predictions about feature contributions do not allow a fast route for CSR inputs, + and as such, this function will produce an error if passing `csr=TRUE` and + `predcontrib=TRUE` together. 
+} +\examples{ +library(lightgbm) +data(mtcars) +X <- as.matrix(mtcars[, -1L]) +y <- mtcars[, 1L] +dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) +params <- list(min_data_in_leaf = 2L) +model <- lgb.train( + params = params + , data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L +) +lgb.configure_fast_predict(model) + +x_single <- X[11L, , drop = FALSE] +predict(model, x_single) + +# Will not use it if the prediction to be made +# is different from what was configured +predict(model, x_single, predleaf = TRUE) +} diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 199614241502..4c63e24dd138 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -19,6 +19,10 @@ After a LightGBM model object is de-serialized through functions such as \code{s object is restored automatically when calling functions such as \code{predict}, but this function can be used to forcibly restore it beforehand. Note that the object will be modified in-place. } +\details{ +Be aware that fast single-row CSR prediction configurations are not restored through this +function - instead, they have to be generated anew with the desired parameters. +} \examples{ library(lightgbm) data("agaricus.train") diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 8948a4b17d01..bb6f6870505a 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -21,8 +21,17 @@ \arguments{ \item{object}{Object of class \code{lgb.Booster}} -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or -a character representing a path to a text file (CSV, TSV, or LibSVM)} +\item{data}{a \code{matrix} object, a \code{dgCMatrix}, a \code{dgRMatrix} object, a \code{dsparseVector} object, + or a character representing a path to a text file (CSV, TSV, or LibSVM). 
+ + For sparse inputs, if predictions are only going to be made for a single row, it will be faster to + use CSR format, in which case the data may be passed as either a single-row CSR matrix (class + `dgRMatrix` from package `Matrix`) or as a sparse numeric vector (class `dsparseVector` from + package `Matrix`). + + If single-row predictions are going to be performed frequently, it is recommended to + pre-configure the model object for fast single-row sparse predictions through function + \link{lgb.configure_fast_predict}.} \item{start_iteration}{int or None, optional (default=None) Start index of the iteration to predict. @@ -45,7 +54,10 @@ for logistic regression would result in predictions for log-odds instead of prob \item{header}{only used for prediction for text file. True if text file has header} \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several -prediction outputs per case.} + prediction outputs per case. + + If passing `predcontrib=TRUE` and the input data is sparse, this parameter will be forced + to `TRUE`, outputting a sparse matrix or vector of the same class as the input data.} \item{params}{a list of additional named parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ @@ -62,10 +74,19 @@ For regression or binary classification, it returns a vector of length \code{nro When \code{predleaf = TRUE}, the output is a matrix object with the number of columns corresponding to the number of trees. + + If using `predcontrib=TRUE` and the input data is a sparse matrix or sparse vector, + the output will also be a sparse matrix or vector of the same class. 
} \description{ Predicted values based on class \code{lgb.Booster} } +\details{ +If the model object has been configured for fast single-row CSR predictions through + \link{lgb.configure_fast_predict}, this function will use the prediction parameters + that were configured for it - as such, extra prediction parameters should not be passed + here, otherwise the configuration will be ignored and the slow route will be taken. +} \examples{ \donttest{ data(agaricus.train, package = "lightgbm") diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 781fa0024d9c..ba971f3351cf 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -65,6 +66,14 @@ SEXP wrapped_R_raw(void *len) { return Rf_allocVector(RAWSXP, *(reinterpret_cast(len))); } +SEXP wrapped_R_int(void *len) { + return Rf_allocVector(INTSXP, *(reinterpret_cast(len))); +} + +SEXP wrapped_R_real(void *len) { + return Rf_allocVector(REALSXP, *(reinterpret_cast(len))); +} + SEXP wrapped_Rf_mkChar(void *txt) { return Rf_mkChar(reinterpret_cast(txt)); } @@ -84,6 +93,14 @@ SEXP safe_R_raw(R_xlen_t len, SEXP *cont_token) { return R_UnwindProtect(wrapped_R_raw, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); } +SEXP safe_R_int(R_xlen_t len, SEXP *cont_token) { + return R_UnwindProtect(wrapped_R_int, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); +} + +SEXP safe_R_real(R_xlen_t len, SEXP *cont_token) { + return R_UnwindProtect(wrapped_R_real, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); +} + SEXP safe_R_mkChar(char *txt, SEXP *cont_token) { return R_UnwindProtect(wrapped_Rf_mkChar, reinterpret_cast(txt), throw_R_memerr, cont_token, *cont_token); } @@ -812,6 +829,175 @@ SEXP LGBM_BoosterPredictForCSC_R(SEXP handle, R_API_END(); } +SEXP LGBM_BoosterPredictForCSR_R(SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP ncols, + SEXP 
is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result) { + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + int64_t out_len; + CHECK_CALL(LGBM_BoosterPredictForCSR(R_ExternalPtrAddr(handle), + INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices), + REAL(data), C_API_DTYPE_FLOAT64, + Rf_xlength(indptr), Rf_xlength(data), Rf_asInteger(ncols), + pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + parameter_ptr, &out_len, REAL(out_result))); + UNPROTECT(1); + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_BoosterPredictForCSRSingleRow_R(SEXP handle, + SEXP indices, + SEXP data, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result) { + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + int nnz = (int)Rf_xlength(data); + const int indptr[] = {0, nnz}; + int64_t out_len; + CHECK_CALL(LGBM_BoosterPredictForCSRSingleRow(R_ExternalPtrAddr(handle), + indptr, C_API_DTYPE_INT32, INTEGER(indices), + REAL(data), C_API_DTYPE_FLOAT64, + 2, nnz, Rf_asInteger(ncols), + pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + parameter_ptr, &out_len, REAL(out_result))); + UNPROTECT(1); + return R_NilValue; + R_API_END(); +} + +void LGBM_FastConfigFree_wrapped(SEXP handle) { + LGBM_FastConfigFree(static_cast(R_ExternalPtrAddr(handle))); +} + +SEXP LGBM_BoosterPredictForCSRSingleRowFastInit_R(SEXP handle, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter) { + R_API_BEGIN(); + 
_AssertBoosterHandleNotNull(handle); + int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + FastConfigHandle out_fastConfig; + CHECK_CALL(LGBM_BoosterPredictForCSRSingleRowFastInit(R_ExternalPtrAddr(handle), + pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + C_API_DTYPE_FLOAT64, Rf_asInteger(ncols), + parameter_ptr, &out_fastConfig)); + R_SetExternalPtrAddr(ret, out_fastConfig); + R_RegisterCFinalizerEx(ret, LGBM_FastConfigFree_wrapped, TRUE); + UNPROTECT(2); + return ret; + R_API_END(); +} + +SEXP LGBM_BoosterPredictForCSRSingleRowFast_R(SEXP handle_fastConfig, + SEXP indices, + SEXP data, + SEXP out_result) { + R_API_BEGIN(); + int nnz = (int)Rf_xlength(data); + const int indptr[] = {0, nnz}; + int64_t out_len; + CHECK_CALL(LGBM_BoosterPredictForCSRSingleRowFast(R_ExternalPtrAddr(handle_fastConfig), + indptr, C_API_DTYPE_INT32, INTEGER(indices), + REAL(data), + 2, nnz, + &out_len, REAL(out_result))); + return R_NilValue; + R_API_END(); +} + +struct SparseOutputPointers { + void* indptr; int32_t* indices; void* data; int indptr_type; int data_type; + SparseOutputPointers(void* indptr, int32_t* indices, void* data) + : indptr(indptr), indices(indices), data(data) {}; +}; + +void delete_SparseOutputPointers(SparseOutputPointers *ptr) { + LGBM_BoosterFreePredictSparse(ptr->indptr, ptr->indices, ptr->data, C_API_DTYPE_INT32, C_API_DTYPE_FLOAT64); + delete ptr; +} + +SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP is_csr, + SEXP nrows, + SEXP ncols, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter) { + SEXP cont_token = PROTECT(R_MakeUnwindCont()); + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + const char* out_names[] = {"indptr", "indices", "data", ""}; + SEXP out = PROTECT(Rf_mkNamed(VECSXP, out_names)); 
+ const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + + int64_t out_len[2]; + void *out_indptr; + int32_t *out_indices; + void *out_data; + + CHECK_CALL(LGBM_BoosterPredictSparseOutput(R_ExternalPtrAddr(handle), + INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices), + REAL(data), C_API_DTYPE_FLOAT64, + Rf_xlength(indptr), Rf_xlength(data), + Rf_asLogical(is_csr)? Rf_asInteger(ncols) : Rf_asInteger(nrows), + C_API_PREDICT_CONTRIB, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + parameter_ptr, + Rf_asLogical(is_csr)? C_API_MATRIX_TYPE_CSR : C_API_MATRIX_TYPE_CSC, + out_len, &out_indptr, &out_indices, &out_data)); + + std::unique_ptr pointers_struct = { + new SparseOutputPointers( + out_indptr, + out_indices, + out_data), + &delete_SparseOutputPointers + }; + + SEXP out_indptr_R = safe_R_int(out_len[1], &cont_token); + SET_VECTOR_ELT(out, 0, out_indptr_R); + SEXP out_indices_R = safe_R_int(out_len[0], &cont_token); + SET_VECTOR_ELT(out, 1, out_indices_R); + SEXP out_data_R = safe_R_real(out_len[0], &cont_token); + SET_VECTOR_ELT(out, 2, out_data_R); + std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int)); + std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int)); + std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double)); + + UNPROTECT(3); + return out; + R_API_END(); +} + SEXP LGBM_BoosterPredictForMat_R(SEXP handle, SEXP data, SEXP num_row, @@ -840,6 +1026,66 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle, R_API_END(); } +SEXP LGBM_BoosterPredictForMatSingleRow_R(SEXP handle, + SEXP data, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result) { + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + double* ptr_ret = REAL(out_result); + int64_t out_len; + 
CHECK_CALL(LGBM_BoosterPredictForMatSingleRow(R_ExternalPtrAddr(handle), + REAL(data), C_API_DTYPE_FLOAT64, Rf_xlength(data), 1, + pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + parameter_ptr, &out_len, ptr_ret)); + UNPROTECT(1); + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_BoosterPredictForMatSingleRowFastInit_R(SEXP handle, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter) { + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); + SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + FastConfigHandle out_fastConfig; + CHECK_CALL(LGBM_BoosterPredictForMatSingleRowFastInit(R_ExternalPtrAddr(handle), + pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + C_API_DTYPE_FLOAT64, Rf_asInteger(ncols), + parameter_ptr, &out_fastConfig)); + R_SetExternalPtrAddr(ret, out_fastConfig); + R_RegisterCFinalizerEx(ret, LGBM_FastConfigFree_wrapped, TRUE); + UNPROTECT(2); + return ret; + R_API_END(); +} + +SEXP LGBM_BoosterPredictForMatSingleRowFast_R(SEXP handle_fastConfig, + SEXP data, + SEXP out_result) { + R_API_BEGIN(); + int64_t out_len; + CHECK_CALL(LGBM_BoosterPredictForMatSingleRowFast(R_ExternalPtrAddr(handle_fastConfig), + REAL(data), &out_len, REAL(out_result))); + return R_NilValue; + R_API_END(); +} + SEXP LGBM_BoosterSaveModel_R(SEXP handle, SEXP num_iteration, SEXP feature_importance_type, @@ -962,7 +1208,15 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterPredictForFile_R" , (DL_FUNC) &LGBM_BoosterPredictForFile_R , 10}, {"LGBM_BoosterCalcNumPredict_R" , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R , 8}, {"LGBM_BoosterPredictForCSC_R" , (DL_FUNC) &LGBM_BoosterPredictForCSC_R , 14}, + {"LGBM_BoosterPredictForCSR_R" , (DL_FUNC) &LGBM_BoosterPredictForCSR_R , 
12}, + {"LGBM_BoosterPredictForCSRSingleRow_R", (DL_FUNC) &LGBM_BoosterPredictForCSRSingleRow_R, 11}, + {"LGBM_BoosterPredictForCSRSingleRowFastInit_R", (DL_FUNC) &LGBM_BoosterPredictForCSRSingleRowFastInit_R, 8}, + {"LGBM_BoosterPredictForCSRSingleRowFast_R", (DL_FUNC) &LGBM_BoosterPredictForCSRSingleRowFast_R, 4}, + {"LGBM_BoosterPredictSparseOutput_R", (DL_FUNC) &LGBM_BoosterPredictSparseOutput_R, 10}, {"LGBM_BoosterPredictForMat_R" , (DL_FUNC) &LGBM_BoosterPredictForMat_R , 11}, + {"LGBM_BoosterPredictForMatSingleRow_R", (DL_FUNC) &LGBM_BoosterPredictForMatSingleRow_R, 9}, + {"LGBM_BoosterPredictForMatSingleRowFastInit_R", (DL_FUNC) &LGBM_BoosterPredictForMatSingleRowFastInit_R, 8}, + {"LGBM_BoosterPredictForMatSingleRowFast_R", (DL_FUNC) &LGBM_BoosterPredictForMatSingleRowFast_R, 3}, {"LGBM_BoosterSaveModel_R" , (DL_FUNC) &LGBM_BoosterSaveModel_R , 4}, {"LGBM_BoosterSaveModelToString_R" , (DL_FUNC) &LGBM_BoosterSaveModelToString_R , 3}, {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 562ebec2e7da..6d4a73e3296c 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -527,6 +527,121 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSC_R( SEXP out_result ); +/*! 
+* \brief make prediction for a new Dataset +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class * num_data +* for leaf index, its length is equal to num_class * num_data * num_iteration +* for feature contributions, its length is equal to num_data * num_class * (num_features + 1) +* \param handle Booster handle +* \param indptr array with the index pointer of the data in CSR format +* \param indices array with the non-zero indices of the data in CSR format +* \param data array with the non-zero values of the data in CSR format +* \param ncols number of columns in the data +* \param is_rawscore 1 to get raw predictions, before transformations like +* converting to probabilities, 0 otherwise +* \param is_leafidx 1 to get record of which leaf in each tree +* observations fell into, 0 otherwise +* \param is_predcontrib 1 to get feature contributions, 0 otherwise +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \param out_result prediction result +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSR_R( + SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result +); + +/*! 
+* \brief make prediction for a single row of data +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class +* for leaf index, its length is equal to num_class * num_iteration +* for feature contributions, its length is equal to num_class * (num_features + 1) +* \param handle Booster handle +* \param indices array corresponding to the indices of the columns with non-zero values of the row to predict on +* \param data array corresponding to the non-zero values of row to predict on +* \param is_rawscore 1 to get raw predictions, before transformations like +* converting to probabilities, 0 otherwise +* \param is_leafidx 1 to get record of which leaf in each tree +* observations fell into, 0 otherwise +* \param is_predcontrib 1 to get feature contributions, 0 otherwise +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \param out_result prediction result +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSRSingleRow_R( + SEXP handle, + SEXP indices, + SEXP data, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result +); + +/*! +* \brief Initialize and return a fast configuration handle to use with ``LGBM_BoosterPredictForCSRSingleRowFast_R``. 
+* \param handle Booster handle +* \param ncols number of columns in the data +* \param is_rawscore 1 to get raw predictions, before transformations like +* converting to probabilities, 0 otherwise +* \param is_leafidx 1 to get record of which leaf in each tree +* observations fell into, 0 otherwise +* \param is_predcontrib 1 to get feature contributions, 0 otherwise +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \return Fast configuration handle +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSRSingleRowFastInit_R( + SEXP handle, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter +); + +/*! +* \brief make prediction for a single row of data +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class +* for leaf index, its length is equal to num_class * num_iteration +* for feature contributions, its length is equal to num_class * (num_features + 1) +* \param handle_fastConfig Fast configuration handle +* \param indices array corresponding to the indices of the columns with non-zero values of the row to predict on +* \param data array corresponding to the non-zero values of row to predict on +* \param out_result prediction result +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSRSingleRowFast_R( + SEXP handle_fastConfig, + SEXP indices, + SEXP data, + SEXP out_result +); + /*! * \brief make prediction for a new Dataset * Note: should pre-allocate memory for out_result, @@ -561,6 +676,79 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMat_R( SEXP out_result ); +/*! 
+* \brief make prediction for a single row of data +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class +* for leaf index, its length is equal to num_class * num_iteration +* for feature contributions, its length is equal to num_class * (num_features + 1) +* \param handle Booster handle +* \param data array corresponding to the row to predict on +* \param is_rawscore 1 to get raw predictions, before transformations like +* converting to probabilities, 0 otherwise +* \param is_leafidx 1 to get record of which leaf in each tree +* observations fell into, 0 otherwise +* \param is_predcontrib 1 to get feature contributions, 0 otherwise +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \param out_result prediction result +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMatSingleRow_R( + SEXP handle, + SEXP data, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter, + SEXP out_result +); + +/*! +* \brief Initialize and return a fast configuration handle to use with ``LGBM_BoosterPredictForMatSingleRowFast_R``. 
+* \param handle Booster handle +* \param ncols number of columns in the data +* \param is_rawscore 1 to get raw predictions, before transformations like +* converting to probabilities, 0 otherwise +* \param is_leafidx 1 to get record of which leaf in each tree +* observations fell into, 0 otherwise +* \param is_predcontrib 1 to get feature contributions, 0 otherwise +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \return Fast configuration handle +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMatSingleRowFastInit_R( + SEXP handle, + SEXP ncols, + SEXP is_rawscore, + SEXP is_leafidx, + SEXP is_predcontrib, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter +); + +/*! +* \brief make prediction for a single row of data +* Note: should pre-allocate memory for out_result, +* for normal and raw score: its length is equal to num_class +* for leaf index, its length is equal to num_class * num_iteration +* for feature contributions, its length is equal to num_class * (num_features + 1) +* \param handle_fastConfig Fast configuration handle +* \param data array corresponding to the row to predict on +* \param out_result prediction result +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMatSingleRowFast_R( + SEXP handle_fastConfig, + SEXP data, + SEXP out_result +); + /*! 
* \brief save model into file * \param handle Booster handle diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 5a1927e4e512..917dea2cf588 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -111,3 +111,126 @@ test_that("start_iteration works correctly", { pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) expect_equal(pred_leaf1, pred_leaf2) }) + +test_that("Single-row predictions are identical to multi-row ones", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- mtcars[, 1L] + dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) + params <- list(min_data_in_leaf = 2L) + model <- lgb.train( + params = params + , data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + + x1 <- X[1L, , drop = FALSE] + x11 <- X[11L, , drop = FALSE] + x1_spv <- as(x1, "sparseVector") + x11_spv <- as(x11, "sparseVector") + x1_csr <- as(x1, "RsparseMatrix") + x11_csr <- as(x11, "RsparseMatrix") + + pred_all <- predict(model, X) + pred1_wo_config <- predict(model, x1) + pred11_wo_config <- predict(model, x11) + pred1_spv_wo_config <- predict(model, x1_spv) + pred11_spv_wo_config <- predict(model, x11_spv) + pred1_csr_wo_config <- predict(model, x1_csr) + pred11_csr_wo_config <- predict(model, x11_csr) + + lgb.configure_fast_predict(model) + pred1_w_config <- predict(model, x1) + pred11_w_config <- predict(model, x11) + + model <- lgb.train( + params = params + , data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + lgb.configure_fast_predict(model, csr = TRUE) + pred1_spv_w_config <- predict(model, x1_spv) + pred11_spv_w_config <- predict(model, x11_spv) + pred1_csr_w_config <- predict(model, x1_csr) + pred11_csr_w_config <- predict(model, x11_csr) + + expect_equal(pred1_wo_config, pred_all[1L]) + expect_equal(pred11_wo_config, pred_all[11L]) + 
expect_equal(pred1_spv_wo_config, pred_all[1L]) + expect_equal(pred11_spv_wo_config, pred_all[11L]) + expect_equal(pred1_csr_wo_config, pred_all[1L]) + expect_equal(pred11_csr_wo_config, pred_all[11L]) + + expect_equal(pred1_w_config, pred_all[1L]) + expect_equal(pred11_w_config, pred_all[11L]) + expect_equal(pred1_spv_w_config, pred_all[1L]) + expect_equal(pred11_spv_w_config, pred_all[11L]) + expect_equal(pred1_csr_w_config, pred_all[1L]) + expect_equal(pred11_csr_w_config, pred_all[11L]) +}) + +test_that("Fast-predict configuration accepts non-default prediction types", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- mtcars[, 1L] + dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) + params <- list(min_data_in_leaf = 2L) + model <- lgb.train( + params = params + , data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + + x1 <- X[1L, , drop = FALSE] + x11 <- X[11L, , drop = FALSE] + + pred_all <- predict(model, X, predleaf = TRUE) + pred1_wo_config <- predict(model, x1, predleaf = TRUE) + pred11_wo_config <- predict(model, x11, predleaf = TRUE) + expect_equal(pred1_wo_config, pred_all[1L, , drop = FALSE]) + expect_equal(pred11_wo_config, pred_all[11L, , drop = FALSE]) + + lgb.configure_fast_predict(model, predleaf = TRUE) + pred1_w_config <- predict(model, x1, predleaf = TRUE) + pred11_w_config <- predict(model, x11, predleaf = TRUE) + expect_equal(pred1_w_config, pred_all[1L, , drop = FALSE]) + expect_equal(pred11_w_config, pred_all[11L, , drop = FALSE]) +}) + +test_that("Fast-predict configuration does not block other prediction types", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- mtcars[, 1L] + dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L)) + params <- list(min_data_in_leaf = 2L) + model <- lgb.train( + params = params + , data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + + x1 <- X[1L, , drop = FALSE] + x11 <- X[11L, , drop = FALSE] + + pred_all <- 
predict(model, X) + pred_all_leaf <- predict(model, X, predleaf = TRUE) + + lgb.configure_fast_predict(model) + pred1_w_config <- predict(model, x1) + pred11_w_config <- predict(model, x11) + pred1_leaf_w_config <- predict(model, x1, predleaf = TRUE) + pred11_leaf_w_config <- predict(model, x11, predleaf = TRUE) + + expect_equal(pred1_w_config, pred_all[1L]) + expect_equal(pred11_w_config, pred_all[11L]) + expect_equal(pred1_leaf_w_config, pred_all_leaf[1L, , drop = FALSE]) + expect_equal(pred11_leaf_w_config, pred_all_leaf[11L, , drop = FALSE]) +}) From e01eb290edcdd6c83bba3bcbbe8203b9b82e2f87 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 25 Jan 2022 23:24:45 -0300 Subject: [PATCH 2/6] linter --- R-package/R/lgb.Booster.R | 2 +- R-package/R/lgb.Predictor.R | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 8755a043b963..41b1d241f435 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -540,7 +540,7 @@ Booster <- R6::R6Class( predleaf = FALSE, predcontrib = FALSE, params = list()) { - + self$restore_handle() ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 4d24453e04dc..17e96baa2ae9 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -143,7 +143,7 @@ Predictor <- R6::R6Class( } if (inherits(data, "dsparseVector")) { - + if (length(data) > ncols) { stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." 
, ncols, length(data))) @@ -226,12 +226,12 @@ Predictor <- R6::R6Class( return(out) } else { - + stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s" , "dsparseVector", "dgRMatrix", "dgCMatrix", paste(class(data), collapse = ", "))) - + } - + } else { # Not a file, we need to predict from R object @@ -377,7 +377,7 @@ Predictor <- R6::R6Class( ncols <- self$fast_predict_config$ncols use_fast_config <- self$check_can_use_fast_predict_config( TRUE - ,rawscore + , rawscore , predleaf , predcontrib , start_iteration From 39502bebe475dfc98f47c341d60c95cbb6a8d226 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 25 Jan 2022 23:28:58 -0300 Subject: [PATCH 3/6] add missing function to header --- R-package/src/lightgbm_R.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 6d4a73e3296c..25aed12b3705 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -642,6 +642,35 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForCSRSingleRowFast_R( SEXP out_result ); +/*! +* \brief make feature contribution prediction for a new Dataset +* \param handle Booster handle +* \param indptr array with the index pointer of the data in CSR or CSC format +* \param indices array with the non-zero indices of the data in CSR or CSC format +* \param data array with the non-zero values of the data in CSR or CSC format +* \param is_csr whether the input data is in CSR format or not (pass FALSE for CSC) +* \param nrows number of rows in the data +* \param ncols number of columns in the data +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \return An R list with entries "indptr", "indices", "data", constituting the +* feature contributions in sparse format, in the same storage order as +* the input data. 
+*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictSparseOutput_R( + SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP is_csr, + SEXP nrows, + SEXP ncols, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter +); + /*! * \brief make prediction for a new Dataset * Note: should pre-allocate memory for out_result, From 8469cb1262382738c6248032f6e5ed997978ade3 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 25 Jan 2022 23:36:18 -0300 Subject: [PATCH 4/6] linter --- R-package/src/lightgbm_R.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index ba971f3351cf..8ce1f999b109 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -872,7 +872,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRow_R(SEXP handle, _AssertBoosterHandleNotNull(handle); int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib); const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); - int nnz = (int)Rf_xlength(data); + int nnz = static_cast(Rf_xlength(data)); const int indptr[] = {0, nnz}; int64_t out_len; CHECK_CALL(LGBM_BoosterPredictForCSRSingleRow(R_ExternalPtrAddr(handle), @@ -920,7 +920,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRowFast_R(SEXP handle_fastConfig, SEXP data, SEXP out_result) { R_API_BEGIN(); - int nnz = (int)Rf_xlength(data); + int nnz = static_cast(Rf_xlength(data)); const int indptr[] = {0, nnz}; int64_t out_len; CHECK_CALL(LGBM_BoosterPredictForCSRSingleRowFast(R_ExternalPtrAddr(handle_fastConfig), @@ -935,7 +935,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRowFast_R(SEXP handle_fastConfig, struct SparseOutputPointers { void* indptr; int32_t* indices; void* data; int indptr_type; int data_type; SparseOutputPointers(void* indptr, int32_t* indices, void* data) - : indptr(indptr), indices(indices), data(data) {}; + : indptr(indptr), indices(indices), data(data) {} }; void delete_SparseOutputPointers(SparseOutputPointers *ptr) { @@ 
-992,7 +992,7 @@ SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle, std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int)); std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int)); std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double)); - + UNPROTECT(3); return out; R_API_END(); From e02501f1ee8a7cccdd822748cafecd8c7cddab86 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 23 Mar 2022 22:25:41 +0100 Subject: [PATCH 5/6] remove accidentally added rownames in predcontrib --- R-package/R/lgb.Predictor.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 17e96baa2ae9..a28ba16be524 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -191,9 +191,6 @@ Predictor <- R6::R6Class( out@j <- res$indices out@x <- res$data out@Dim <- as.integer(c(nrow(data), ncols_out)) - if (NROW(data@Dimnames[[1L]])) { - out@Dimnames <- list(data@Dimnames[[1L]], NULL) - } return(out) } else if (inherits(data, "dgCMatrix")) { @@ -220,9 +217,6 @@ Predictor <- R6::R6Class( out@i <- res$indices out@x <- res$data out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L)) - if (NROW(data@Dimnames[[1L]])) { - out@Dimnames <- list(data@Dimnames[[1L]], NULL) - } return(out) } else { From fde3c9f8fc7fefc2da1f087814065f8a3ae13ac8 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 23 Mar 2022 22:56:32 +0100 Subject: [PATCH 6/6] redo badly solved merge conflict --- R-package/R/lgb.Booster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 4144bbda40cf..788ca9830b28 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -880,7 +880,7 @@ predict.lgb.Booster <- function(object, )) } - if (!reshape && predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) { + if (!reshape && predcontrib && inherits(newdata, c("dsparseMatrix", "dsparseVector"))) { reshape 
<- TRUE }