[R-package] Add sparse feature contribution predictions #5108

Merged: 23 commits, Jun 17, 2022
Commits
508fb1b: add predcontrib for sparse inputs (david-cortes, Mar 30, 2022)
557a005: register newly-added function (david-cortes, Mar 31, 2022)
a740dc8: solve merge conflicts (david-cortes, Apr 3, 2022)
631c935: comments (david-cortes, Apr 3, 2022)
82900f6: correct wrong types in test (david-cortes, Apr 3, 2022)
6fd0868: forcibly take transpose function from Matrix (david-cortes, Apr 3, 2022)
398ddb9: keep row names, test comparison to dense inputs (david-cortes, Apr 4, 2022)
7d53353: workaround for passing test while PR for row names is not merged (david-cortes, Apr 4, 2022)
19b706f: solve merge conflict (david-cortes, Apr 5, 2022)
8b016b8: Update R-package/R/lgb.Predictor.R (david-cortes, Apr 10, 2022)
df49aaa: Update R-package/R/lgb.Predictor.R (david-cortes, Apr 10, 2022)
5df97de: Update R-package/R/lgb.Predictor.R (david-cortes, Apr 10, 2022)
6eeddc4: proper handling of integer overflow (david-cortes, Apr 10, 2022)
e04c6f4: add test for CSR contrib row names (david-cortes, Apr 14, 2022)
4ac5a3b: solve merge conflicts (david-cortes, May 14, 2022)
e38a6d7: add more tests for predict(<sparse>, predcontrib=TRUE) (david-cortes, May 14, 2022)
5b65fd1: make linter happy (david-cortes, May 14, 2022)
c00a8ff: linter (david-cortes, May 14, 2022)
d905910: linter (david-cortes, May 14, 2022)
edaac4e: check error messages for bad input shapes (david-cortes, May 26, 2022)
ab86ad3: fix regex (david-cortes, May 26, 2022)
005707d: Merge github.com:microsoft/lightgbm into Rcsr1 (david-cortes, Jun 16, 2022)
3f8467a: hard-coded number of columns in regex for tests (david-cortes, Jun 17, 2022)
5 changes: 5 additions & 0 deletions R-package/NAMESPACE
@@ -38,6 +38,10 @@ export(saveRDS.lgb.Booster)
export(set_field)
export(slice)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgRMatrix)
importClassesFrom(Matrix,dsparseMatrix)
importClassesFrom(Matrix,dsparseVector)
importFrom(Matrix,Matrix)
importFrom(R6,R6Class)
importFrom(data.table,":=")
@@ -52,6 +56,7 @@ importFrom(graphics,barplot)
importFrom(graphics,par)
importFrom(jsonlite,fromJSON)
importFrom(methods,is)
importFrom(methods,new)
importFrom(stats,quantile)
importFrom(utils,modifyList)
importFrom(utils,read.delim)
104 changes: 103 additions & 1 deletion R-package/R/lgb.Predictor.R
@@ -1,4 +1,5 @@
#' @importFrom methods is
#' @importFrom methods is new
#' @importClassesFrom Matrix dsparseMatrix dsparseVector dgCMatrix dgRMatrix
#' @importFrom R6 R6Class
#' @importFrom utils read.delim
Predictor <- R6::R6Class(
@@ -127,6 +128,107 @@ Predictor <- R6::R6Class(
num_row <- nrow(preds)
preds <- as.vector(t(preds))

} else if (predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) {

ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle)
ncols_out <- integer(1L)
.Call(LGBM_BoosterGetNumClasses_R, private$handle, ncols_out)
ncols_out <- (ncols + 1L) * max(ncols_out, 1L)
if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) {
stop("Resulting matrix of feature contributions is too large for R to handle.")
}

if (inherits(data, "dsparseVector")) {

if (length(data) > ncols) {
stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
, ncols
, length(data)))
}
res <- .Call(
LGBM_BoosterPredictSparseOutput_R
, private$handle
, c(0L, as.integer(length(data@x)))
, data@i - 1L
, data@x
, TRUE
, 1L
, ncols
, start_iteration
, num_iteration
, private$params
)
out <- new("dsparseVector")
out@i <- res$indices + 1L
out@x <- res$data
out@length <- ncols_out
return(out)

} else if (inherits(data, "dgRMatrix")) {

if (ncol(data) > ncols) {
stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
, ncols
, ncol(data)))
}
res <- .Call(
LGBM_BoosterPredictSparseOutput_R
, private$handle
, data@p
, data@j
, data@x
, TRUE
, nrow(data)
, ncols
, start_iteration
, num_iteration
, private$params
)
out <- new("dgRMatrix")
out@p <- res$indptr
out@j <- res$indices
out@x <- res$data
out@Dim <- as.integer(c(nrow(data), ncols_out))
return(out)

} else if (inherits(data, "dgCMatrix")) {

if (ncol(data) != ncols) {
stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
, ncols
, ncol(data)))
}
res <- .Call(
LGBM_BoosterPredictSparseOutput_R
, private$handle
, data@p
, data@i
, data@x
, FALSE
, nrow(data)
, ncols
, start_iteration
, num_iteration
, private$params
)
out <- new("dgCMatrix")
out@p <- res$indptr
out@i <- res$indices
out@x <- res$data
out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L))
return(out)

} else {

stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s"
, "dsparseVector"
, "dgRMatrix"
, "dgCMatrix"
, paste(class(data)
, collapse = ", ")))

}

} else {

# Not a file, we need to predict from R object
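In the branch above, the R code passes the Matrix package's slots straight through to the C API: `data@p`/`data@i`/`data@x` for a `dgCMatrix` (CSC), `data@p`/`data@j`/`data@x` for a `dgRMatrix` (CSR), and `data@i - 1L` for a `dsparseVector`, whose `i` slot is 1-based unlike the 0-based matrix slots. As a reminder of what those three arrays encode, here is a minimal pure-Python sketch for illustration only (not LightGBM code):

```python
# Illustrative sketch of the three arrays that dgCMatrix (CSC: slots p/i/x)
# and dgRMatrix (CSR: slots p/j/x) hand to the LightGBM C API.

def dense_to_csc(mat):
    """Compress a dense matrix (list of rows) into CSC arrays."""
    nrow, ncol = len(mat), len(mat[0])
    indptr, indices, data = [0], [], []
    for j in range(ncol):               # CSC walks columns
        for i in range(nrow):
            if mat[i][j] != 0:
                indices.append(i)       # row index of the non-zero
                data.append(mat[i][j])
        indptr.append(len(data))        # column j spans indptr[j]:indptr[j+1]
    return indptr, indices, data

def dense_to_csr(mat):
    """Same idea, but rows are compressed (indices hold column numbers)."""
    nrow, ncol = len(mat), len(mat[0])
    indptr, indices, data = [0], [], []
    for i in range(nrow):
        for j in range(ncol):
            if mat[i][j] != 0:
                indices.append(j)
                data.append(mat[i][j])
        indptr.append(len(data))
    return indptr, indices, data

mat = [[1.0, 0.0, 2.0],
       [0.0, 3.0, 0.0]]
print(dense_to_csc(mat))  # ([0, 1, 2, 3], [0, 1, 0], [1.0, 3.0, 2.0])
print(dense_to_csr(mat))  # ([0, 2, 3], [0, 2, 1], [1.0, 2.0, 3.0])
```

Note the asymmetry the R code relies on: in CSC, `indptr` has one entry per column plus one and `indices` holds row numbers; in CSR the roles are swapped, which is why the `is_csr` flag alone is enough to disambiguate the layout.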
82 changes: 82 additions & 0 deletions R-package/src/lightgbm_R.cpp
@@ -65,6 +65,14 @@ SEXP wrapped_R_raw(void *len) {
return Rf_allocVector(RAWSXP, *(reinterpret_cast<R_xlen_t*>(len)));
}

SEXP wrapped_R_int(void *len) {
return Rf_allocVector(INTSXP, *(reinterpret_cast<R_xlen_t*>(len)));
}

SEXP wrapped_R_real(void *len) {
return Rf_allocVector(REALSXP, *(reinterpret_cast<R_xlen_t*>(len)));
}

SEXP wrapped_Rf_mkChar(void *txt) {
return Rf_mkChar(reinterpret_cast<char*>(txt));
}
@@ -84,6 +92,14 @@ SEXP safe_R_raw(R_xlen_t len, SEXP *cont_token) {
return R_UnwindProtect(wrapped_R_raw, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
}

SEXP safe_R_int(R_xlen_t len, SEXP *cont_token) {
return R_UnwindProtect(wrapped_R_int, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
}

SEXP safe_R_real(R_xlen_t len, SEXP *cont_token) {
return R_UnwindProtect(wrapped_R_real, reinterpret_cast<void*>(&len), throw_R_memerr, cont_token, *cont_token);
}

SEXP safe_R_mkChar(char *txt, SEXP *cont_token) {
return R_UnwindProtect(wrapped_Rf_mkChar, reinterpret_cast<void*>(txt), throw_R_memerr, cont_token, *cont_token);
}
@@ -851,6 +867,72 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle,
R_API_END();
}

struct SparseOutputPointers {
void* indptr; int32_t* indices; void* data; int indptr_type; int data_type;
SparseOutputPointers(void* indptr, int32_t* indices, void* data)
: indptr(indptr), indices(indices), data(data) {}
};

void delete_SparseOutputPointers(SparseOutputPointers *ptr) {
LGBM_BoosterFreePredictSparse(ptr->indptr, ptr->indices, ptr->data, C_API_DTYPE_INT32, C_API_DTYPE_FLOAT64);
delete ptr;
}

SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle,
SEXP indptr,
SEXP indices,
SEXP data,
SEXP is_csr,
SEXP nrows,
SEXP ncols,
SEXP start_iteration,
SEXP num_iteration,
SEXP parameter) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
const char* out_names[] = {"indptr", "indices", "data", ""};
SEXP out = PROTECT(Rf_mkNamed(VECSXP, out_names));
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));

int64_t out_len[2];
void *out_indptr;
int32_t *out_indices;
void *out_data;

CHECK_CALL(LGBM_BoosterPredictSparseOutput(R_ExternalPtrAddr(handle),
INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices),
REAL(data), C_API_DTYPE_FLOAT64,
Rf_xlength(indptr), Rf_xlength(data),
Rf_asLogical(is_csr)? Rf_asInteger(ncols) : Rf_asInteger(nrows),
C_API_PREDICT_CONTRIB, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
parameter_ptr,
Rf_asLogical(is_csr)? C_API_MATRIX_TYPE_CSR : C_API_MATRIX_TYPE_CSC,
out_len, &out_indptr, &out_indices, &out_data));

std::unique_ptr<SparseOutputPointers, decltype(&delete_SparseOutputPointers)> pointers_struct = {
new SparseOutputPointers(
out_indptr,
out_indices,
out_data),
&delete_SparseOutputPointers
};

SEXP out_indptr_R = safe_R_int(out_len[1], &cont_token);
SET_VECTOR_ELT(out, 0, out_indptr_R);
SEXP out_indices_R = safe_R_int(out_len[0], &cont_token);
SET_VECTOR_ELT(out, 1, out_indices_R);
SEXP out_data_R = safe_R_real(out_len[0], &cont_token);
SET_VECTOR_ELT(out, 2, out_data_R);
std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int));
std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int));
std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double));

UNPROTECT(3);
return out;
R_API_END();
}

SEXP LGBM_BoosterSaveModel_R(SEXP handle,
SEXP num_iteration,
SEXP feature_importance_type,
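In the wrapper above, the C API reports `out_len[0]` as the number of stored values (the common length of `indices` and `data`) and `out_len[1]` as the length of `indptr`, and the output buffers are memcpy'd into freshly allocated R vectors before the C-side buffers are freed. The `{indptr, indices, data}` list the wrapper returns can be expanded back to a dense matrix as sketched below; this is a hypothetical pure-Python illustration, not LightGBM code:

```python
# Illustrative sketch: expanding the {indptr, indices, data} triplet returned
# by LGBM_BoosterPredictSparseOutput_R back into a dense matrix, in the same
# storage order (CSR or CSC) as the input data.

def sparse_to_dense(indptr, indices, data, n_major, n_minor, is_csr):
    """Expand CSR (major axis = rows) or CSC (major axis = columns) arrays."""
    nrow, ncol = (n_major, n_minor) if is_csr else (n_minor, n_major)
    out = [[0.0] * ncol for _ in range(nrow)]
    for major in range(n_major):
        # stored entries for this row (CSR) or column (CSC)
        for k in range(indptr[major], indptr[major + 1]):
            if is_csr:
                out[major][indices[k]] = data[k]
            else:
                out[indices[k]][major] = data[k]
    return out

# CSR example: 2 rows, 3 columns, non-zeros (0,0)=1, (0,2)=2, (1,1)=3
dense = sparse_to_dense([0, 2, 3], [0, 2, 1], [1.0, 2.0, 3.0], 2, 3, True)
print(dense)  # [[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]
```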
29 changes: 29 additions & 0 deletions R-package/src/lightgbm_R.h
@@ -574,6 +574,35 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMat_R(
SEXP out_result
);

/*!
* \brief make feature contribution prediction for a new Dataset
* \param handle Booster handle
* \param indptr array with the index pointer of the data in CSR or CSC format
* \param indices array with the non-zero indices of the data in CSR or CSC format
* \param data array with the non-zero values of the data in CSR or CSC format
* \param is_csr whether the input data is in CSR format or not (pass FALSE for CSC)
* \param nrows number of rows in the data
* \param ncols number of columns in the data
* \param start_iteration Start index of the iteration to predict
* \param num_iteration number of iterations for prediction, <= 0 means no limit
* \param parameter additional parameters
* \return An R list with entries "indptr", "indices", "data", constituting the
* feature contributions in sparse format, in the same storage order as
* the input data.
*/
LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictSparseOutput_R(
SEXP handle,
SEXP indptr,
SEXP indices,
SEXP data,
SEXP is_csr,
SEXP nrows,
SEXP ncols,
SEXP start_iteration,
SEXP num_iteration,
SEXP parameter
);

/*!
* \brief save model into file
* \param handle Booster handle
27 changes: 27 additions & 0 deletions R-package/tests/testthat/test_Predictor.R
@@ -1,3 +1,5 @@
library(Matrix)

VERBOSITY <- as.integer(
Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1")
)
@@ -111,3 +113,28 @@ test_that("start_iteration works correctly", {
pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE)
expect_equal(pred_leaf1, pred_leaf2)
})

test_that("Feature contributions from sparse inputs produce sparse outputs", {
data(mtcars)
X <- as.matrix(mtcars[, -1L])
y <- as.numeric(mtcars[, 1L])
dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L))
bst <- lgb.train(
data = dtrain
, obj = "regression"
, nrounds = 5L
, verbose = -1L
)

Xcsc <- as(X, "CsparseMatrix")
pred_csc <- predict(bst, Xcsc, predcontrib = TRUE)
expect_s4_class(pred_csc, "dgCMatrix")

Xcsr <- as(X, "RsparseMatrix")
pred_csr <- predict(bst, Xcsr, predcontrib = TRUE)
expect_s4_class(pred_csr, "dgRMatrix")

Xspv <- as(X[1L, , drop = FALSE], "sparseVector")
pred_spv <- predict(bst, Xspv, predcontrib = TRUE)
expect_s4_class(pred_spv, "dsparseVector")
Collaborator (jameslamb):

Beyond just testing the type of the returned objects, can you also please add assertions that the predicted values are the same for all of these cases, and that they're the same as those predicted for a regular R matrix?

Those .Call() calls involve passing a lot of positional arguments with similar values, so such assertions would give us greater confidence that this is working correctly.

Contributor Author (david-cortes):

I thought missing data was handled the same way as in xgboost, which would mean predictions for sparse inputs should be different from those for dense inputs.

Collaborator (jameslamb):

I believe the sparse and dense data structures here are just different representations in memory of the exact same matrices, and that specialized methods for them in LightGBM are just intended to allow that sparse data to stay sparse throughout training + scoring.

And I believe that's not directly related to the handling of missing data (which is described in more detail in the discussion at #2921 (comment) and at https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html?highlight=missing#missing-value-handle).

Consider the following example:

library(lightgbm)
library(Matrix)

set.seed(708L)

data("EuStockMarkets")

stockDF <- as.data.frame(EuStockMarkets)
feature_names <- c("SMI", "CAC", "FTSE")
target_name <- "DAX"

# randomly set a portion of each feature to NA or 0
for (col_name in feature_names) {
    stockDF[
        sample(
            x = seq_len(nrow(stockDF))
            , size = as.integer(0.01 * nrow(stockDF))
            , replace = FALSE
        )
        , col_name
    ] <- NA_real_
    stockDF[
        sample(
            x = seq_len(nrow(stockDF))
            , size = as.integer(0.01 * nrow(stockDF))
            , replace = FALSE
        )
        , col_name
    ] <- 0.0
}

X_mat <- data.matrix(stockDF[, feature_names])
y <- stockDF[[target_name]]
X_dgCMatrix <- as(X_mat, "dgCMatrix")

bst_mat <- lightgbm::lightgbm(
    data = X_mat
    , label = y
    , objective = "regression"
    , nrounds = 10L
)

bst_dgCMatrix <- lightgbm::lightgbm(
    data = X_dgCMatrix
    , label = y
    , objective = "regression"
    , nrounds = 10L
)


# predicted values don't depend on input type from training time or the type of newdata
preds_mat_mat <- predict(bst_mat, X_mat)
preds_dgCMatrix_mat <- predict(bst_mat, X_dgCMatrix)
preds_mat_dgCMatrix <- predict(bst_dgCMatrix, X_mat)
preds_dgCMatrix_dgCMatrix <- predict(bst_dgCMatrix, X_dgCMatrix)

stopifnot(
    all(
        all(preds_mat_mat == preds_dgCMatrix_mat)
        , all(preds_dgCMatrix_mat == preds_mat_dgCMatrix)
        , all(preds_mat_dgCMatrix == preds_dgCMatrix_dgCMatrix)
    )
)

If you find a case where this is not true and LightGBM is creating different predictions for sparse and dense inputs, I'd consider that a bug worth addressing.

@shiyu1994 @guolinke @StrikerRUS please correct me if I've misspoken.

Contributor Author (david-cortes), Apr 4, 2022:

Good to know; it would be useful to have that in the docs, since xgboost works differently (it treats non-present sparse entries as missing instead of as zeros) and one might assume both libraries work the same way.

Collaborator (jameslamb):

Oh interesting, I did not know that. https://xgboost.readthedocs.io/en/stable/faq.html#why-do-i-see-different-results-with-sparse-and-dense-data

“Sparse” elements are treated as if they were “missing” by the tree booster, and as zeros by the linear booster. For tree models, it is important to use consistent data formats during training and scoring.


would be useful to have that in the docs

LightGBM's documentation does already describe this behavior directly. Please see https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#missing-value-handle

  • LightGBM uses NA (NaN) to represent missing values by default. Change it to use zero by setting zero_as_missing=true
  • When zero_as_missing=false (default), the unrecorded values in sparse matrices (and LightSVM) are treated as zeros.

})
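The behavioral difference discussed above can be sketched with a toy decision stump: LightGBM (with the default `zero_as_missing=false`) treats unstored sparse entries as zeros, while xgboost's tree booster routes them down a learned default branch as if they were missing. Everything below is invented for illustration; neither library's code looks like this:

```python
# Toy sketch: scoring a sparse row whose feature x is simply not stored,
# under the two conventions. The split "x <= 0.5" and the leaf labels are
# made-up values for illustration.

def score(x, absent_as):
    # absent_as = 0.0 mimics treating an unstored sparse entry as a zero
    # (LightGBM's default); absent_as = None mimics treating it as missing,
    # sent down a learned default branch (here: the right child).
    if x is None:
        if absent_as is None:
            return "right-leaf"        # default branch for missing values
        x = absent_as                  # materialize the implicit zero
    return "left-leaf" if x <= 0.5 else "right-leaf"

print(score(None, absent_as=0.0))   # left-leaf: the implicit 0 satisfies x <= 0.5
print(score(None, absent_as=None))  # right-leaf: missing takes the default branch
```

The same unstored entry lands in different leaves under the two conventions, which is why the xgboost FAQ warns that sparse and dense representations of the same data can score differently there, while in LightGBM (by default) they should not.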