From 508fb1baa1d81116269773cd0f691aebce4338dd Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 30 Mar 2022 22:48:28 +0200 Subject: [PATCH 01/19] add predcontrib for sparse inputs --- R-package/NAMESPACE | 5 ++ R-package/R/lgb.Predictor.R | 104 +++++++++++++++++++++- R-package/src/lightgbm_R.cpp | 82 +++++++++++++++++ R-package/src/lightgbm_R.h | 29 ++++++ R-package/tests/testthat/test_Predictor.R | 27 ++++++ 5 files changed, 246 insertions(+), 1 deletion(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 02e886bbcbac..65029b34ebbe 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -38,6 +38,10 @@ export(saveRDS.lgb.Booster) export(set_field) export(slice) import(methods) +importClassesFrom(Matrix,dgCMatrix) +importClassesFrom(Matrix,dgRMatrix) +importClassesFrom(Matrix,dsparseMatrix) +importClassesFrom(Matrix,dsparseVector) importFrom(Matrix,Matrix) importFrom(R6,R6Class) importFrom(data.table,":=") @@ -52,6 +56,7 @@ importFrom(graphics,barplot) importFrom(graphics,par) importFrom(jsonlite,fromJSON) importFrom(methods,is) +importFrom(methods,new) importFrom(stats,quantile) importFrom(utils,modifyList) importFrom(utils,read.delim) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 3ca8ea98348e..9d14dec8d011 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -1,4 +1,5 @@ -#' @importFrom methods is +#' @importFrom methods is new +#' @importClassesFrom Matrix dsparseMatrix dsparseVector dgCMatrix dgRMatrix #' @importFrom R6 R6Class #' @importFrom utils read.delim Predictor <- R6::R6Class( @@ -127,6 +128,107 @@ Predictor <- R6::R6Class( num_row <- nrow(preds) preds <- as.vector(t(preds)) + } else if (predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) { + + ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle) + ncols_out <- integer(1L) + .Call(LGBM_BoosterGetNumClasses_R, private$handle, ncols_out) + ncols_out <- (ncols + 1L) * max(ncols_out, 1L) + if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) { + stop("Resulting matrix of feature contributions is too large for R to handle.") + } + + if (inherits(data, "dsparseVector")) { + + if (length(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols + , length(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , c(0L, as.integer(length(data@x))) + , data@i - 1L + , data@x + , TRUE + , 1L + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dsparseVector") + out@i <- res$indices + 1L + out@x <- res$data + out@length <- ncols_out + return(out) + + } else if (inherits(data, "dgRMatrix")) { + + if (ncol(data) > ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols + , ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@j + , data@x + , TRUE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dgRMatrix") + out@p <- res$indptr + out@j <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), ncols_out)) + return(out) + + } else if (inherits(data, "dgCMatrix")) { + + if (ncol(data) != ncols) { + stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns." + , ncols + , ncol(data))) + } + res <- .Call( + LGBM_BoosterPredictSparseOutput_R + , private$handle + , data@p + , data@i + , data@x + , FALSE + , nrow(data) + , ncols + , start_iteration + , num_iteration + , private$params + ) + out <- new("dgCMatrix") + out@p <- res$indptr + out@i <- res$indices + out@x <- res$data + out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L)) + return(out) + + } else { + + stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s" + , "dsparseVector" + , "dgRMatrix" + , "dgCMatrix" + , paste(class(data) + , collapse = ", "))) + + } + } else { # Not a file, we need to predict from R object diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 27ce7d82e87a..3b444e50fabf 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -65,6 +65,14 @@ SEXP wrapped_R_raw(void *len) { return Rf_allocVector(RAWSXP, *(reinterpret_cast(len))); } +SEXP wrapped_R_int(void *len) { + return Rf_allocVector(INTSXP, *(reinterpret_cast(len))); +} + +SEXP wrapped_R_real(void *len) { + return Rf_allocVector(REALSXP, *(reinterpret_cast(len))); +} + SEXP wrapped_Rf_mkChar(void *txt) { return Rf_mkChar(reinterpret_cast(txt)); } @@ -84,6 +92,14 @@ SEXP safe_R_raw(R_xlen_t len, SEXP *cont_token) { return R_UnwindProtect(wrapped_R_raw, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); } +SEXP safe_R_int(R_xlen_t len, SEXP *cont_token) { + return R_UnwindProtect(wrapped_R_int, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); +} + +SEXP safe_R_real(R_xlen_t len, SEXP *cont_token) { + return R_UnwindProtect(wrapped_R_real, reinterpret_cast(&len), throw_R_memerr, cont_token, *cont_token); +} + SEXP safe_R_mkChar(char *txt, SEXP *cont_token) { return R_UnwindProtect(wrapped_Rf_mkChar, reinterpret_cast(txt), throw_R_memerr, cont_token, *cont_token); } @@ -851,6 +867,72 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle, R_API_END(); } +struct SparseOutputPointers { + void* indptr; int32_t* indices; void* data; int indptr_type; int data_type; + SparseOutputPointers(void* indptr, int32_t* indices, void* data) + : indptr(indptr), indices(indices), data(data) {} +}; + +void delete_SparseOutputPointers(SparseOutputPointers *ptr) { + LGBM_BoosterFreePredictSparse(ptr->indptr, ptr->indices, ptr->data, C_API_DTYPE_INT32, C_API_DTYPE_FLOAT64); + delete ptr; +} + +SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP is_csr, + SEXP nrows, + SEXP ncols, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter) { + SEXP cont_token = PROTECT(R_MakeUnwindCont()); + R_API_BEGIN(); + _AssertBoosterHandleNotNull(handle); + const char* out_names[] = {"indptr", "indices", "data", ""}; + SEXP out = PROTECT(Rf_mkNamed(VECSXP, out_names)); + const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter))); + + int64_t out_len[2]; + void *out_indptr; + int32_t *out_indices; + void *out_data; + + CHECK_CALL(LGBM_BoosterPredictSparseOutput(R_ExternalPtrAddr(handle), + INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices), + REAL(data), C_API_DTYPE_FLOAT64, + Rf_xlength(indptr), Rf_xlength(data), + Rf_asLogical(is_csr)? Rf_asInteger(ncols) : Rf_asInteger(nrows), + C_API_PREDICT_CONTRIB, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), + parameter_ptr, + Rf_asLogical(is_csr)? C_API_MATRIX_TYPE_CSR : C_API_MATRIX_TYPE_CSC, + out_len, &out_indptr, &out_indices, &out_data)); + + std::unique_ptr pointers_struct = { + new SparseOutputPointers( + out_indptr, + out_indices, + out_data), + &delete_SparseOutputPointers + }; + + SEXP out_indptr_R = safe_R_int(out_len[1], &cont_token); + SET_VECTOR_ELT(out, 0, out_indptr_R); + SEXP out_indices_R = safe_R_int(out_len[0], &cont_token); + SET_VECTOR_ELT(out, 1, out_indices_R); + SEXP out_data_R = safe_R_real(out_len[0], &cont_token); + SET_VECTOR_ELT(out, 2, out_data_R); + std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int)); + std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int)); + std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double)); + + UNPROTECT(3); + return out; + R_API_END(); +} + SEXP LGBM_BoosterSaveModel_R(SEXP handle, SEXP num_iteration, SEXP feature_importance_type, diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index b352a5dfd513..0f2a0949b61c 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -574,6 +574,35 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictForMat_R( SEXP out_result ); +/*! +* \brief make feature contribution prediction for a new Dataset +* \param handle Booster handle +* \param indptr array with the index pointer of the data in CSR or CSC format +* \param indices array with the non-zero indices of the data in CSR or CSC format +* \param data array with the non-zero values of the data in CSR or CSC format +* \param is_csr whether the input data is in CSR format or not (pass FALSE for CSC) +* \param nrows number of rows in the data +* \param ncols number of columns in the data +* \param start_iteration Start index of the iteration to predict +* \param num_iteration number of iteration for prediction, <= 0 means no limit +* \param parameter additional parameters +* \return An R list with entries "indptr", "indices", "data", constituting the +* feature contributions in sparse format, in the same storage order as +* the input data. +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_BoosterPredictSparseOutput_R( + SEXP handle, + SEXP indptr, + SEXP indices, + SEXP data, + SEXP is_csr, + SEXP nrows, + SEXP ncols, + SEXP start_iteration, + SEXP num_iteration, + SEXP parameter +); + /*! * \brief save model into file * \param handle Booster handle diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index f33803e39c92..3284bc5ab4e0 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,3 +1,5 @@ +library(Matrix) + VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) @@ -111,3 +113,28 @@ test_that("start_iteration works correctly", { pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) expect_equal(pred_leaf1, pred_leaf2) }) + +test_that("Feature contributions from sparse inputs produce sparse outputs", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = -1L + ) + + Xcsc <- as(X, "CsparseMatrix") + pred_csc <- predict(bst, Xcsc, predcontrib = TRUE) + expect_s4_class(pred_csc, "dgCMatrix") + + Xcsr <- as(X, "RsparseMatrix") + pred_csr <- predict(bst, Xcsr, predcontrib = TRUE) + expect_s4_class(pred_csr, "dgRMatrix") + + Xspv <- as(X[1L, , drop = FALSE], "sparseVector") + pred_spv <- predict(bst, Xspv, predcontrib = TRUE) + expect_s4_class(pred_spv, "dsparseVector") +}) From 557a0050d9359975d43df4f021ff7fa6182685de Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 31 Mar 2022 09:11:09 +0200 Subject: [PATCH 02/19] register newly-added function --- R-package/src/lightgbm_R.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 3b444e50fabf..ed123b3eea17 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -1057,6 +1057,7 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterCalcNumPredict_R" , (DL_FUNC) &LGBM_BoosterCalcNumPredict_R , 8}, {"LGBM_BoosterPredictForCSC_R" , (DL_FUNC) &LGBM_BoosterPredictForCSC_R , 14}, {"LGBM_BoosterPredictForMat_R" , (DL_FUNC) &LGBM_BoosterPredictForMat_R , 11}, + {"LGBM_BoosterPredictSparseOutput_R", (DL_FUNC) &LGBM_BoosterPredictSparseOutput_R, 10}, {"LGBM_BoosterSaveModel_R" , (DL_FUNC) &LGBM_BoosterSaveModel_R , 4}, {"LGBM_BoosterSaveModelToString_R" , (DL_FUNC) &LGBM_BoosterSaveModelToString_R , 3}, {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, From 631c935ecdea77d3399e01d5ca3df876d1bdebcf Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sun, 3 Apr 2022 18:43:57 +0200 Subject: [PATCH 03/19] comments --- R-package/src/lightgbm_R.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index ed123b3eea17..560622788422 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -868,7 +868,11 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle, } struct SparseOutputPointers { - void* indptr; int32_t* indices; void* data; int indptr_type; int data_type; + void* indptr; + int32_t* indices; + void* data; + int indptr_type; + int data_type; SparseOutputPointers(void* indptr, int32_t* indices, void* data) : indptr(indptr), indices(indices), data(data) {} }; From 82900f6b8cd8edea70017a17b7d1722d03309c5f Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sun, 3 Apr 2022 19:19:30 +0200 Subject: [PATCH 04/19] correct wrong types in test --- R-package/tests/testthat/test_Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index dd44b1b5d5d0..f206250892ee 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -139,7 +139,7 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { Xspv <- as(X[1L, , drop = FALSE], "sparseVector") pred_spv <- predict(bst, Xspv, predcontrib = TRUE) expect_s4_class(pred_spv, "dsparseVector") - expect_equal(t(as.matrix(pred_spv)), pred_csc[1L, , drop = FALSE]) + expect_equal(t(as(pred_spv, "CsparseMatrix")), pred_csc[1L, , drop = FALSE]) }) test_that("predictions for regression and binary classification are returned as vectors", { From 6fd08687fa6c628c50166097303db99f4c6f2057 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sun, 3 Apr 2022 20:18:42 +0200 Subject: [PATCH 05/19] forcibly take transpose function from Matrix --- R-package/tests/testthat/test_Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index f206250892ee..cdfc69e7abfc 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -139,7 +139,7 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { Xspv <- as(X[1L, , drop = FALSE], "sparseVector") pred_spv <- predict(bst, Xspv, predcontrib = TRUE) expect_s4_class(pred_spv, "dsparseVector") - expect_equal(t(as(pred_spv, "CsparseMatrix")), pred_csc[1L, , drop = FALSE]) + expect_equal(Matrix::t(as(pred_spv, "CsparseMatrix")), unname(pred_csc[1L, , drop = FALSE])) }) test_that("predictions for regression and binary classification are returned as vectors", { From 398ddb9768819bebc6896ce3b42e83aff2847bec Mon Sep 17 00:00:00 2001 From: David Cortes Date: Mon, 4 Apr 2022 17:27:44 +0200 Subject: [PATCH 06/19] keep row names, test comparison to dense inputs --- R-package/R/lgb.Predictor.R | 7 +++++-- R-package/tests/testthat/test_Predictor.R | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 45c4de588464..a26674fced34 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -188,7 +188,6 @@ Predictor <- R6::R6Class( out@j <- res$indices out@x <- res$data out@Dim <- as.integer(c(nrow(data), ncols_out)) - return(out) } else if (inherits(data, "dgCMatrix")) { @@ -215,7 +214,6 @@ Predictor <- R6::R6Class( out@i <- res$indices out@x <- res$data out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L)) - return(out) } else { @@ -228,6 +226,11 @@ Predictor <- R6::R6Class( } + if (NROW(row.names(data))) { + out@Dimnames[[1L]] <- row.names(data) + } + return(out) + } else { # Not a file, we need to predict from R object diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index cdfc69e7abfc..b7235b98bfcb 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -127,9 +127,12 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { , params = list(min_data_in_leaf = 5L) ) + pred_dense <- predict(bst, X, predcontrib = TRUE) + Xcsc <- as(X, "CsparseMatrix") pred_csc <- predict(bst, Xcsc, predcontrib = TRUE) expect_s4_class(pred_csc, "dgCMatrix") + expect_equal(pred_dense, as.matrix(pred_csc)) Xcsr <- as(X, "RsparseMatrix") pred_csr <- predict(bst, Xcsr, predcontrib = TRUE) From 7d533535fa9aff47bd1904bf05e564c0027e3c29 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Mon, 4 Apr 2022 18:07:14 +0200 Subject: [PATCH 07/19] workaround for passing test while PR for row names is not merged --- R-package/tests/testthat/test_Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index b7235b98bfcb..5ac680a06467 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -132,7 +132,7 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { Xcsc <- as(X, "CsparseMatrix") pred_csc <- predict(bst, Xcsc, predcontrib = TRUE) expect_s4_class(pred_csc, "dgCMatrix") - expect_equal(pred_dense, as.matrix(pred_csc)) + expect_equal(unname(pred_dense), unname(as.matrix(pred_csc))) Xcsr <- as(X, "RsparseMatrix") pred_csr <- predict(bst, Xcsr, predcontrib = TRUE) From 8b016b80e75feeb4fd4c383753315157fef4f13f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 10 Apr 2022 11:21:36 +0300 Subject: [PATCH 08/19] Update R-package/R/lgb.Predictor.R Co-authored-by: James Lamb --- R-package/R/lgb.Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 573322fc283f..638b5e66d5e6 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -183,7 +183,7 @@ Predictor <- R6::R6Class( , num_iteration , private$params ) - out <- new("dgRMatrix") + out <- methods::new("dgRMatrix") out@p <- res$indptr out@j <- res$indices out@x <- res$data From df49aaa2b83cdca4802d70a8fb8eca934c3f8b62 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 10 Apr 2022 11:21:43 +0300 Subject: [PATCH 09/19] Update R-package/R/lgb.Predictor.R Co-authored-by: James Lamb --- R-package/R/lgb.Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 638b5e66d5e6..ee4424fcb24d 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -157,7 +157,7 @@ Predictor <- R6::R6Class( , num_iteration , private$params ) - out <- new("dsparseVector") + out <- methods::new("dsparseVector") out@i <- res$indices + 1L out@x <- res$data out@length <- ncols_out From 5df97ded3e639ebfe18af6bdddd95e7ec44015dd Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 10 Apr 2022 11:21:49 +0300 Subject: [PATCH 10/19] Update R-package/R/lgb.Predictor.R Co-authored-by: James Lamb --- R-package/R/lgb.Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index ee4424fcb24d..2ecc5c6bd9be 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -209,7 +209,7 @@ Predictor <- R6::R6Class( , num_iteration , private$params ) - out <- new("dgCMatrix") + out <- methods::new("dgCMatrix") out@p <- res$indptr out@i <- res$indices out@x <- res$data From 6eeddc48961e2c4e58baee0a574aad719972c4bc Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sun, 10 Apr 2022 10:26:55 +0200 Subject: [PATCH 11/19] proper handling of integer overflow --- R-package/R/lgb.Predictor.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 2ecc5c6bd9be..ec09023d0540 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -133,6 +133,9 @@ Predictor <- R6::R6Class( ncols_out <- integer(1L) .Call(LGBM_BoosterGetNumClasses_R, private$handle, ncols_out) ncols_out <- (ncols + 1L) * max(ncols_out, 1L) + if (is.na(ncols_out)) { + ncols_out <- as.numeric(ncols + 1L) * as.numeric(max(ncols_out, 1L)) + } if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) { stop("Resulting matrix of feature contributions is too large for R to handle.") } From e04c6f4f26b0c43d0e169e6229046b936e93c5ec Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 14 Apr 2022 20:35:45 +0200 Subject: [PATCH 12/19] add test for CSR contrib row names --- R-package/tests/testthat/test_Predictor.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 3e4e2b37e8bb..2960cb484b44 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -195,6 +195,8 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { .expect_has_row_names(pred, Xcsc) pred <- predict(bst, Xcsc, predcontrib = TRUE) .expect_has_row_names(pred, Xcsc) + pred <- predict(bst, as(Xcsc, "RsparseMatrix"), predcontrib = TRUE) + .expect_has_row_names(pred, Xcsc) # sparse matrix without row names Xcopy <- Xcsc From e38a6d79137c3a076dcba3381f492561f78562b2 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sat, 14 May 2022 15:51:54 +0200 Subject: [PATCH 13/19] add more tests for predict(, predcontrib=TRUE) --- R-package/tests/testthat/test_Predictor.R | 47 +++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 3076ea590df7..160288298c47 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -149,6 +149,53 @@ test_that("Feature contributions from sparse inputs produce sparse outputs", { expect_equal(Matrix::t(as(pred_spv, "CsparseMatrix")), unname(pred_csc[1L, , drop = FALSE])) }) +test_that("Sparse feature contribution predictions do not take inputs with wrong number of columns", { + data(mtcars) + X <- as.matrix(mtcars[, -1L]) + y <- as.numeric(mtcars[, 1L]) + dtrain <- lgb.Dataset(X, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 5L) + ) + + X_wrong <- cbind(X, X) + X_wrong <- as(X_wrong, "CsparseMatrix") + expect_error(predict(bst, X_wrong, predcontrib = TRUE)) + + X_wrong <- as(X_wrong, "RsparseMatrix") + expect_error(predict(bst, X_wrong, predcontrib = TRUE)) + + X_wrong <- as(X_wrong, "CsparseMatrix") + X_wrong <- X_wrong[, 1L:3L] + expect_error(predict(bst, X_wrong, predcontrib = TRUE)) +}) + +test_that("Feature contribution predictions do not take non-general CSR or CSC inputs", { + set.seed(123L) + y <- runif(25L) + Dmat <- matrix(runif(625L), nrow = 25L, ncol = 25L) + Dmat <- crossprod(Dmat) + Dmat <- as(Dmat, "symmetricMatrix") + SmatC <- as(Dmat, "sparseMatrix") + SmatR <- as(SmatC, "RsparseMatrix") + + dtrain <- lgb.Dataset(Dmat, label = y, params = list(max_bins = 5L)) + bst <- lgb.train( + data = dtrain + , obj = "regression" + , nrounds = 5L + , verbose = VERBOSITY + , params = list(min_data_in_leaf = 5L) + ) + + expect_error(predict(bst, SmatC, predcontrib = TRUE)) + expect_error(predict(bst, SmatR, predcontrib = TRUE)) +}) + test_that("predict() params should override keyword argument for raw-score predictions", { data(agaricus.train, package = "lightgbm") X <- agaricus.train$data From 5b65fd16e8f556c82ec1fd7d1eabb92847cb24f4 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sat, 14 May 2022 16:09:59 +0200 Subject: [PATCH 14/19] make linter happy --- R-package/tests/testthat/test_Predictor.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 160288298c47..6eac601d2c7d 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -162,7 +162,7 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong , params = list(min_data_in_leaf = 5L) ) - X_wrong <- cbind(X, X) + X_wrong <- X[, c(1L:10:, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") expect_error(predict(bst, X_wrong, predcontrib = TRUE)) @@ -183,7 +183,7 @@ test_that("Feature contribution predictions do not take non-general CSR or CSC i SmatC <- as(Dmat, "sparseMatrix") SmatR <- as(SmatC, "RsparseMatrix") - dtrain <- lgb.Dataset(Dmat, label = y, params = list(max_bins = 5L)) + dtrain <- lgb.Dataset(as.matrix(Dmat), label = y, params = list(max_bins = 5L)) bst <- lgb.train( data = dtrain , obj = "regression" From c00a8ff9a5c0d34a761171fdfbd95b79604579cd Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sat, 14 May 2022 16:16:40 +0200 Subject: [PATCH 15/19] linter --- R-package/tests/testthat/test_Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 6eac601d2c7d..599cccffe6fc 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -162,7 +162,7 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong , params = list(min_data_in_leaf = 5L) ) - X_wrong <- X[, c(1L:10:, 1L:10L)] + X_wrong <- X[, c(1L:10, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") expect_error(predict(bst, X_wrong, predcontrib = TRUE)) From d905910c60a2473a098fe8facdc827165c4d764e Mon Sep 17 00:00:00 2001 From: David Cortes Date: Sat, 14 May 2022 16:28:31 +0200 Subject: [PATCH 16/19] linter --- R-package/tests/testthat/test_Predictor.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 599cccffe6fc..570c62300b6e 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -162,7 +162,7 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong , params = list(min_data_in_leaf = 5L) ) - X_wrong <- X[, c(1L:10, 1L:10L)] + X_wrong <- X[, c(1L:10L, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") expect_error(predict(bst, X_wrong, predcontrib = TRUE)) From edaac4eae2a8a34a5c34c27dae6bf28a3e0c0a2a Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 26 May 2022 22:10:54 +0200 Subject: [PATCH 17/19] check error messages for bad input shapes --- R-package/tests/testthat/test_Predictor.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 570c62300b6e..034c68d29393 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -164,14 +164,14 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong X_wrong <- X[, c(1L:10L, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE)) + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") X_wrong <- as(X_wrong, "RsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE)) + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") X_wrong <- as(X_wrong, "CsparseMatrix") X_wrong <- X_wrong[, 1L:3L] - expect_error(predict(bst, X_wrong, predcontrib = TRUE)) + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") }) test_that("Feature contribution predictions do not take non-general CSR or CSC inputs", { From ab86ad38da5c2e09a9b22b64aa67efe6062262e6 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 26 May 2022 22:40:27 +0200 Subject: [PATCH 18/19] fix regex --- R-package/tests/testthat/test_Predictor.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 034c68d29393..f29126a0788f 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -164,14 +164,14 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong X_wrong <- X[, c(1L:10L, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") X_wrong <- as(X_wrong, "RsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") X_wrong <- as(X_wrong, "CsparseMatrix") X_wrong <- X_wrong[, 1L:3L] - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") }) test_that("Feature contribution predictions do not take non-general CSR or CSC inputs", { From 3f8467ae565657e74830b3efd33d3a1072487506 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 17 Jun 2022 13:38:06 +0200 Subject: [PATCH 19/19] hard-coded number of columns in regex for tests --- R-package/tests/testthat/test_Predictor.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index f29126a0788f..bda397cddeda 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -164,14 +164,14 @@ test_that("Sparse feature contribution predictions do not take inputs with wrong X_wrong <- X[, c(1L:10L, 1L:10L)] X_wrong <- as(X_wrong, "CsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has 20 columns") X_wrong <- as(X_wrong, "RsparseMatrix") - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has 20 columns") X_wrong <- as(X_wrong, "CsparseMatrix") X_wrong <- X_wrong[, 1L:3L] - expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has \\d+ columns") + expect_error(predict(bst, X_wrong, predcontrib = TRUE), regexp = "input data has 3 columns") }) test_that("Feature contribution predictions do not take non-general CSR or CSC inputs", {