diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 02e886bbcbac..8c7c8ffd7d68 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -4,6 +4,10 @@ S3method("dimnames<-",lgb.Dataset) S3method(dim,lgb.Dataset) S3method(dimnames,lgb.Dataset) S3method(get_field,lgb.Dataset) +S3method(lightgbm,data.frame) +S3method(lightgbm,dgCMatrix) +S3method(lightgbm,formula) +S3method(lightgbm,matrix) S3method(predict,lgb.Booster) S3method(print,lgb.Booster) S3method(set_field,lgb.Dataset) @@ -38,11 +42,16 @@ export(saveRDS.lgb.Booster) export(set_field) export(slice) import(methods) +importClassesFrom(Matrix,CsparseMatrix) +importClassesFrom(Matrix,dgCMatrix) +importClassesFrom(Matrix,sparseMatrix) +importClassesFrom(Matrix,sparseVector) importFrom(Matrix,Matrix) importFrom(R6,R6Class) importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,data.table) +importFrom(data.table,is.data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setnames) @@ -51,8 +60,11 @@ importFrom(data.table,setorderv) importFrom(graphics,barplot) importFrom(graphics,par) importFrom(jsonlite,fromJSON) +importFrom(methods,as) importFrom(methods,is) +importFrom(parallel,detectCores) importFrom(stats,quantile) +importFrom(utils,head) importFrom(utils,modifyList) importFrom(utils,read.delim) useDynLib(lib_lightgbm , .registration = TRUE) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 311d3f2b910c..a0eddd9d9d65 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -9,6 +9,7 @@ Booster <- R6::R6Class( best_score = NA_real_, params = list(), record_evals = list(), + data_processor = NULL, # Finalize will free up the handles finalize = function() { @@ -497,6 +498,10 @@ Booster <- R6::R6Class( self$restore_handle() + if (!is.null(self$data_processor)) { + data <- self$data_processor$process_new_data(data) + } + if (is.null(num_iteration)) { num_iteration <- self$best_iter } @@ -510,19 +515,20 @@ Booster <- R6::R6Class( modelfile = private$handle , params = params ) - return( - predictor$predict( - data = data - , start_iteration = start_iteration - , num_iteration = num_iteration - , rawscore = rawscore - , predleaf = predleaf - , predcontrib = predcontrib - , header = header - , reshape = reshape - ) + pred <- predictor$predict( + data = data + , start_iteration = start_iteration + , num_iteration = num_iteration + , rawscore = rawscore + , predleaf = predleaf + , predcontrib = predcontrib + , header = header + , reshape = reshape ) - + if (!predleaf && !is.null(self$data_processor)) { + pred <- self$data_processor$process_predictions(pred, predcontrib) + } + return(pred) }, # Transform into predictor @@ -729,10 +735,60 @@ Booster <- R6::R6Class( #' @name predict.lgb.Booster #' @title Predict method for LightGBM model -#' @description Predicted values based on class \code{lgb.Booster} -#' @param object Object of class \code{lgb.Booster} -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or -#' a character representing a path to a text file (CSV, TSV, or LibSVM) +#' @description Predict values on new data based on a boosting model (class \code{lgb.Booster}). +#' @param object Object of class \code{lgb.Booster} from which to make predictions. +#' @param newdata New data on which to make predictions. Allowed types are:\itemize{ +#' \item `data.frame`, \bold{only if} the model object was produced through the \link{lightgbm} +#' interface. 
If the input to \link{lightgbm} was a `formula` or a `data.frame` with +#' categorical columns (`factor` or `character`), then \bold{only} `data.frame` inputs will +#' be accepted here. Columns will be taken according to the names that they had in the data +#' that they were passed to the model (i.e. the input here will be reordered if the order +#' does not match, and will be subsetted if it has additional columns). +#' \item `matrix` from base R. Will be converted to numeric if it isn't already. +#' \item `dgCMatrix` from package `Matrix`. +#' \item `character` with a single entry representing a path to a text file in CSV, TSV, +#' or SVMLight / LibSVM formats. +#' } +#' Other input types are not allowed. +#' +#' Note that, if using the `formula` interface, the user is responsible for making +#' factor variables' levels match to those that were passed in the data to which the model +#' was fitted, and if the model was not produced through the \link{lightgbm} interface +#' (e.g. through \link{lgb.train} or \link{lgb.cv}), then the user is responsible for +#' handling the encoding of categorical variables. +#' @param type Type of prediction to output. Allowed types are:\itemize{ +#' \item `"score"`, which will output the predicted score according to the function +#' objective function being optimized (equivalent to `"link"` in base R's `glm`) - for +#' example, for `objective="binary"`, it will output probabilities, while for +#' `objective="regression"`, it will output predicted values. For objective functions other +#' than multi-class classification, the result will be a numeric vector with number of rows +#' matching to `nrow(newdata)`. For multi-class classification, if passing `reshape=TRUE`, +#' it will output a matrix with columns matching to the number of classes (and if the model +#' object was produced through the \link{lightgbm} interface instead of through +#' \link{lgb.train} or \link{lgb.cv}, it will have class names as column names if available), +#' and if passing `reshape=FALSE`, will output a numeric vector with these same results in +#' row-major order. +#' \item `"class"` (only for binary and multi-class classification objectives), which will +#' output the class with the highest predicted score. If the model object was produced through +#' the \link{lightgbm} interface and the label was a factor variable, the result will be a +#' factor variable with levels matching to classes, otherwise it will be an integer vector +#' with indicating the class number. +#' \item `"raw"`, which will output the non-transformed numbers (sum of predictions from +#' boosting iterations' results) from which the score is produced for a given objective +#' function - for example, for `objective="binary"`, this corresponds to log-odds. The +#' output type is the same as for `type="score"`. +#' \item `"leaf"`, which will output the index of the terminal node / leaf at which +#' each observations falls in each tree in the model, outputted as as integers. If passing +#' `reshape=TRUE`, the result will be a matrix with number of columns matching to number of +#' trees, otherwise it will be a vector with this same matrix in row-major order. +#' \item `"contrib"`, which will return the per-feature contributions for each prediction. +#' If passing `reshape=TRUE`, the result will be a matrix with number of columns matching +#' to number of features that the model saw while fitting, otherwise will be a vector with +#' this same matrix outputted in row-major order. 
If the model object was produced through +#' the \link{lightgbm} interface, `reshape=TRUE` is passed, and the data to which the model +#' was fit had column names, then the output matrix will have column names corresponding to +#' the feature names. +#' } #' @param start_iteration int or None, optional (default=None) #' Start index of the iteration to predict. #' If None or <= 0, starts from the first iteration. @@ -741,26 +797,25 @@ Booster <- R6::R6Class( #' If None, if the best iteration exists and start_iteration is None or <= 0, the #' best iteration is used; otherwise, all iterations from start_iteration are used. #' If <= 0, all iterations from start_iteration are used (no limits). -#' @param rawscore whether the prediction should be returned in the for of original untransformed -#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} -#' for logistic regression would result in predictions for log-odds instead of probabilities. -#' @param predleaf whether predict leaf index instead. -#' @param predcontrib return per-feature contributions for each record. #' @param header only used for prediction for text file. True if text file has header #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several -#' prediction outputs per case. +#' prediction outputs per case. When using `reshape=FALSE`, the output will +#' be in row-major order (contrary to R matrices which assume column-major order). +#' If passing `reshape=TRUE` and `newdata` has row names, the output will also have those +#' row names. +#' @param index1 When producing outputs that correspond to some numeration (such as +#' `type="class"` or `type="leaf"`), whether to make these outputs have a numeration +#' starting at 1 or at zero. Note that the underlying lightgbm core library uses zero-based +#' numeration, thus `index1=FALSE` will be slightly faster. #' @param params a list of additional named parameters. See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{ #' the "Predict Parameters" section of the documentation} for a list of parameters and #' valid values. -#' @param ... ignored -#' @return For regression or binary classification, it returns a vector of length \code{nrows(data)}. -#' For multiclass classification, either a \code{num_class * nrows(data)} vector or -#' a \code{(nrows(data), num_class)} dimension matrix is returned, depending on -#' the \code{reshape} value. -#' -#' When \code{predleaf = TRUE}, the output is a matrix object with the -#' number of columns corresponding to the number of trees. +#' @param ... Ignored. +#' @return Either a matrix with number of rows matching to the number of rows in `newdata`, or +#' a vector with number of entries matching to rows in `newdata`, or a vector representing a +#' matrix in row-major order with number of entries matching to `nrow(newdata)*n_outputs`; +#' depending on the requested `type` and `reshape` parameter. #' #' @examples #' \donttest{ @@ -797,14 +852,13 @@ Booster <- R6::R6Class( #' @importFrom utils modifyList #' @export predict.lgb.Booster <- function(object, - data, + newdata, + type = c("score", "class", "raw", "leaf", "contrib"), start_iteration = NULL, num_iteration = NULL, - rawscore = FALSE, - predleaf = FALSE, - predcontrib = FALSE, header = FALSE, - reshape = FALSE, + reshape = TRUE, + index1 = TRUE, params = list(), ...) 
{ @@ -812,6 +866,30 @@ predict.lgb.Booster <- function(object, stop("predict.lgb.Booster: object should be an ", sQuote("lgb.Booster")) } + if (!is.character(type)) { + stop("'type' must be a character variable.") + } + type <- type[1L] + allowed_type <- c("score", "class", "raw", "leaf", "contrib") + if (!(type %in% allowed_type)) { + stop(sprintf("'type' must be one of the following: %s" + , paste(allowed_type, collapse = ", "))) + } + if (type == "class") { + reshape <- TRUE + } + rawscore <- type == "raw" + predleaf <- type == "leaf" + predcontrib <- type == "contrib" + if (type == "class") { + classif_objectives <- c("binary", "multiclass", "multiclassova") + if (!(object$params$objective %in% classif_objectives)) { + stop(sprintf(paste0("Passed prediction 'type=class', but model is not a classifier" + , "(objective: %s).") + , object$params$objective)) + } + } + additional_params <- list(...) if (length(additional_params) > 0L) { warning(paste0( @@ -821,19 +899,51 @@ predict.lgb.Booster <- function(object, )) } - return( - object$predict( - data = data - , start_iteration = start_iteration - , num_iteration = num_iteration - , rawscore = rawscore - , predleaf = predleaf - , predcontrib = predcontrib - , header = header - , reshape = reshape - , params = params - ) + pred <- object$predict( + data = newdata + , start_iteration = start_iteration + , num_iteration = num_iteration + , rawscore = rawscore + , predleaf = predleaf + , predcontrib = predcontrib + , header = header + , reshape = reshape + , params = params ) + if (type == "class") { + if (object$params$objective == "binary") { + pred <- as.integer(pred >= 0.5) + if (NROW(object$data_processor$label_levels)) { + pred <- pred + 1L + attributes(pred)$levels <- object$data_processor$label_levels + attributes(pred)$class <- "factor" + } else if (index1) { + pred <- pred + 1L + } + } else { + cnames <- colnames(pred) + pred <- max.col(pred) + if (NROW(cnames)) { + if (!is.integer(pred)) { + pred <- as.integer(pred) + } + attributes(pred)$levels <- cnames + attributes(pred)$class <- "factor" + } else if (!index1) { + pred <- pred - 1L + } + } + } else if (type == "leaf" && index1) { + pred <- pred + 1L + } + if (reshape && NROW(row.names(newdata))) { + if (is.null(dim(pred))) { + names(pred) <- row.names(newdata) + } else { + row.names(pred) <- row.names(newdata) + } + } + return(pred) } #' @name print.lgb.Booster diff --git a/R-package/R/lgb.DataProcessor.R b/R-package/R/lgb.DataProcessor.R new file mode 100644 index 000000000000..c468f3352230 --- /dev/null +++ b/R-package/R/lgb.DataProcessor.R @@ -0,0 +1,304 @@ +#' @importFrom data.table is.data.table +#' @importFrom methods as +#' @importClassesFrom Matrix sparseVector sparseMatrix CsparseMatrix dgCMatrix + +DataProcessor <- R6::R6Class( + classname = "lgb.DataProcessor", + public = list( + ncols = NULL, + colnames = NULL, + factor_levels = NULL, + formula = NULL, + formula_terms = NULL, + formula_predict = NULL, + label_levels = NULL, + initialize = function(env_out, + data, + params, + model_formula = NULL, + label = NULL, + weights = NULL, + init_score = NULL) { + + if (!is.null(model_formula)) { + + if (!is.data.frame(data)) { + stop("'lightgbm()' formula interface is only supported for 'data.frame' inputs.") + } + self$formula <- model_formula + formula_terms <- as.character(model_formula) + formula_terms[3L] <- paste0(formula_terms[3L], "-1") + model_formula <- paste0(formula_terms[2L], formula_terms[1L], formula_terms[3L]) + model_formula <- as.formula(model_formula) + 
self$formula_terms <- terms(model_formula, data = data) + self$formula_predict <- delete.response(self$formula_terms) + model_frame <- model.frame(model_formula, data, na.action = NULL) + label <- model.response(model_frame, type = "any") + data <- model.matrix(self$formula_predict, data = model_frame) + + } else { + + self$colnames <- colnames(data) + + if (NROW(self$colnames)) { + + # A replacement of 'deparse1' which was added in R 4.0.0, + # added for compatibility with older R versions + deparse1_ <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "") + + label_nse <- substitute(label) + label_nse <- eval.parent(substitute(substitute(label_nse)), n = 2L) + label_nse <- deparse1_(label_nse) + if (label_nse != "NULL" && label_nse %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != label_nse] + if (data.table::is.data.table(data)) { + label <- data[, label_nse, with = FALSE, drop = TRUE] + } else { + label <- data[, label_nse, drop = TRUE] + } + } else if (is.character(label) && NROW(label) == 1L && label %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != label] + if (data.table::is.data.table(data)) { + label <- data[, label, with = FALSE, drop = TRUE] + } else { + label <- data[, label, drop = TRUE] + } + } + + weights_nse <- substitute(weights) + weights_nse <- eval.parent(substitute(substitute(weights_nse)), n = 2L) + weights_nse <- deparse1_(weights_nse) + if (weights_nse != "NULL" && weights_nse %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != weights_nse] + if (data.table::is.data.table(data)) { + weights <- data[, weights_nse, with = FALSE, drop = TRUE] + } else { + weights <- data[, weights_nse, drop = TRUE] + } + } else if (is.character(weights) && NROW(weights) == 1L && weights %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != weights] + if (data.table::is.data.table(data)) { + weights <- data[, weights, with = FALSE, drop = TRUE] + } else { + weights <- data[, weights, drop = TRUE] + } + } + + init_score_nse <- substitute(init_score) + init_score_nse <- eval.parent(substitute(substitute(init_score_nse)), n = 2L) + init_score_nse <- deparse1_(init_score_nse) + if (init_score_nse != "NULL" && init_score_nse %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != init_score_nse] + if (data.table::is.data.table(data)) { + init_score <- data[, init_score_nse, with = FALSE, drop = TRUE] + } else { + init_score <- data[, init_score_nse, drop = TRUE] + } + } else if (is.character(init_score) && NROW(init_score) == 1L && init_score %in% self$colnames) { + self$colnames <- self$colnames[self$colnames != init_score] + if (data.table::is.data.table(data)) { + init_score <- data[, init_score, with = FALSE, drop = TRUE] + } else { + init_score <- data[, init_score, drop = TRUE] + } + } + + if (length(self$colnames) < ncol(data)) { + if (data.table::is.data.table(data)) { + data <- data[, self$colnames, with = FALSE, drop = FALSE] + } else { + data <- data[, self$colnames, drop = FALSE] + } + } + } else { + self$colnames <- NULL + } + + self$ncols <- ncol(data) + + if (is.data.frame(data)) { + + supported_types <- c("numeric", "integer", "factor", "character", "Date", "POSIXct") + coltype_is_supported <- sapply(data, function(col) inherits(col, supported_types)) + if (!all(coltype_is_supported)) { + unsupported_types <- unique(unlist(lapply( + data + , function(col) if (inherits(col, supported_types)) NULL else type(col) + ))) + stop(sprintf("Error: 'lightgbm()' received 
'data' with unsupported column types: %s" + , paste(head(unsupported_types, 5L)), collapse = ", ")) + } + + data <- data.table::as.data.table(data) + + cols_char <- names(data)[sapply(data, is.character)] + if (NROW(cols_char)) { + suppressWarnings(data[, (cols_char) := lapply(.SD, factor), .SDcols = cols_char]) + } + + cols_factors <- names(data)[sapply(data, is.factor)] + if (NROW(cols_factors)) { + has_ordered_factor <- any(sapply(data, is.ordered)) + if (has_ordered_factor) { + warning(paste0("'lighgbm()' was passed data with ordered factors." + , "The order in factor levels is ignored.")) + } + self$factor_levels <- lapply(data[, cols_factors, with = FALSE, drop = FALSE], levels) + data[ + , (cols_factors) := lapply(.SD, function(x) { + x <- as.numeric(x) - 1.0 + x[is.na(x)] <- -1.0 + return(x) + }) + , .SDcols = cols_factors + ] + + params$categorical_feature <- which(names(data) %in% cols_factors) + } else { + params$categorical_feature <- NULL + } + + data <- as.matrix(data, drop = FALSE) + } + } + + if (is.character(label)) { + label <- factor(label) + } + if (!is.factor(label)) { + label <- as.numeric(label) + env_out$objective <- "regression" + } else { + self$label_levels <- levels(label) + if (length(levels(label)) <= 1L) { + stop("Labels to predict is a factor with <2 possible values.") + } else if (length(levels(label)) == 2L) { + env_out$objective <- "binary" + } else { + env_out$objective <- "multiclass" + } + label <- as.numeric(label) - 1.0 + } + + if (!is.numeric(label)) { + label <- as.numeric(label) + } + if (length(label) != nrow(data)) { + stop("Labels to predict must have length equal to the number of rows in 'X'/'data'.") + } + + if (!is.null(weights)) { + weights <- as.numeric(weights) + if (length(weights) != nrow(data)) { + stop("'weights' must have length equal to the number of rows in 'X'/'data'.") + } + } + if (!is.null(init_score)) { + init_score <- as.numeric(init_score) + if (length(weights) != nrow(data)) { + stop("'init_score' must have length equal to the number of rows in 'X'/'data'.") + } + } + + dataset <- lgb.Dataset( + data = data + , label = label + , weight = weights + , init_score = init_score + , params = params + ) + env_out$dataset <- dataset + }, + + process_new_data = function(data) { + if (!is.null(self$formula_predict)) { + + data <- model.matrix(self$formula_predict, data = data) + + } else { + + if (is.null(dim(data))) { + if (inherits(data, "sparseVector")) { + data <- t(as(data, "CsparseMatrix")) + if (!inherits(data, "dgCMatrix")) { + data <- as(data, "dgCMatrix") + } + } else { + data <- matrix(data, nrow = 1L) + } + } + + if (ncol(data) < self$ncols) { + stop(sprintf("New data has fewer columns than expected (%d vs %d)" + , ncol(data), self$ncols)) + } + + if (NROW(self$colnames)) { + if (data.table::is.data.table(data)) { + data <- data[, self$colnames, with = FALSE, drop = FALSE] + } else { + data <- data[, self$colnames, drop = FALSE] + } + } else { + if (ncol(data) > self$ncols) { + if (data.table::is.data.table(data)) { + data <- data[, 1L:self$ncols, with = FALSE, drop = FALSE] + } else { + data <- data[, 1L:self$ncols, drop = FALSE] + } + } + } + + if (NROW(self$factor_levels)) { + if (!is.data.frame(data)) { + stop(paste0("When calling 'lightgbm()' on a 'data.frame' with factor columns," + , "new data to predict on must also be passed as 'data.frame'.")) + } + data <- as.data.table(data) + cols_cat <- names(self$factor_levels) + data[ + , (cols_cat) := mapply( + factor + , .SD + , self$factor_levels + , SIMPLIFY = FALSE + 
) + , .SDcols = cols_cat + ][ + , (cols_cat) := lapply(.SD, function(x) { + x <- as.numeric(x) - 1.0 + x[is.na(x)] <- -1.0 + return(x) + }) + , .SDcols = cols_cat + ] + } + } + + if (is.data.frame(data)) { + data <- as.matrix(data, drop = FALSE) + } + + return(data) + }, + + process_predictions = function(pred, is_contrib = FALSE) { + if (!is_contrib && NROW(self$label_levels)) { + if (is.matrix(pred) && ncol(pred) == length(self$label_levels)) { + colnames(pred) <- self$label_levels + } + } + if (is_contrib) { + if (NROW(self$colnames) && ncol(pred) == NROW(self$colnames) + 1L) { + colnames(pred) <- c(self$colnames, "(Intercept)") + } else if (!is.null(self$formula_terms)) { + term_labels <- attributes(self$formula_terms)$term.labels + if (length(term_labels) + 1L == ncol(pred)) { + colnames(pred) <- c(term_labels, "(Intercept)") + } + } + } + return(pred) + } + ) +) diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index be3036a52986..ca36749b1187 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -15,8 +15,9 @@ #' model <- lightgbm( #' agaricus.train$data #' , agaricus.train$label -#' , params = list(objective = "binary", nthreads = 1L) +#' , objective = "binary" #' , nrounds = 5L +#' , nthreads = 1L #' , save_name = NULL #' , verbose = 0) #' fname <- tempfile(fileext="rds") diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index b40c8cc21c04..e0944075e4d8 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -86,68 +86,376 @@ #' @keywords internal NULL -#' @name lightgbm +#' @rdname lightgbm #' @title Train a LightGBM model -#' @description Simple interface for training a LightGBM model. +#' @description Simplified interface for training / fitting a LightGBM model which follows typical +#' R idioms for model fitting and predictions. Note that this interface does not +#' expose the full spectrum of library features as \link{lgb.train} does. +#' @details This is a thin wrapper over \link{lgb.Dataset} and then \link{lgb.train} which performs +#' extra steps such as automatically detecting categorical variables and handling their +#' encoding. It is intended as an easy-to-use interface that follows common R idioms for +#' predictive models. +#' +#' It uses base R's functions for processing the data, such as `factor`, which are not +#' particularly efficient - for serious usage, it is recommended to use the \link{lgb.train} +#' interface with \link{lgb.Dataset} instead, handling aspects such as encoding of categorical +#' variables externally through your favorite tools. +#' +#' \bold{Important:} using the `formula` interface relies on R's own formula handling, which +#' might be very slow for large inputs and will dummy-encode all categorical variables +#' (meaning: they will not be treated as categorical in tree splits, rather each level will be +#' treated as a separate variable, without exploiting the sparsity and independence patterns +#' in the encoded data). +#' +#' When models are produced through this interface (as opposed to \link{lgb.train}), the +#' method \link{predict.lgb.Booster} will additionally gain new behaviors, such as taking +#' columns by name from the new input data or adding names to the resulting predicted matrices +#' (based on the classes or features depending on what is being predicted). #' @inheritParams lgb_shared_params -#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. 
If not NULL, will set to dataset +#' @param formula A formula for specifying the response/label and predictors/features in the +#' model to be fitted. This is provided for ease of use, but using the `formula` interface +#' is discouraged for a couple reasons (see details section for mode details):\itemize{ +#' \item It converts all factor variables to dummy encoding, which typically does not lead to +#' models as good as those in which categorical variables are treated as such. +#' \item It uses base R's formula handling for inputs, which can be particularly +#' computationally inefficient compared to the alternatives. +#' \item If the number of variables is large, it can increase model size quite a bit. +#' } +#' +#' If using the `formula` interface, then `data` must be a `data.frame`. +#' @param data A `data.frame`. In the non-formula interface, it will use all available variables +#' (those not specified as being `label`, `weight`, or `init_score`) as features / predictors, +#' and will assume their types are:\itemize{ +#' \item Numeric, if they are of type `numeric`, `integer`, `Date`, `POSIXct`. +#' \item Categorical, if they are of type `factor`, `character`. +#' } +#' +#' Other variable types are not accepted. Note that the underlying core library only accepts +#' `numeric` inputs, thus other types will end up being casted. +#' +#' Note that, if using the `data.frame` interface, it is not possible to manually specify +#' categorical variables through `params` - instead, these will be deduced from the data types, +#' and their encoding will be handled internally in the fitting and prediction functions. +#' Under the `data.frame` interface, if the data contains any categorical variables, then at +#' prediction time only `data.frame` inputs will be allowed. +#' @param X Data features / covariates / predictors with which the model will try to predict `y`. +#' +#' Note that, if using non-standard evaluation for `y`, `weights`, or `init_score` (specifying +#' them as column names from `X`), then `X` will be subsetted, and any additional parameters +#' passed that correspond to column indices (such as per-column `max_bin` or +#' `categorical_features`) will be applied on the subsetted data. +#' +#' Supports dense matrices from base R (class `matrix`, will be casted to `double` storage +#' mode if it isn't already) and sparse matrices in CSC format from the `Matrix` package +#' (class `dgCMatrix`). +#' @param y,label Target / response variable to predict. May be passed as:\itemize{ +#' \item The name of a column from `X` / `data`, if it has column names. Will use non-standard +#' evaluation in order to try to determine if it matches with the name of a column in +#' `X` / `data` (i.e. will accept it as the name of a column without putting quotes +#' around it), and can also be passed as a character. +#' \item A vector with the number of entries matching to the number of rows in `X` / `data`. +#' } +#' If passing `objective="auto"`, the optimization objective will be determined according to +#' the type / class of this variable. +#' +#' If `y` is passed as a factor, then `num_class` in `params` will be set automatically +#' according to its levels. +#' +#' Passing `y` as a factor will also make \link{predict.lgb.Booster} use its levels in the +#' outputs from predictions when appropriate. +#' @param weights Sample / observation weights for rows in `X` / `data`. Same format as +#' `y` (i.e. accepts non-standard evaluation for column names, and accepts numeric vectors). 
+#' @param init_score Initial values for each observation from which the boosting process will +#' be started (e.g. as the result of some previous model). If not passing it (the default), +#' will start from a blank state. +#' @param objective Optimization objective (e.g. `"regression"`, `"binary"`, etc.). +#' For a list of accepted objectives, see +#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ +#' the "Parameters" section of the documentation}. +#' +#' If passing `"auto"`, will be deduced from the type of `y` / `label`:\itemize{ +#' \item If `y` is not a factor, will set the objective to `"regression"`. +#' \item If `y` is a factor with two classes, will set the objective to `"binary"`. +#' \item If `y` is a factor with more than two classes, will set the objective to `"multiclass"`. +#' } +#' +#' If `y` is a factor, then it will automatically set parameter `num_classes` based on +#' its number of levels, overriding any such entry in `params` if it is present there. +#' @param nthreads Number of parallel threads to use. For best speed, this should be set to the number of +#' physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the +#' number of maximum threads (e.g. `nthreads = max(parallel::detectCores() / 2L, 1L)` as +#' a shorthand for the optimal value). +#' +#' Be aware that using too many threads can result in speed degradation in smaller datasets +#' (see the parameters documentation for more details). +#' +#' If passing zero, will use the default number of threads configured for OpenMP. +#' +#' This parameter overrides `num_threads` in `params` if it exists there. +#' @param dataset_params Extra parameters to pass to \link{lgb.Dataset} once it comes the +#' time to convert the dataset to this library's internal format. +#' +#' For a list of the accepted parameters, see +#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#io-parameters}{ +#' the "I/O Parameters" section of the documentation}. #' @param save_name File name to use when writing the trained model to disk. Should end in ".model". #' If passing `NULL`, will not save the trained model to disk. -#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example +#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example: #' \itemize{ #' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} -#' \item{\code{obj}: objective function, can be character or custom objective function. Examples include -#' \code{regression}, \code{regression_l1}, \code{huber}, -#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} #' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} #' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} #' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} #' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature -#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to -#' say "the first and tenth columns").} +#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to +#' say "the first and tenth columns"). 
This parameter is not supported in the `formula` and +#' `data.frame` interfaces.} #' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets} #' } #' @inheritSection lgb_shared_params Early Stopping -#' @return a trained \code{lgb.Booster} +#' @return A trained \code{lgb.Booster} model object. +#' @importFrom utils head +#' @importFrom parallel detectCores +#' @examples +#' library(lightgbm) +#' data("iris") +#' model <- lightgbm(Species ~ ., data = iris, verbose = -1L, nthreads = 1L) +#' pred <- predict(model, iris, type = "class") +#' all(pred == iris$Species) +#' +#' model <- lightgbm(iris, Species, verbose = -1L, nthreads = 1L) +#' head(predict(model, iris, type = "score")) +#' +#' model <- lightgbm(as.matrix(iris[, -5L]), iris$Species, verbose = -1L, nthreads = 1L) +#' head(predict(model, iris, type = "raw")) +#' @export +lightgbm <- function(...) { + UseMethod("lightgbm") +} + +#' @rdname lightgbm #' @export -lightgbm <- function(data, - label = NULL, - weight = NULL, - params = list(), - nrounds = 100L, - verbose = 1L, - eval_freq = 1L, - early_stopping_rounds = NULL, - save_name = "lightgbm.model", - init_model = NULL, - callbacks = list(), - serializable = TRUE, - ...) { +lightgbm.formula <- function(formula, + data, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ... + ) { + data_processor_outputs <- new.env() + data_processor <- DataProcessor$new( + data_processor_outputs + , data + , dataset_params + , model_formula = formula + , label = NULL + , weights = weights + , init_score = init_score + ) + return( + lightgbm_internal( + data_processor_outputs = data_processor_outputs + , data_processor = data_processor + , objective = objective + , nthreads = nthreads + , params = params + , nrounds = nrounds + , verbose = verbose + , eval_freq = eval_freq + , early_stopping_rounds = early_stopping_rounds + , save_name = save_name + , serializable = serializable + , ... + ) + ) +} + +#' @rdname lightgbm +#' @export +lightgbm.data.frame <- function(data, + label, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ...) { + if (!is.null(params$categorical_feature) || !is.null(dataset_params$categorical_feature)) { + stop("'categorical_feature' is not supported for 'data.frame' inputs in 'lightgbm()'.") + } + data_processor_outputs <- new.env() + data_processor <- DataProcessor$new( + data_processor_outputs + , as.data.frame(data) + , dataset_params + , model_formula = NULL + , label = label + , weights = weights + , init_score = init_score + ) + return( + lightgbm_internal( + data_processor_outputs = data_processor_outputs + , data_processor = data_processor + , objective = objective + , nthreads = nthreads + , params = params + , nrounds = nrounds + , verbose = verbose + , eval_freq = eval_freq + , early_stopping_rounds = early_stopping_rounds + , save_name = save_name + , serializable = serializable + , ... 
+ ) + ) +} + +#' @rdname lightgbm +#' @export +lightgbm.matrix <- function(X, + y, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ...) { + data_processor_outputs <- new.env() + data_processor <- DataProcessor$new( + data_processor_outputs + , X + , dataset_params + , model_formula = NULL + , label = y + , weights = weights + , init_score = init_score + ) + return( + lightgbm_internal( + data_processor_outputs = data_processor_outputs + , data_processor = data_processor + , objective = objective + , nthreads = nthreads + , params = params + , nrounds = nrounds + , verbose = verbose + , eval_freq = eval_freq + , early_stopping_rounds = early_stopping_rounds + , save_name = save_name + , serializable = serializable + , ... + ) + ) +} + +#' @rdname lightgbm +#' @export +lightgbm.dgCMatrix <- function(X, + y, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ...) { + data_processor_outputs <- new.env() + data_processor <- DataProcessor$new( + data_processor_outputs + , X + , dataset_params + , model_formula = NULL + , label = y + , weights = weights + , init_score = init_score + ) + return( + lightgbm_internal( + data_processor_outputs = data_processor_outputs + , data_processor = data_processor + , objective = objective + , nthreads = nthreads + , params = params + , nrounds = nrounds + , verbose = verbose + , eval_freq = eval_freq + , early_stopping_rounds = early_stopping_rounds + , save_name = save_name + , serializable = serializable + , ... + ) + ) +} + +lightgbm_internal <- function(data_processor_outputs, + data_processor, + objective, + nthreads, + params = list(), + nrounds = 100L, + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = "lightgbm.model", + serializable = TRUE, + ...) 
{ + if (objective == "auto") { + objective <- data_processor_outputs$objective + } + if (objective %in% c("multiclass", "multiclassova") && NROW(data_processor$label_levels)) { + if (!is.null(params$num_class)) { + warning("'num_class' is overriden when using 'lightgbm()' interface with factors.") + } + params$num_class <- length(data_processor$label_levels) + } + params$num_threads <- nthreads # validate inputs early to avoid unnecessary computation if (nrounds <= 0L) { stop("nrounds should be greater than zero") } - # Set data to a temporary variable - dtrain <- data - - # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually - if (!lgb.is.Dataset(x = dtrain)) { - dtrain <- lgb.Dataset(data = data, label = label, weight = weight) - } + dtrain <- data_processor_outputs$dataset train_args <- list( "params" = params , "data" = dtrain , "nrounds" = nrounds + , "obj" = objective , "verbose" = verbose , "eval_freq" = eval_freq , "early_stopping_rounds" = early_stopping_rounds - , "init_model" = init_model - , "callbacks" = callbacks , "serializable" = serializable ) train_args <- append(train_args, list(...)) @@ -166,6 +474,7 @@ lightgbm <- function(data, what = lgb.train , args = train_args ) + bst$data_processor <- data_processor # Store model under a specific name if (!is.null(save_name)) { diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 199614241502..0c3504301907 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -25,8 +25,9 @@ data("agaricus.train") model <- lightgbm( agaricus.train$data , agaricus.train$label - , params = list(objective = "binary", nthreads = 1L) + , objective = "binary" , nrounds = 5L + , nthreads = 1L , save_name = NULL , verbose = 0) fname <- tempfile(fileext="rds") diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 1e6be676f62e..61579474d99b 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -2,37 +2,173 @@ % Please edit documentation in R/lightgbm.R \name{lightgbm} \alias{lightgbm} +\alias{lightgbm.formula} +\alias{lightgbm.data.frame} +\alias{lightgbm.matrix} +\alias{lightgbm.dgCMatrix} \title{Train a LightGBM model} \usage{ -lightgbm( +lightgbm(...) + +\method{lightgbm}{formula}( + formula, + data, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ... +) + +\method{lightgbm}{data.frame}( data, - label = NULL, - weight = NULL, + label, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, + serializable = TRUE, + ... +) + +\method{lightgbm}{matrix}( + X, + y, + weights = NULL, + init_score = NULL, + objective = "auto", nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), verbose = 1L, eval_freq = 1L, early_stopping_rounds = NULL, - save_name = "lightgbm.model", - init_model = NULL, - callbacks = list(), + save_name = NULL, + serializable = TRUE, + ... 
+) + +\method{lightgbm}{dgCMatrix}( + X, + y, + weights = NULL, + init_score = NULL, + objective = "auto", + nrounds = 100L, + nthreads = parallel::detectCores(), + params = list(), + dataset_params = list(), + verbose = 1L, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = NULL, serializable = TRUE, ... ) } \arguments{ -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example: +\itemize{ + \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} + \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} + \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} + \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} + \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature + names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to + say "the first and tenth columns"). This parameter is not supported in the `formula` and + `data.frame` interfaces.} + \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model + into a predictor model which frees up memory and the original datasets} +}} + +\item{formula}{A formula for specifying the response/label and predictors/features in the + model to be fitted. This is provided for ease of use, but using the `formula` interface + is discouraged for a couple reasons (see details section for mode details):\itemize{ + \item It converts all factor variables to dummy encoding, which typically does not lead to + models as good as those in which categorical variables are treated as such. + \item It uses base R's formula handling for inputs, which can be particularly + computationally inefficient compared to the alternatives. + \item If the number of variables is large, it can increase model size quite a bit. + } -\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} + If using the `formula` interface, then `data` must be a `data.frame`.} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{data}{A `data.frame`. In the non-formula interface, it will use all available variables + (those not specified as being `label`, `weight`, or `init_score`) as features / predictors, + and will assume their types are:\itemize{ + \item Numeric, if they are of type `numeric`, `integer`, `Date`, `POSIXct`. + \item Categorical, if they are of type `factor`, `character`. + } + + Other variable types are not accepted. Note that the underlying core library only accepts + `numeric` inputs, thus other types will end up being casted. + + Note that, if using the `data.frame` interface, it is not possible to manually specify + categorical variables through `params` - instead, these will be deduced from the data types, + and their encoding will be handled internally in the fitting and prediction functions. + Under the `data.frame` interface, if the data contains any categorical variables, then at + prediction time only `data.frame` inputs will be allowed.} + +\item{weights}{Sample / observation weights for rows in `X` / `data`. Same format as +`y` (i.e. 
accepts non-standard evaluation for column names, and accepts numeric vectors).} + +\item{init_score}{Initial values for each observation from which the boosting process will +be started (e.g. as the result of some previous model). If not passing it (the default), +will start from a blank state.} + +\item{objective}{Optimization objective (e.g. `"regression"`, `"binary"`, etc.). + For a list of accepted objectives, see + \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ + the "Parameters" section of the documentation}. + + If passing `"auto"`, will be deduced from the type of `y` / `label`:\itemize{ + \item If `y` is not a factor, will set the objective to `"regression"`. + \item If `y` is a factor with two classes, will set the objective to `"binary"`. + \item If `y` is a factor with more than two classes, will set the objective to `"multiclass"`. + } + + If `y` is a factor, then it will automatically set parameter `num_classes` based on + its number of levels, overriding any such entry in `params` if it is present there.} + +\item{nrounds}{number of training rounds} + +\item{nthreads}{Number of parallel threads to use. For best speed, this should be set to the number of + physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the + number of maximum threads (e.g. `nthreads = max(parallel::detectCores() / 2L, 1L)` as + a shorthand for the optimal value). + + Be aware that using too many threads can result in speed degradation in smaller datasets + (see the parameters documentation for more details). + + If passing zero, will use the default number of threads configured for OpenMP. + + This parameter overrides `num_threads` in `params` if it exists there.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{nrounds}{number of training rounds} +\item{dataset_params}{Extra parameters to pass to \link{lgb.Dataset} once it comes the + time to convert the dataset to this library's internal format. + + For a list of the accepted parameters, see + \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#io-parameters}{ + the "I/O Parameters" section of the documentation}.} \item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} @@ -47,34 +183,65 @@ set to the iteration number of the best iteration.} \item{save_name}{File name to use when writing the trained model to disk. Should end in ".model". If passing `NULL`, will not save the trained model to disk.} -\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} - -\item{callbacks}{List of callback functions that are applied at each iteration.} - \item{serializable}{whether to make the resulting objects serializable through functions such as \code{save} or \code{saveRDS} (see section "Model serialization").} -\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example -\itemize{ - \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} - \item{\code{obj}: objective function, can be character or custom objective function. 
Examples include - \code{regression}, \code{regression_l1}, \code{huber}, - \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} - \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} - \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} - \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} - \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature - names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to - say "the first and tenth columns").} - \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model - into a predictor model which frees up memory and the original datasets} -}} +\item{X}{Data features / covariates / predictors with which the model will try to predict `y`. + + Note that, if using non-standard evaluation for `y`, `weights`, or `init_score` (specifying + them as column names from `X`), then `X` will be subsetted, and any additional parameters + passed that correspond to column indices (such as per-column `max_bin` or + `categorical_features`) will be applied on the subsetted data. + + Supports dense matrices from base R (class `matrix`, will be casted to `double` storage + mode if it isn't already) and sparse matrices in CSC format from the `Matrix` package + (class `dgCMatrix`).} + +\item{y, label}{Target / response variable to predict. May be passed as:\itemize{ + \item The name of a column from `X` / `data`, if it has column names. Will use non-standard + evaluation in order to try to determine if it matches with the name of a column in + `X` / `data` (i.e. will accept it as the name of a column without putting quotes + around it), and can also be passed as a character. + \item A vector with the number of entries matching to the number of rows in `X` / `data`. + } + If passing `objective="auto"`, the optimization objective will be determined according to + the type / class of this variable. + + If `y` is passed as a factor, then `num_class` in `params` will be set automatically + according to its levels. + + Passing `y` as a factor will also make \link{predict.lgb.Booster} use its levels in the + outputs from predictions when appropriate.} } \value{ -a trained \code{lgb.Booster} +A trained \code{lgb.Booster} model object. } \description{ -Simple interface for training a LightGBM model. +Simplified interface for training / fitting a LightGBM model which follows typical + R idioms for model fitting and predictions. Note that this interface does not + expose the full spectrum of library features as \link{lgb.train} does. +} +\details{ +This is a thin wrapper over \link{lgb.Dataset} and then \link{lgb.train} which performs + extra steps such as automatically detecting categorical variables and handling their + encoding. It is intended as an easy-to-use interface that follows common R idioms for + predictive models. + + It uses base R's functions for processing the data, such as `factor`, which are not + particularly efficient - for serious usage, it is recommended to use the \link{lgb.train} + interface with \link{lgb.Dataset} instead, handling aspects such as encoding of categorical + variables externally through your favorite tools. 
+ + \bold{Important:} using the `formula` interface relies on R's own formula handling, which + might be very slow for large inputs and will dummy-encode all categorical variables + (meaning: they will not be treated as categorical in tree splits, rather each level will be + treated as a separate variable, without exploiting the sparsity and independence patterns + in the encoded data). + + When models are produced through this interface (as opposed to \link{lgb.train}), the + method \link{predict.lgb.Booster} will additionally gain new behaviors, such as taking + columns by name from the new input data or adding names to the resulting predicted matrices + (based on the classes or features depending on what is being predicted). } \section{Early Stopping}{ @@ -93,3 +260,16 @@ Simple interface for training a LightGBM model. or \code{objective} (passed into \code{params}). } +\examples{ +library(lightgbm) +data("iris") +model <- lightgbm(Species ~ ., data = iris, verbose = -1L, nthreads = 1L) +pred <- predict(model, iris, type = "class") +all(pred == iris$Species) + +model <- lightgbm(iris, Species, verbose = -1L, nthreads = 1L) +head(predict(model, iris, type = "score")) + +model <- lightgbm(as.matrix(iris[, -5L]), iris$Species, verbose = -1L, nthreads = 1L) +head(predict(model, iris, type = "raw")) +} diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 8948a4b17d01..8b4eaaa40e6b 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -6,23 +6,73 @@ \usage{ \method{predict}{lgb.Booster}( object, - data, + newdata, + type = c("score", "class", "raw", "leaf", "contrib"), start_iteration = NULL, num_iteration = NULL, - rawscore = FALSE, - predleaf = FALSE, - predcontrib = FALSE, header = FALSE, - reshape = FALSE, + reshape = TRUE, + index1 = TRUE, params = list(), ... ) } \arguments{ -\item{object}{Object of class \code{lgb.Booster}} +\item{object}{Object of class \code{lgb.Booster} from which to make predictions.} -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or -a character representing a path to a text file (CSV, TSV, or LibSVM)} +\item{newdata}{New data on which to make predictions. Allowed types are:\itemize{ + \item `data.frame`, \bold{only if} the model object was produced through the \link{lightgbm} + interface. If the input to \link{lightgbm} was a `formula` or a `data.frame` with + categorical columns (`factor` or `character`), then \bold{only} `data.frame` inputs will + be accepted here. Columns will be taken according to the names that they had in the data + that they were passed to the model (i.e. the input here will be reordered if the order + does not match, and will be subsetted if it has additional columns). + \item `matrix` from base R. Will be converted to numeric if it isn't already. + \item `dgCMatrix` from package `Matrix`. + \item `character` with a single entry representing a path to a text file in CSV, TSV, + or SVMLight / LibSVM formats. + } + Other input types are not allowed. + + Note that, if using the `formula` interface, the user is responsible for making + factor variables' levels match to those that were passed in the data to which the model + was fitted, and if the model was not produced through the \link{lightgbm} interface + (e.g. through \link{lgb.train} or \link{lgb.cv}), then the user is responsible for + handling the encoding of categorical variables.} + +\item{type}{Type of prediction to output. 
Allowed types are:\itemize{
+\item `"score"`, which will output the predicted score according to the objective
+ function being optimized (equivalent to `"response"` in base R's `glm`) - for
+ example, for `objective="binary"`, it will output probabilities, while for
+ `objective="regression"`, it will output predicted values. For objective functions other
+ than multi-class classification, the result will be a numeric vector with length
+ matching `nrow(newdata)`. For multi-class classification, if passing `reshape=TRUE`,
+ it will output a matrix with columns matching the number of classes (and if the model
+ object was produced through the \link{lightgbm} interface instead of through
+ \link{lgb.train} or \link{lgb.cv}, it will have class names as column names if available),
+ and if passing `reshape=FALSE`, it will output a numeric vector with these same results in
+ row-major order.
+ \item `"class"` (only for binary and multi-class classification objectives), which will
+ output the class with the highest predicted score. If the model object was produced through
+ the \link{lightgbm} interface and the label was a factor variable, the result will be a
+ factor variable with levels matching the classes, otherwise it will be an integer vector
+ indicating the class number.
+ \item `"raw"`, which will output the non-transformed numbers (sum of predictions from
+ boosting iterations' results) from which the score is produced for a given objective
+ function - for example, for `objective="binary"`, this corresponds to log-odds. The
+ output type is the same as for `type="score"`.
+ \item `"leaf"`, which will output the index of the terminal node / leaf at which
+ each observation falls in each tree in the model, returned as integers. If passing
+ `reshape=TRUE`, the result will be a matrix with number of columns matching the number
+ of trees, otherwise it will be a vector with this same matrix in row-major order.
+ \item `"contrib"`, which will return the per-feature contributions for each prediction.
+ If passing `reshape=TRUE`, the result will be a matrix with number of columns matching
+ the number of features that the model saw while fitting, otherwise it will be a vector
+ with this same matrix in row-major order. If the model object was produced through
+ the \link{lightgbm} interface, `reshape=TRUE` is passed, and the data to which the model
+ was fit had column names, then the output matrix will have column names corresponding to
+ the feature names.
+ }}

\item{start_iteration}{int or None, optional (default=None)
Start index of the iteration to predict.
@@ -34,37 +84,34 @@
If None, if the best iteration exists and start_iteration is None or <= 0, the
best iteration is used; otherwise, all iterations from start_iteration are used.
If <= 0, all iterations from start_iteration are used (no limits).}

-\item{rawscore}{whether the prediction should be returned in the for of original untransformed
-sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
-for logistic regression would result in predictions for log-odds instead of probabilities.}
-
-\item{predleaf}{whether predict leaf index instead.}
-
-\item{predcontrib}{return per-feature contributions for each record.}
-
\item{header}{only used for prediction for text file. True if text file has header}

\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
-prediction outputs per case.}
+prediction outputs per case. When using `reshape=FALSE`, the output will
+be in row-major order (contrary to R matrices, which use column-major order).
+If passing `reshape=TRUE` and `newdata` has row names, the output will also have those
+row names.}
+
+\item{index1}{When producing outputs that correspond to some numbering (such as
+`type="class"` or `type="leaf"`), whether these outputs should be numbered starting
+at 1 or at zero. Note that the underlying lightgbm core library uses zero-based
+numbering, thus `index1=FALSE` will be slightly faster.}

\item{params}{a list of additional named parameters. See
\href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#predict-parameters}{
the "Predict Parameters" section of the documentation} for a list of parameters and
valid values.}

-\item{...}{ignored}
+\item{...}{Ignored.}
}
\value{
-For regression or binary classification, it returns a vector of length \code{nrows(data)}.
-  For multiclass classification, either a \code{num_class * nrows(data)} vector or
-  a \code{(nrows(data), num_class)} dimension matrix is returned, depending on
-  the \code{reshape} value.
-
-  When \code{predleaf = TRUE}, the output is a matrix object with the
-  number of columns corresponding to the number of trees.
+Either a matrix with number of rows matching the number of rows in `newdata`, or
+  a vector with number of entries matching the rows in `newdata`, or a vector representing a
+  matrix in row-major order with `nrow(newdata)*n_outputs` entries, depending on the
+  requested `type` and `reshape` parameters.
}
\description{
-Predicted values based on class \code{lgb.Booster}
+Predict values on new data based on a boosting model (class \code{lgb.Booster}).
}
\examples{
\donttest{
diff --git a/R-package/tests/testthat/test_DataProcessor.R b/R-package/tests/testthat/test_DataProcessor.R
new file mode 100644
index 000000000000..5e68299856bb
--- /dev/null
+++ b/R-package/tests/testthat/test_DataProcessor.R
@@ -0,0 +1,380 @@
+# Note: the lgb.DataProcessor class is meant to look for symbols in two
+# environments above the one from which it is called. Thus, it should not be
+# called or tested directly, only as part of calls to lightgbm().
+library(Matrix)
+data("iris")
+data("mtcars")
+data(bank, package = "lightgbm")
+
+test_that("lightgbm() and predict() work with formula interface", {
+  model <- lightgbm(
+    Species ~ .
+    , data = iris
+    , nthreads = 1L
+    , verbose = -1L
+  )
+  pred <- predict(model, iris, type = "class")
+  expect_true(all(pred == iris$Species))
+
+  expect_s3_class(pred, "factor")
+  expect_equal(levels(pred), levels(iris$Species))
+
+  model <- lightgbm(
+    Species ~ .
+ log(Petal.Length) + I(Petal.Length^2.0) - Sepal.Width + , data = iris + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris, type = "class") + expect_true(all(pred == iris$Species)) + expect_equal(5L, .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle)) +}) + +test_that("lightgbm() and predict() work with data.frame interface", { + model <- lightgbm( + iris + , Species + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris, type = "class") + expect_true(all(pred == iris$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris$Species)) + + model <- lightgbm( + iris + , "Species" + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris, type = "class") + expect_true(all(pred == iris$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris$Species)) + + model <- lightgbm( + iris[, -5L] + , iris$Species + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris, type = "class") + expect_true(all(pred == iris$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris$Species)) +}) + +test_that("lightgbm() and predict() accept data.tables as data.frames", { + iris_dt <- data.table::as.data.table(iris) + model <- lightgbm( + iris_dt + , Species + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris_dt, type = "class") + expect_true(all(pred == iris_dt$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris_dt$Species)) + + model <- lightgbm( + iris_dt + , "Species" + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris_dt, type = "class") + expect_true(all(pred == iris$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris_dt$Species)) + + model <- lightgbm( + iris_dt[, -5L] + , iris_dt$Species + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris_dt, type = "class") + expect_true(all(pred == iris_dt$Species)) + expect_s3_class(pred, "factor") + expect_equal(levels(pred), levels(iris_dt$Species)) +}) + +test_that("lightgbm() and predict() work with matrix interface", { + model <- lightgbm( + as.matrix(mtcars) + , mpg + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred <- predict(model, mtcars) + expect_true(all(names(pred) == row.names(mtcars))) + + model <- lightgbm( + as.matrix(mtcars) + , "mpg" + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred2 <- predict(model, mtcars) + expect_true(all(pred == pred2)) + + model <- lightgbm( + as.matrix(mtcars[, -1L]) + , mtcars$mpg + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred3 <- predict(model, mtcars) + expect_true(all(pred == pred3)) +}) + +test_that("lightgbm() and predict() work with dgCMatrix interface", { + model <- lightgbm( + as(as.matrix(mtcars), "dgCMatrix") + , mpg + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred <- predict(model, mtcars) + expect_true(all(names(pred) == row.names(mtcars))) + + model <- lightgbm( + as(as.matrix(mtcars), "dgCMatrix") + , "mpg" + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred2 <- predict(model, mtcars) + expect_true(all(pred == pred2)) + + model <- lightgbm( + 
as(as.matrix(mtcars[, -1L]), "dgCMatrix") + , mtcars$mpg + , nthreads = 1L + , verbose = -1L + , nrounds = 5L + , params = list( + max_bins = 5L + , min_data_in_leaf = 5L + ) + ) + pred3 <- predict(model, mtcars) + expect_true(all(pred == pred3)) +}) + +test_that("lightgbm() handles single-column inputs", { + model <- lightgbm( + iris[, 1L, drop = FALSE] + , iris$Species + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, iris, type = "score") + expect_equal(nrow(pred), nrow(iris)) + expect_equal(ncol(pred), 3L) +}) + +test_that("lightbm() data.frame interface handles categorical features", { + model <- lightgbm( + bank + , y + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal( + model$params$categorical_feature + , unname(which(sapply(within(bank, rm(y)), is.character))) + ) +}) + +test_that("lightgbm() accepts dataset parameters", { + set.seed(123L) + df <- data.frame(col1 = c(runif(1000L), rep(0.0, 100L))) + df$col2 <- df$col1 + n_bins <- 5L + model <- lightgbm( + df + , col2 + , nthreads = 1L + , verbose = -1L + , params = list(max_bin = n_bins) + ) + expect_equal(length(table(predict(model, df))), n_bins) + + model <- lightgbm( + df + , col2 + , nthreads = 1L + , verbose = -1L + , dataset_params = list(max_bin = n_bins) + ) + expect_equal(length(table(predict(model, df))), n_bins) +}) + +test_that("lightgbm() accepts NSE for different arguments", { + iris_dt <- data.table::as.data.table(iris) + iris_dt[, wcol := 1.0] + model <- lightgbm( + iris_dt + , "Species" + , weights = wcol + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal( + ncol(iris_dt) - 2L + , .Call(LGBM_BoosterGetNumFeature_R, model$.__enclos_env__$private$handle) + ) +}) + +test_that("lightgbm() does not throw warnings in the presence of NAs", { + df <- data.frame( + col1 = rep(c(1.0, 2.0, NA), 100L) + , col2 = rep(c("a", NA, "b"), 100L) + , col3 = rep(c(1.0, 2.0, 1.0), 100L) + ) + expect_warning({ + model <- lightgbm( + df + , col3 + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + pred <- predict(model, df) + }, regexp = NA) +}) + +test_that("lightgbm() adjusts objective according to data", { + model <- lightgbm( + mpg ~ . + , data = mtcars + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal(model$params$objective, "regression") + + model <- lightgbm( + y ~ . + , data = bank + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal(model$params$objective, "binary") + + model <- lightgbm( + Species ~ . + , data = iris + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal(model$params$objective, "multiclass") + expect_equal(model$params$num_class, length(levels(iris$Species))) + + data("agaricus.train") + model <- lightgbm( + agaricus.train$data + , agaricus.train$label + , objective = "poisson" + , nrounds = 5L + , nthreads = 1L + , verbose = -1L + ) + expect_equal(model$params$objective, "poisson") +}) + +test_that("predict() from lightgbm() names columns correctly", { + model <- lightgbm( + Species ~ . 
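    # multiclass fit on iris: prediction columns are expected to be named after levels(iris$Species)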
+ , data = iris + , nrounds = 1L + , nthreads = 1L + , verbose = -1L + ) + pred_score <- predict(model, iris, type = "score") + pred_class <- predict(model, iris, type = "class") + pred_leaf <- predict(model, iris, type = "leaf") + pred_contrib <- predict(model, iris, type = "contrib") + + expect_equal(colnames(pred_score), levels(iris$Species)) + expect_equal(levels(pred_class), levels(iris$Species)) + expect_null(colnames(pred_leaf)) + expect_null(colnames(pred_contrib)) + + model <- lightgbm( + mpg ~ . + , data = mtcars + , nrounds = 10L + , nthreads = 1L + , verbose = -1L + , params = list( + max_bin = 5L + , min_data_in_leaf = 5L + ) + ) + pred_score <- predict(model, mtcars, type = "score") + expect_error(pred_class <- predict(model, mtcars, type = "class")) + pred_leaf <- predict(model, mtcars, type = "leaf") + pred_contrib <- predict(model, mtcars, type = "contrib") + + expect_true(is.numeric(pred_score)) + expect_null(dim(pred_score)) + expect_null(colnames(pred_score)) + + expect_equal(ncol(pred_leaf), 10L) + expect_null(colnames(pred_leaf)) + expect_equal( + colnames(pred_contrib) + , c(names(mtcars)[names(mtcars) != "mpg"], "(Intercept)") + ) + + model <- lightgbm( + mpg ~ cyl + wt + , data = mtcars + , nrounds = 10L + , nthreads = 1L + , verbose = -1L + , params = list( + max_bin = 5L + , min_data_in_leaf = 5L + ) + ) + pred_contrib <- predict(model, mtcars, type = "contrib") + expect_equal( + colnames(pred_contrib) + , c("cyl", "wt", "(Intercept)") + ) +}) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 5a1927e4e512..014d285236f3 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -66,21 +66,22 @@ test_that("start_iteration works correctly", { , label = agaricus.test$label ) bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 0.6 - , objective = "binary" , verbosity = VERBOSITY ) + , objective = "binary" , nrounds = 50L + , nthreads = 1L , valids = list("test" = dtest) , early_stopping_rounds = 2L ) expect_true(lgb.is.Booster(bst)) - pred1 <- predict(bst, data = test$data, rawscore = TRUE) - pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) + pred1 <- predict(bst, newdata = test$data, type = "raw") + pred_contrib1 <- predict(bst, test$data, type = "contrib") pred2 <- rep(0.0, length(pred1)) pred_contrib2 <- rep(0.0, length(pred2)) step <- 11L @@ -94,7 +95,7 @@ test_that("start_iteration works correctly", { inc_pred <- predict(bst, test$data , start_iteration = start_iter , num_iteration = n_iter - , rawscore = TRUE + , type = "raw" ) inc_pred_contrib <- bst$predict(test$data , start_iteration = start_iter @@ -107,7 +108,7 @@ test_that("start_iteration works correctly", { expect_equal(pred2, pred1) expect_equal(pred_contrib2, pred_contrib1) - pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) - pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) + pred_leaf1 <- predict(bst, test$data, type = "leaf") + pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, type = "leaf") expect_equal(pred_leaf1, pred_leaf2) }) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 8b6f5f6ceb44..a91e888df9d0 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -72,14 +72,15 @@ 
DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( test_that("train and predict binary classification", { nrounds <- 10L bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 5L - , objective = "binary" , metric = "binary_error" ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_false(is.null(bst$record_evals)) @@ -102,18 +103,19 @@ test_that("train and predict softmax", { lb <- as.numeric(iris$Species) - 1L bst <- lightgbm( - data = as.matrix(iris[, -5L]) - , label = lb + X = as.matrix(iris[, -5L]) + , y = lb , params = list( num_leaves = 4L , learning_rate = 0.05 , min_data = 20L , min_hessian = 10.0 - , objective = "multiclass" , metric = "multi_error" , num_class = 3L ) + , objective = "multiclass" , nrounds = 20L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) @@ -129,15 +131,16 @@ test_that("train and predict softmax", { test_that("use of multiple eval metrics works", { metrics <- list("binary_error", "auc", "binary_logloss") bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , metric = metrics ) + , objective = "binary" , nrounds = 10L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_false(is.null(bst$record_evals)) @@ -153,14 +156,15 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec set.seed(708L) nrounds <- 10L bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 5L - , objective = "binary" , metric = "binary_error" ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) @@ -171,14 +175,15 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec set.seed(708L) nrounds <- 10L bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 5L - , objective = "regression" , metric = "l2" ) + , objective = "regression" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) @@ -186,15 +191,16 @@ test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expec }) test_that("lightgbm() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") + params <- list(metric = "l2,l1") for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lightgbm( - data = dtrain + X = train$data + , y = train$label , params = params + , objective = "regression" , nrounds = nround_value - , save_name = tempfile(fileext = ".model") + , nthreads = 1L ) }, "nrounds should be greater than zero") } @@ -205,12 +211,13 @@ test_that("lightgbm() accepts nrounds as either a top-level argument or paramete set.seed(708L) top_level_bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label + , objective = "regression" , nrounds = nrounds + , nthreads = 1L , params = list( - objective = "regression" - , metric = "l2" + metric = "l2" , num_leaves = 5L ) , save_name = tempfile(fileext = ".model") @@ -218,25 +225,27 @@ test_that("lightgbm() accepts nrounds as either a top-level 
argument or paramete set.seed(708L) param_bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( - objective = "regression" - , metric = "l2" + metric = "l2" , num_leaves = 5L , nrounds = nrounds ) + , objective = "regression" + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) set.seed(708L) both_customized <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label + , objective = "regression" , nrounds = 20L + , nthreads = 1L , params = list( - objective = "regression" - , metric = "l2" + metric = "l2" , num_leaves = 5L , nrounds = nrounds ) @@ -274,17 +283,18 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide ) nrounds <- 10L bst <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 5L - , objective = "binary" , metric = c( "binary_error" , "auc" ) ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , valids = list( "valid1" = dvalid1 , "valid2" = dvalid2 @@ -311,10 +321,11 @@ test_that("lightgbm() does not write model to disk if save_name=NULL", { files_before <- list.files(getwd()) model <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , nrounds = 5L - , params = list(objective = "binary") + , nthreads = 1L + , objective = "binary" , verbose = 0L , save_name = NULL ) @@ -790,8 +801,8 @@ test_that("lgb.train() accepts nrounds as either a top-level argument or paramet expect_true(is.numeric(both_l2)) # check that model produces identical performance - expect_identical(top_level_l2, params_l2) - expect_identical(both_l2, params_l2) + expect_equal(top_level_l2, params_l2) + expect_equal(both_l2, params_l2) expect_identical(param_bst$current_iter(), top_level_bst$current_iter()) expect_identical(param_bst$current_iter(), both_customized$current_iter()) @@ -1650,7 +1661,7 @@ test_that("lgb.train() supports non-ASCII feature names", { } }) -test_that("lgb.train() works with integer, double, and numeric data", { +test_that("lightgbm() works with integer, double, and numeric data", { data(mtcars) X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L, drop = TRUE] @@ -1659,15 +1670,16 @@ test_that("lgb.train() works with integer, double, and numeric data", { mode(X) <- data_mode nrounds <- 10L bst <- lightgbm( - data = X - , label = y + X = X + , y = y , params = list( - objective = "regression" - , min_data = 1L + min_data = 1L , learning_rate = 0.01 , seed = 708L ) + , objective = "regression" , nrounds = nrounds + , nthreads = 1L ) # should have trained for 10 iterations and found splits @@ -1893,7 +1905,7 @@ test_that("when early stopping is not activated, best_iter and best_score come f expect_identical(bst$best_score, NA_real_) }) -test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { +test_that("lgb.train() gives the correct best_score and best_iter for a metric where higher values are better", { set.seed(708L) trainDF <- data.frame( "feat1" = runif(n = 500L, min = 0.0, max = 15.0) @@ -1967,16 +1979,18 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com ) nrounds <- 10L bst <- lightgbm( - data = dtrain + X = as.matrix(trainDF[["feat1"]], drop = FALSE) + , y = trainDF[["target"]] + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , valids = list( "valid1" = dvalid1 , "something-random-we-would-not-hardcode" = dtrain , "valid2" 
= dvalid2 ) , params = list( - objective = "binary" - , metric = "auc" + metric = "auc" , learning_rate = 1.5 , num_leaves = 5L ) @@ -1987,7 +2001,7 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com # untouched. If you set verbose to > 0, the training data will still be first but called "train" expect_named( bst$record_evals - , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") + , c("start_iter", "valid1", "something-random-we-would-not-hardcode", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) @@ -2516,7 +2530,7 @@ test_that("lgb.train() throws an informative error if interaction_constraints is dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") expect_error({ - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2530,9 +2544,12 @@ test_that(paste0("lgb.train() throws an informative error if the members of inte params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) expect_error({ bst <- lightgbm( - data = dtrain + X = train$data + , y = train$label , params = params , nrounds = 2L + , objective = "regression" + , nthreads = 1L ) }, "every element in interaction_constraints must be a character vector or numeric vector") }) @@ -2542,7 +2559,7 @@ test_that("lgb.train() throws an informative error if interaction_constraints co params <- list(objective = "regression", interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) expect_error({ - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2556,7 +2573,7 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2565,7 +2582,7 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is cnames <- colnames(train$data) params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2573,7 +2590,7 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is pred2 <- bst$predict(test$data) params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2590,7 +2607,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L @@ -2600,7 +2617,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai new_colnames <- paste0(colnames(train$data), "_x") params <- list(objective = "regression" , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) - bst <- lightgbm( + bst <- lgb.train( data = dtrain , params = params , nrounds = 2L diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 77a5ce402238..1308413660b1 100644 --- 
a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -127,8 +127,8 @@ test_that("lgb.load() gives the expected error messages given different incorrec train <- agaricus.train test <- agaricus.test bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( objective = "binary" , num_leaves = 4L @@ -136,6 +136,7 @@ test_that("lgb.load() gives the expected error messages given different incorrec , verbose = VERBOSITY ) , nrounds = 2L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) @@ -174,8 +175,8 @@ test_that("Loading a Booster from a text file works", { train <- agaricus.train test <- agaricus.test bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 @@ -183,6 +184,7 @@ test_that("Loading a Booster from a text file works", { , verbose = VERBOSITY ) , nrounds = 2L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -251,8 +253,8 @@ test_that("Loading a Booster from a string works", { train <- agaricus.train test <- agaricus.test bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 @@ -260,6 +262,7 @@ test_that("Loading a Booster from a string works", { , verbose = VERBOSITY ) , nrounds = 2L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -284,21 +287,22 @@ test_that("Saving a large model to string should work", { data(agaricus.train, package = "lightgbm") train <- agaricus.train bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 100L , learning_rate = 0.01 , objective = "binary" ) , nrounds = 500L + , nthreads = 1L , save_name = tempfile(fileext = ".model") , verbose = VERBOSITY ) pred <- predict(bst, train$data) - pred_leaf_indx <- predict(bst, train$data, predleaf = TRUE) - pred_raw_score <- predict(bst, train$data, rawscore = TRUE) + pred_leaf_indx <- predict(bst, train$data, type = "leaf") + pred_raw_score <- predict(bst, train$data, type = "raw") model_string <- bst$save_model_to_string() # make sure this test is still producing a model bigger than the default @@ -316,8 +320,8 @@ test_that("Saving a large model to string should work", { model_str = model_string ) pred2 <- predict(bst2, train$data) - pred2_leaf_indx <- predict(bst2, train$data, predleaf = TRUE) - pred2_raw_score <- predict(bst2, train$data, rawscore = TRUE) + pred2_leaf_indx <- predict(bst2, train$data, type = "leaf") + pred2_raw_score <- predict(bst2, train$data, type = "raw") expect_identical(pred, pred2) expect_identical(pred_leaf_indx, pred2_leaf_indx) expect_identical(pred_raw_score, pred2_raw_score) @@ -328,14 +332,15 @@ test_that("Saving a large model to JSON should work", { data(agaricus.train, package = "lightgbm") train <- agaricus.train bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 100L , learning_rate = 0.01 - , objective = "binary" ) + , objective = "binary" , nrounds = 200L + , nthreads = 1L , save_name = tempfile(fileext = ".model") , verbose = VERBOSITY ) @@ -358,15 +363,16 @@ test_that("If a string and a file are both passed to 
lgb.load() the file is used train <- agaricus.train test <- agaricus.test bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = 2L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -414,15 +420,16 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w data(agaricus.train, package = "lightgbm") nrounds <- 2L bst <- lightgbm( - data = as.matrix(agaricus.train$data) - , label = agaricus.train$label + X = as.matrix(agaricus.train$data) + , y = agaricus.train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) data(agaricus.test, package = "lightgbm") @@ -508,15 +515,16 @@ test_that("Booster$rollback_one_iter() should work as expected", { test <- agaricus.test nrounds <- 5L bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_equal(bst$current_iter(), nrounds) @@ -543,15 +551,16 @@ test_that("Booster$update() passing a train_set works as expected", { # train with 2 rounds and then update bst <- lightgbm( - data = as.matrix(agaricus.train$data) - , label = agaricus.train$label + X = as.matrix(agaricus.train$data) + , y = agaricus.train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -567,15 +576,16 @@ test_that("Booster$update() passing a train_set works as expected", { # train with 3 rounds directly bst2 <- lightgbm( - data = as.matrix(agaricus.train$data) - , label = agaricus.train$label + X = as.matrix(agaricus.train$data) + , y = agaricus.train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = nrounds + 1L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst2)) @@ -593,15 +603,16 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat # train with 2 rounds and then update bst <- lightgbm( - data = as.matrix(agaricus.train$data) - , label = agaricus.train$label + X = as.matrix(agaricus.train$data) + , y = agaricus.train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = nrounds + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_error({ @@ -689,15 +700,16 @@ test_that("Saving a model with different feature importance types works", { data(agaricus.train, package = "lightgbm") train <- agaricus.train bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = 2L + , nthreads = 1L , save_name = 
tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -745,15 +757,16 @@ test_that("Saving a model with unknown importance type fails", { data(agaricus.train, package = "lightgbm") train <- agaricus.train bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label + X = as.matrix(train$data) + , y = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" , verbose = VERBOSITY ) + , objective = "binary" , nrounds = 2L + , nthreads = 1L , save_name = tempfile(fileext = ".model") ) expect_true(lgb.is.Booster(bst)) @@ -1097,8 +1110,9 @@ test_that("Handle is automatically restored when calling predict", { bst <- lightgbm( agaricus.train$data , agaricus.train$label + , objective = "binary" , nrounds = 5L - , obj = "binary" + , nthreads = 1L , params = list( verbose = VERBOSITY ) diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 1b03f09aa379..f4a898eafde2 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -12,16 +12,17 @@ test_that("Feature penalties work properly", { feature_penalties <- rep(1.0, ncol(train$data)) feature_penalties[var_index] <- x lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = list( num_leaves = 5L , learning_rate = 0.05 - , objective = "binary" , feature_penalty = paste0(feature_penalties, collapse = ",") , metric = "binary_error" ) + , objective = "binary" , nrounds = 5L + , nthreads = 1L , verbose = -1L , save_name = tempfile(fileext = ".model") ) @@ -64,16 +65,17 @@ test_that("training should warn if you use 'dart' boosting, specified with 'boos params <- list( num_leaves = 5L , learning_rate = 0.05 - , objective = "binary" , metric = "binary_error" ) params[[boosting_param]] <- "dart" expect_warning({ result <- lightgbm( - data = train$data - , label = train$label + X = train$data + , y = train$label , params = params + , objective = "binary" , nrounds = 5L + , nthreads = 1L , verbose = -1L , save_name = tempfile(fileext = ".model") ) diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd index d7aaf676f386..3379a49f76dc 100644 --- a/R-package/vignettes/basic_walkthrough.Rmd +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -51,22 +51,37 @@ The R package of LightGBM offers two functions to train a model: ### Using the `lightgbm()` function -In a first step, you need to convert data to numeric. Afterwards, you are ready to fit the model by the `lightgbm()` function. +The `lightgbm()` function provides a formula interface as well as an X/y interface. As a +first step, the variable to predict needs to be converted to a `factor`. ```{r} # Numeric response and feature matrix +bank$y <- factor(bank$y) + +# Train with formula interface +fit <- lightgbm( + y ~ age + balance + , data = bank + , params = list( + num_leaves = 4L + , learning_rate = 1.0 + ) + , nrounds = 10L + , verbose = -1L +) + +# Train with X/y interface y <- as.numeric(bank$y == "yes") X <- data.matrix(bank[, c("age", "balance")]) -# Train fit <- lightgbm( - data = X - , label = y + X + , y , params = list( num_leaves = 4L , learning_rate = 1.0 - , objective = "binary" ) + , objective = "binary" , nrounds = 10L , verbose = -1L )
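# --- Editor's illustrative sketch (not part of the original patch) ---
# Assuming the new predict() interface behaves as documented above, predictions
# for new rows of the X/y model could be obtained as follows; the `type` values
# are taken from the documentation and may differ in the final implementation.
X_new <- data.matrix(bank[1L:5L, c("age", "balance")])
predict(fit, X_new, type = "score")  # predicted probabilities of y == "yes"
predict(fit, X_new, type = "raw")    # the same predictions on the log-odds scale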