From e398823a9aeddd4afd66c3fbaf0550b5a286416f Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 21:35:15 -0400 Subject: [PATCH 1/7] accept data frames as inputs --- R-package/R/lgb.Booster.R | 28 ++++ R-package/R/lgb.Dataset.R | 209 ++++++++++++++++++++++++-- R-package/R/lgb.cv.R | 20 ++- R-package/R/lightgbm.R | 39 +++-- R-package/man/lgb.Dataset.Rd | 34 ++++- R-package/man/lgb.cv.Rd | 22 ++- R-package/man/lgb.train.Rd | 7 +- R-package/man/lightgbm.Rd | 22 ++- R-package/tests/testthat/test_basic.R | 116 ++++++++++++-- 9 files changed, 440 insertions(+), 57 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index dc912477cd76..552f04e777bb 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -62,6 +62,14 @@ Booster <- R6::R6Class( private$num_dataset <- 1L private$init_predictor <- train_set$.__enclos_env__$private$predictor + # For processing predictions on data frames + if (train_set$get_is_from_data_frame()) { + private$is_from_data_frame <- TRUE + private$colnames <- train_set$get_colnames() + private$categorical_feature <- train_set$get_categorical_feature() + private$factor_levels <- train_set$get_factor_levels() + } + # Check if predictor is existing if (!is.null(private$init_predictor)) { @@ -524,6 +532,21 @@ Booster <- R6::R6Class( start_iteration <- 0L } + # Process data frame if required + if (is.data.frame(data)) { + if (private$is_from_data_frame) { + data <- Dataset$public_methods$process_data_frame_columns( + data, + private$colnames, + private$categorical_feature, + private$factor_levels + ) + } else { + data <- as.matrix(data) + mode(data) <- "double" + } + } + # Predict on new data predictor <- Predictor$new(private$handle, ...) return( @@ -575,6 +598,11 @@ Booster <- R6::R6Class( higher_better_inner_eval = NULL, set_objective_to_none = FALSE, train_set_version = 0L, + # For processing predictions on data frames + is_from_data_frame = FALSE, + colnames = NULL, + categorical_feature = NULL, + factor_levels = NULL, # Predict data inner_predict = function(idx) { diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index c8ffb837080b..7cdc967fe4e2 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -32,6 +32,10 @@ Dataset <- R6::R6Class( free_raw_data = TRUE, used_indices = NULL, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ...) { # validate inputs early to avoid unnecessary computation @@ -42,28 +46,86 @@ Dataset <- R6::R6Class( stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor")) } + # Create known attributes list + if (!is.null(label)) info[["label"]] <- label + if (!is.null(weight)) info[["weight"]] <- weight + if (!is.null(init_score)) info[["init_score"]] <- init_score + if (!is.null(group)) info[["group"]] <- group + # Check for additional parameters additional_params <- list(...) 
- # Create known attributes list - INFO_KEYS <- c("label", "weight", "init_score", "group") - # Check if attribute key is in the known attribute list for (key in names(additional_params)) { - # Key existing - if (key %in% INFO_KEYS) { + # Store as param + params[[key]] <- additional_params[[key]] - # Store as info - info[[key]] <- additional_params[[key]] + } - } else { + # If it's a data.frame, will keep track of the categorical encodings + if (inherits(data, "data.frame")) { + + if (!nrow(data) || !ncol(data)) + stop("'data' is empty.") + + if (is.null(reference)) { + + # Factors are taken directly in data frames, so should not be supplied + if (!is.null(categorical_feature)) + stop("Cannot pass 'categorical_feature' for data.frame. Categorical features should be factor columns.") + + # Column names will also be taken directly + if (!is.null(colnames)) + stop("Cannot pass 'colnames' for data.frame. Column names will be taken from it directly.") + colnames <- names(data) + + # First check if the column types are all numeric or categorical + supported_coltypes <- c("numeric", "integer", "logical", "character", "factor", "POSIXct", "Date") + coltype_is_unsupported <- sapply(data, function(x) !inherits(x, supported_coltypes)) + if (any(coltype_is_unsupported)) + stop("'data' contains unsupported column types.") + + # Ordered factors are not supported, so it will warn if there's any + has_ordered_factor <- sapply(data, is.ordered) + if (any(has_ordered_factor)) + warning("Warning: ordered factors are not supported, will interpret them as unordered.") - # Store as param - params[[key]] <- additional_params[[key]] + # For faster conversions between types + data <- data.table::as.data.table(data) + # Now see if there are any categorical columns that will be encoded + cols_char <- sapply(data, is.character) + if (any(cols_char)) { + names_cols_char <- names(data)[cols_char] + data[, (names_cols_char) := lapply(.SD, factor), .SDcols=names_cols_char] + } + cols_factor <- sapply(data, is.factor) + if (any(cols_factor)) { + categorical_feature <- names(data)[cols_factor] + data[, (categorical_feature) := lapply(.SD, factor), .SDcols=categorical_feature] + private$factor_levels <- lapply(data[, categorical_feature, with=FALSE], levels) + data[ + , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x))-1) + , .SDcols=categorical_feature + ] + } + + # Finally, convert all columns to numeric and turn it into a matrix + data <- as.matrix(data[, lapply(.SD, as.numeric)]) + + } else { + + # When passing a reference, will take the columns and categorical encodings from it instead + data <- self$process_data_frame_columns( + data, + reference$get_colnames(), + reference$get_categorical_feature(), + reference$get_factor_levels() + ) } + private$is_from_data_frame <- TRUE } # Check for matrix format @@ -127,7 +189,7 @@ Dataset <- R6::R6Class( cnames <- colnames(private$raw_data) } - # set feature names if not exist + # set feature names if they don't exist if (is.null(private$colnames) && !is.null(cnames)) { private$colnames <- as.character(cnames) } @@ -219,7 +281,7 @@ Dataset <- R6::R6Class( ) } else if (methods::is(private$raw_data, "dgCMatrix")) { - if (length(private$raw_data@p) > 2147483647L) { + if (length(private$raw_data@p) > .Machine$integer.max) { stop("Cannot support large CSC matrix") } # Are we using a dgCMatrix (sparsed matrix column compressed) @@ -426,6 +488,43 @@ Dataset <- R6::R6Class( }, + # Get levels used to encode factor variables in data frames + 
get_factor_levels = function() { + return(private$factor_levels) + }, + + get_categorical_feature = function() { + return(private$categorical_feature) + }, + + get_is_from_data_frame = function() { + return(private$is_from_data_frame) + }, + + process_data_frame_columns = function(data, colnames, categorical_feature, factor_levels) { + data <- as.data.table(data) + if (!is.null(colnames)) + data <- data[, colnames, with=FALSE] + if (!is.null(factor_levels)) { + data[ + , (categorical_feature) + := mapply( + function(col, levs) factor(col, levs), + .SD, factor_levels, SIMPLIFY=FALSE + ) + , .SDcols=categorical_feature + ] + data[ + , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x)) - 1) + , .SDcols=categorical_feature + ] + } else { + if (any(sapply(data, function(x) is.character(x) || is.factor(x)))) + stop("'data' contains categorical columns, but 'reference' did not have encodings for them.") + } + return(as.matrix(data[, lapply(.SD, as.numeric)])) + }, + # Get information getinfo = function(name) { @@ -674,6 +773,8 @@ Dataset <- R6::R6Class( reference = NULL, colnames = NULL, categorical_feature = NULL, + factor_levels = NULL, + is_from_data_frame = FALSE, predictor = NULL, free_raw_data = TRUE, used_indices = NULL, @@ -721,6 +822,48 @@ Dataset <- R6::R6Class( self$finalize() return(invisible(self)) + }, + + substitute_from_df_cols = function(data, label, weight, init_score, + label_name, weight_name, init_score_name, + env_where_to_substitute) { + + + check_is_df_col = function(var, var_name, data) { + var_name <- head(as.character(var_name), 1) + if (inherits(data, "data.frame") && NROW(var_name) && var_name != "NULL") { + if (var_name %in% names(data)) { + var <- data[[var_name]] + data <- as.data.table(data)[, setdiff(names(data), var_name), with=FALSE] + } else if (is.character(var) && NROW(var) == 1L && var %in% names(data)) { + var <- data[[var]] + data <- as.data.table(data)[, setdiff(names(data), var), with=FALSE] + } + } + return(list(var, data)) + } + + label_name <- head(as.character(label_name), 1) + weight_name <- head(as.character(weight_name), 1) + init_score_name <- head(as.character(init_score_name), 1) + + temp <- check_is_df_col(label, label_name, data) + label <- temp[[1L]] + data <- temp[[2L]] + + temp <- check_is_df_col(weight, weight_name, data) + weight <- temp[[1L]] + data <- temp[[2L]] + + temp <- check_is_df_col(init_score, init_score_name, data) + init_score <- temp[[1L]] + data <- temp[[2L]] + + env_where_to_substitute$data <- data + env_where_to_substitute$label <- label + env_where_to_substitute$weight <- weight + env_where_to_substitute$init_score <- init_score + return(NULL) } ) @@ -729,14 +872,22 @@ Dataset <- R6::R6Class( #' @title Construct \code{lgb.Dataset} object #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename +#' @param data a \code{matrix} object, a \code{data.frame} object, a \code{dgCMatrix} object, +#' or a character representing a filename. +#' +#' If passing a `data.frame`, will assume that columns are numeric if they are of types +#' numeric, integer, logical, Date, or POSIXct; and will assume they are categorical if +#' they are of types factor or character (ordered factors are taken as unordered). +#' Other column types are not supported. #' @param params a list of parameters. 
See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ #' The "Dataset Parameters" section of the documentation} for a list of parameters #' and valid values. #' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning #' continuous features into histograms. If you want to apply the same bin boundaries from an existing -#' dataset to new \code{data}, pass that existing Dataset to this argument. +#' dataset to new \code{data}, pass that existing Dataset to this argument. If the reference passed +#' was constructed from a `data.frame`, will also take its column names, column order, column types, +#' and levels of factor columns. #' @param colnames names of columns #' @param categorical_feature categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. @@ -747,6 +898,20 @@ Dataset <- R6::R6Class( #' cannot be changed after it has been constructed. If you'd prefer to be able to #' change the Dataset object after construction, set \code{free_raw_data = FALSE}. #' @param info a list of information of the \code{lgb.Dataset} object +#' @param label Label of the data (target variable). Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight Weight for each instance/observation. Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param init_score Init score for Dataset. Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param group Group/query data, as integer vector. Only used in the learning-to-rank task. +#' sum(group) = nrow(data). +#' For example, if you have a 100-document dataset with `group = c(10, 20, 40, 10, 10, 10)`, +#' that means that you have 6 groups, where the first 10 records are in the first group, +#' records 11-30 are in the second group, records 31-70 are in the third group, etc. #' @param ... other information to pass to \code{info} or parameters pass to \code{params} #' #' @return constructed dataset @@ -769,7 +934,19 @@ lgb.Dataset <- function(data, categorical_feature = NULL, free_raw_data = TRUE, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ...) { + # Take variables from column names if appropriate + if (is.data.frame(data)) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, init_score, + substitute(label), substitute(weight), substitute(init_score), + environment() + ) + } # Create new dataset return( @@ -783,6 +960,10 @@ lgb.Dataset <- function(data, , free_raw_data = free_raw_data , used_indices = NULL , info = info + , label = label + , weight = weight + , init_score = init_score + , group = group , ... )) ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 7d39c4420f0b..4aea4133cf85 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -23,8 +23,12 @@ CVBooster <- R6::R6Class( #' @description Cross validation logic used by LightGBM #' @inheritParams lgb_shared_params #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. If not NULL, will set to dataset +#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight vector of response values. If not NULL, will set to dataset. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified @@ -32,10 +36,13 @@ CVBooster <- R6::R6Class( #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. -#' @param colnames feature names, if not null, will use this to overwrite the names in dataset +#' @param colnames feature names, if not null, will use this to overwrite the names in dataset. +#' Not supported for `data.frame` inputs. #' @param categorical_feature categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. #' \code{c(1L, 10L)} to say "the first and tenth columns"). +#' Not supported for `data.frame` inputs as for them it will determine this automatically +#' according to the column type (see the documentation of \link{lgb.Dataset} for details). #' @param callbacks List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets @@ -99,6 +106,13 @@ lgb.cv <- function(params = list() # If 'data' is not an lgb.Dataset, try to construct one using 'label' if (!lgb.is.Dataset(x = data)) { + if (inherits(data, "data.frame")) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, NULL, + substitute(label), substitute(weight), NULL, + environment() + ) + } if (is.null(label)) { stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") } diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index e2df9063ed26..03473ff8fe24 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -2,9 +2,10 @@ #' @title Shared parameter docs #' @description Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm} #' @param callbacks List of callback functions that are applied at each iteration. -#' @param data a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -#' may allow you to pass other types of data like \code{matrix} and then separately supply -#' \code{label} as a keyword argument. +#' @param data a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +#' and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +#' separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +#' for more details. #' @param early_stopping_rounds int. Activates early stopping. 
When this parameter is non-null, #' training will stop if the evaluation of any metric on any validation set #' fails to improve for \code{early_stopping_rounds} consecutive boosting rounds. @@ -73,8 +74,12 @@ NULL #' @title Train a LightGBM model #' @description Simple interface for training a LightGBM model. #' @inheritParams lgb_shared_params -#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. If not NULL, will set to dataset +#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight vector of response values. If not NULL, will set to dataset. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. #' @param save_name File name to use when writing the trained model to disk. Should end in ".model". #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' \itemize{ @@ -84,10 +89,13 @@ NULL #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} #' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} #' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} -#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} +#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset. +#' Not supported for `data.frame` inputs.} #' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to -#' say "the first and tenth columns").} +#' say "the first and tenth columns"). +#' Not supported for `data.frame` inputs as for them it will determine this automatically +#' according to the column type (see the documentation of \link{lgb.Dataset} for details).} #' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets} #' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} @@ -120,14 +128,21 @@ lightgbm <- function(data, stop("nrounds should be greater than zero") } - # Set data to a temporary variable - dtrain <- data - # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually - if (!lgb.is.Dataset(x = dtrain)) { - dtrain <- lgb.Dataset(data = data, label = label, weight = weight) + if (!lgb.is.Dataset(x = data)) { + if (inherits(data, "data.frame")) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, NULL, + substitute(label), substitute(weight), NULL, + environment() + ) + } + data <- lgb.Dataset(data = data, label = label, weight = weight) } + # Set data to a temporary variable + dtrain <- data + train_args <- list( "params" = params , "data" = dtrain diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4a5abcf78f2c..3afa565d5b6a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -12,11 +12,21 @@ lgb.Dataset( categorical_feature = NULL, free_raw_data = TRUE, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ... 
) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} +\item{data}{a \code{matrix} object, a \code{data.frame} object, a \code{dgCMatrix} object, +or a character representing a filename. + +If passing a `data.frame`, will assume that columns are numeric if they are of types +numeric, integer, logical, Date, or POSIXct; and will assume they are categorical if +they are of types factor or character (ordered factors are taken as unordered). +Other column types are not supported.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ @@ -25,7 +35,9 @@ and valid values.} \item{reference}{reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning continuous features into histograms. If you want to apply the same bin boundaries from an existing -dataset to new \code{data}, pass that existing Dataset to this argument.} +dataset to new \code{data}, pass that existing Dataset to this argument. If the reference passed +was constructed from a `data.frame`, will also take its column names, column order, column types, +and levels of factor columns.} \item{colnames}{names of columns} @@ -41,6 +53,24 @@ change the Dataset object after construction, set \code{free_raw_data = FALSE}.} \item{info}{a list of information of the \code{lgb.Dataset} object} +\item{label}{Label of the data (target variable). Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{weight}{Weight for each instance/observation. Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{init_score}{Init score for Dataset. Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{group}{Group/query data, as integer vector. Only used in the learning-to-rank task. +sum(group) = nrow(data). +For example, if you have a 100-document dataset with `group = c(10, 20, 40, 10, 10, 10)`, +that means that you have 6 groups, where the first 10 records are in the first group, +records 11-30 are in the second group, records 31-70 are in the third group, etc.} + \item{...}{other information to pass to \code{info} or parameters pass to \code{params}} } \value{ diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index ec606d880ac6..70bd258bc90c 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -32,17 +32,22 @@ lgb.cv( \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. 
See the documentation of \link{lgb.Dataset} +for more details.} \item{nrounds}{number of training rounds} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} -\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} +\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{weight}{vector of response values. If not NULL, will set to dataset. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} \item{obj}{objective function, can be character or custom objective function. Examples include \code{regression}, \code{regression_l1}, \code{huber}, @@ -99,11 +104,14 @@ the \code{nfold} and \code{stratified} parameters are ignored.} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} -\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset} +\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset. +Not supported for `data.frame` inputs.} \item{categorical_feature}{categorical features. This can either be a character vector of feature names or an integer vector with the indices of the features (e.g. -\code{c(1L, 10L)} to say "the first and tenth columns").} +\code{c(1L, 10L)} to say "the first and tenth columns"). +Not supported for `data.frame` inputs as for them it will determine this automatically +according to the column type (see the documentation of \link{lgb.Dataset} for details).} \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, training will stop if the evaluation of any metric on any validation set diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 40c7135d3b26..105682fe75a5 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -27,9 +27,10 @@ lgb.train( \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} \item{nrounds}{number of training rounds} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 6512dbc6b23a..6d5793a0d566 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -20,13 +20,18 @@ lightgbm( ) } \arguments{ -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. 
Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} -\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} +\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{weight}{vector of response values. If not NULL, will set to dataset. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} @@ -57,10 +62,13 @@ set to the iteration number of the best iteration.} \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} - \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} + \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset. + Not supported for `data.frame` inputs.} \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to - say "the first and tenth columns").} + say "the first and tenth columns"). + Not supported for `data.frame` inputs as for them it will determine this automatically + according to the column type (see the documentation of \link{lgb.Dataset} for details).} \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets} \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index c762e778602b..b6cdf2e3ba78 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -13,7 +13,6 @@ train <- agaricus.train test <- agaricus.test TOLERANCE <- 1e-6 -set.seed(708L) # [description] Every time this function is called, it adds 0.1 # to an accumulator then returns the current value. 
@@ -50,18 +49,22 @@ CONSTANT_METRIC_VALUE <- 0.2 } # sample datasets to test early stopping +set.seed(708L) DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) , label = rnorm(100L) ) +set.seed(708L) DVALID_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) , label = rnorm(50L) ) +set.seed(708L) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 120L, replace = TRUE) ) +set.seed(708L) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 37L, replace = TRUE) @@ -1040,8 +1043,8 @@ test_that("lgb.train() works when a mixture of functions and strings are passed # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9165341) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8400348) < TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( @@ -1091,10 +1094,10 @@ test_that("lgb.train() works when a list of strings or a character vector is pas # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5405405) < TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.699359) < TOLERANCE) } } }) @@ -1126,8 +1129,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5405405) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.699359) < TOLERANCE) }) test_that("lgb.train() works when you give a function for eval", { @@ -1537,8 +1540,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5161012) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7007832) < TOLERANCE) # all boosters should have been created expect_length(bst$boosters, nfolds) @@ -1675,6 +1678,101 @@ test_that("early stopping works with lgb.cv()", { ) }) +test_that("lgb.train() works correctly with data frames", { + data(mtcars) + y <- mtcars$mpg + X <- mtcars[,-1] + # adding fake categorical features + X[["cyl"]] <- paste0("cyl", X[["cyl"]]) + X[["gear"]] <- paste0("gear", 
X[["gear"]]) + X[["carb"]] <- paste0("carb", X[["carb"]]) + + # fitting a model + model <- lightgbm(data=X, label=y, + params=list(objective="regression", min_data=1), + verbose=-1) + pred <- predict(model, X) + + # checking that the columns are re-ordered if needed + X <- X[, rev(names(X))] + pred_new <- predict(model, X) + expect_equal(pred, pred_new) + + # now try altering the categorical encodings + X[["cyl"]] <- factor(X[["cyl"]], rev(unique(X[["cyl"]]))) + X[["gear"]] <- factor(X[["gear"]], rev(unique(X[["gear"]]))) + X[["carb"]] <- factor(X[["carb"]], rev(unique(X[["carb"]]))) + + # check that predictions are still the same + pred_new <- predict(model, X) + expect_equal(pred, pred_new) + + # now alter it in an incompatible way + X[["cyl"]] <- seq(1, nrow(X)) + X[["gear"]] <- seq(1, nrow(X)) + X[["carb"]] <- seq(1, nrow(X)) + + # check that the results were altered + pred_new <- predict(model, X) + diff <- pred - pred_new + diff <- diff %*% diff + expect_true(diff > .Machine$double.eps) + + # check that the results match when using other functions + X_lgb <- lgb.Dataset(mtcars[,-1]) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred <- predict(model_new, mtcars[,-1]) + pred_new <- predict(model_new, as.matrix(mtcars[,-1])) + expect_equal(pred, pred_new) + + pred_new <- model_new$predict(mtcars[,-1]) + expect_equal(pred, pred_new) + + # check that it throws an error when there's mising columns + expect_error(predict(model, mtcars[, 3:4])) + + # check that it accepts data frames even when fitting to matrices + X <- mtcars[,-1] + model_new <- lightgbm(data=as.matrix(X), label=y, + params=list(objective="regression", min_data=1), + verbose=-1) + pred <- predict(model_new, as.matrix(X)) + pred_new <- predict(model_new, X) + expect_equal(pred, pred_new) + + # verify that labels and weights can be passed as column names + X_lgb <- lgb.Dataset(mtcars[,-1]) + model <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred <- predict(model, mtcars[,-1]) + + X_lgb_new <- lgb.Dataset(mtcars, label=mpg) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + pred_new <- predict(model, mtcars[,-1]) + expect_equal(pred, pred_new) + X_lgb_new <- lgb.Dataset(mtcars, label="mpg") + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + + X_lgb_new <- lgb.Dataset(mtcars, label=mpg, weight=rep(1, nrow(mtcars))) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + + model_new <- lightgbm(params=list(objective="regression", min_data=1), + data=mtcars, label=mpg, weight=rep(1, nrow(mtcars)), + verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) +}) + context("linear learner") test_that("lgb.train() fit on linearly-relatead data improves when using linear learners", { From 7bd4ed6b94aaac2210b7f26dd09df282359cc4fd Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 22:54:35 -0400 Subject: [PATCH 2/7] solve linter complains --- R-package/R/lgb.Dataset.R | 34 ++++++------- R-package/tests/testthat/test_basic.R | 72 +++++++++++++-------------- 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/R-package/R/lgb.Dataset.R 
b/R-package/R/lgb.Dataset.R index 7cdc967fe4e2..a2d5cf2ed1ed 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -98,16 +98,16 @@ Dataset <- R6::R6Class( cols_char <- sapply(data, is.character) if (any(cols_char)) { names_cols_char <- names(data)[cols_char] - data[, (names_cols_char) := lapply(.SD, factor), .SDcols=names_cols_char] + data[, (names_cols_char) := lapply(.SD, factor), .SDcols = names_cols_char] } cols_factor <- sapply(data, is.factor) if (any(cols_factor)) { categorical_feature <- names(data)[cols_factor] - data[, (categorical_feature) := lapply(.SD, factor), .SDcols=categorical_feature] - private$factor_levels <- lapply(data[, categorical_feature, with=FALSE], levels) + data[, (categorical_feature) := lapply(.SD, factor), .SDcols = categorical_feature] + private$factor_levels <- lapply(data[, categorical_feature, with = FALSE], levels) data[ - , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x))-1) - , .SDcols=categorical_feature + , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , .SDcols = categorical_feature ] } @@ -504,19 +504,19 @@ Dataset <- R6::R6Class( process_data_frame_columns = function(data, colnames, categorical_feature, factor_levels) { data <- as.data.table(data) if (!is.null(colnames)) - data <- data[, colnames, with=FALSE] + data <- data[, colnames, with = FALSE] if (!is.null(factor_levels)) { data[ , (categorical_feature) := mapply( function(col, levs) factor(col, levs), - .SD, factor_levels, SIMPLIFY=FALSE + .SD, factor_levels, SIMPLIFY = FALSE ) - , .SDcols=categorical_feature + , .SDcols = categorical_feature ] data[ - , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x)) - 1) - , .SDcols=categorical_feature + , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , .SDcols = categorical_feature ] } else { if (any(sapply(data, function(x) is.character(x) || is.factor(x)))) @@ -829,23 +829,23 @@ Dataset <- R6::R6Class( env_where_to_substitute) { - check_is_df_col = function(var, var_name, data) { - var_name <- head(as.character(var_name), 1) + check_is_df_col <- function(var, var_name, data) { + var_name <- head(as.character(var_name), 1L) if (inherits(data, "data.frame") && NROW(var_name) && var_name != "NULL") { if (var_name %in% names(data)) { var <- data[[var_name]] - data <- as.data.table(data)[, setdiff(names(data), var_name), with=FALSE] + data <- as.data.table(data)[, setdiff(names(data), var_name), with = FALSE] } else if (is.character(var) && NROW(var) == 1L && var %in% names(data)) { var <- data[[var]] - data <- as.data.table(data)[, setdiff(names(data), var), with=FALSE] + data <- as.data.table(data)[, setdiff(names(data), var), with = FALSE] } } return(list(var, data)) } - label_name <- head(as.character(label_name), 1) - weight_name <- head(as.character(weight_name), 1) - init_score_name <- head(as.character(init_score_name), 1) + label_name <- head(as.character(label_name), 1L) + weight_name <- head(as.character(weight_name), 1L) + init_score_name <- head(as.character(init_score_name), 1L) temp <- check_is_df_col(label, label_name, data) label <- temp[[1L]] diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b6cdf2e3ba78..f33e79ff5d19 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1681,16 +1681,16 @@ test_that("early stopping works with 
lgb.cv()", { test_that("lgb.train() works correctly with data frames", { data(mtcars) y <- mtcars$mpg - X <- mtcars[,-1] + X <- mtcars[, -1L] # adding fake categorical features X[["cyl"]] <- paste0("cyl", X[["cyl"]]) X[["gear"]] <- paste0("gear", X[["gear"]]) X[["carb"]] <- paste0("carb", X[["carb"]]) # fitting a model - model <- lightgbm(data=X, label=y, - params=list(objective="regression", min_data=1), - verbose=-1) + model <- lightgbm(data = X, label = y, + params = list(objective = "regression", min_data = 1L), + verbose = -1L) pred <- predict(model, X) # checking that the columns are re-ordered if needed @@ -1708,9 +1708,9 @@ test_that("lgb.train() works correctly with data frames", { expect_equal(pred, pred_new) # now alter it in an incompatible way - X[["cyl"]] <- seq(1, nrow(X)) - X[["gear"]] <- seq(1, nrow(X)) - X[["carb"]] <- seq(1, nrow(X)) + X[["cyl"]] <- seq(1.0, nrow(X)) + X[["gear"]] <- seq(1.0, nrow(X)) + X[["carb"]] <- seq(1.0, nrow(X)) # check that the results were altered pred_new <- predict(model, X) @@ -1719,56 +1719,56 @@ test_that("lgb.train() works correctly with data frames", { expect_true(diff > .Machine$double.eps) # check that the results match when using other functions - X_lgb <- lgb.Dataset(mtcars[,-1]) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) - pred <- predict(model_new, mtcars[,-1]) - pred_new <- predict(model_new, as.matrix(mtcars[,-1])) + X_lgb <- lgb.Dataset(mtcars[, -1L]) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) + pred <- predict(model_new, mtcars[ ,-1L]) + pred_new <- predict(model_new, as.matrix(mtcars[, -1L])) expect_equal(pred, pred_new) - pred_new <- model_new$predict(mtcars[,-1]) + pred_new <- model_new$predict(mtcars[, -1L]) expect_equal(pred, pred_new) # check that it throws an error when there's mising columns - expect_error(predict(model, mtcars[, 3:4])) + expect_error(predict(model, mtcars[, 3L:4L])) # check that it accepts data frames even when fitting to matrices - X <- mtcars[,-1] - model_new <- lightgbm(data=as.matrix(X), label=y, - params=list(objective="regression", min_data=1), - verbose=-1) + X <- mtcars[, -1L] + model_new <- lightgbm(data = as.matrix(X), label = y, + params = list(objective = "regression", min_data = 1L), + verbose = -1L) pred <- predict(model_new, as.matrix(X)) pred_new <- predict(model_new, X) expect_equal(pred, pred_new) # verify that labels and weights can be passed as column names - X_lgb <- lgb.Dataset(mtcars[,-1]) - model <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) - pred <- predict(model, mtcars[,-1]) - - X_lgb_new <- lgb.Dataset(mtcars, label=mpg) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb <- lgb.Dataset(mtcars[, -1L]) + model <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) + pred <- predict(model, mtcars[, -1L]) + + X_lgb_new <- lgb.Dataset(mtcars, label = mpg) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - pred_new <- predict(model, mtcars[,-1]) + pred_new <- predict(model, mtcars[, -1L]) expect_equal(pred, pred_new) - X_lgb_new <- lgb.Dataset(mtcars, label="mpg") - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb_new <- 
lgb.Dataset(mtcars, label = "mpg") + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - X_lgb_new <- lgb.Dataset(mtcars, label=mpg, weight=rep(1, nrow(mtcars))) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb_new <- lgb.Dataset(mtcars, label = mpg, weight = rep(1.0, nrow(mtcars))) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - model_new <- lightgbm(params=list(objective="regression", min_data=1), - data=mtcars, label=mpg, weight=rep(1, nrow(mtcars)), - verbose=-1) + model_new <- lightgbm(params = list(objective = "regression", min_data = 1L), + data = mtcars, label = mpg, weight = rep(1.0, nrow(mtcars)), + verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) }) From f90304a837bca49506e93d0b03032f6d9e2ec143 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:05:31 -0400 Subject: [PATCH 3/7] attempt at solving issues with random seed reproducibility in different OSes --- R-package/tests/testthat/test_basic.R | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index f33e79ff5d19..a813c20fd544 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -50,24 +50,32 @@ CONSTANT_METRIC_VALUE <- 0.2 # sample datasets to test early stopping set.seed(708L) +data <- as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) +label <- rnorm(100L) DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) - , label = rnorm(100L) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) +label <- rnorm(50L) DVALID_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) - , label = rnorm(50L) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) +label <- sample(c(0L, 1L), size = 120L, replace = TRUE) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 120L, replace = TRUE) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) +label <- sample(c(0L, 1L), size = 37L, replace = TRUE) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 37L, replace = TRUE) + data = data + , label = label ) test_that("train and predict binary classification", { From a911c37aa023d4dbbef6b9a030a87438d1e4ea49 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:09:51 -0400 Subject: [PATCH 4/7] more linter complains --- R-package/R/lgb.Dataset.R | 16 ++++++++++++++-- R-package/tests/testthat/test_basic.R | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index a2d5cf2ed1ed..14731838b8c5 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -105,8 +105,14 @@ Dataset <- R6::R6Class( categorical_feature <- names(data)[cols_factor] data[, (categorical_feature) := lapply(.SD, factor), .SDcols = categorical_feature] private$factor_levels <- 
lapply(data[, categorical_feature, with = FALSE], levels) + encode_categ <- function(x) { + x <- as.numeric(x) + x[is.na(x)] <- 0.0 + x <- x - 1.0 + return(x) + } data[ - , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , (categorical_feature) := lapply(.SD, encode_categ) , .SDcols = categorical_feature ] } @@ -514,8 +520,14 @@ Dataset <- R6::R6Class( ) , .SDcols = categorical_feature ] + encode_categ <- function(x) { + x <- as.numeric(x) + x[is.na(x)] <- 0.0 + x <- x - 1.0 + return(x) + } data[ - , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , (categorical_feature) := lapply(.SD, function(x) encode_categ) , .SDcols = categorical_feature ] } else { diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index a813c20fd544..399fbc285942 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1730,7 +1730,7 @@ test_that("lgb.train() works correctly with data frames", { X_lgb <- lgb.Dataset(mtcars[, -1L]) model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), data = X_lgb, verbose = -1L) - pred <- predict(model_new, mtcars[ ,-1L]) + pred <- predict(model_new, mtcars[, -1L]) pred_new <- predict(model_new, as.matrix(mtcars[, -1L])) expect_equal(pred, pred_new) From d6671dfc2bd1a131ccdae47f1945d1624fbde0bb Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:25:32 -0400 Subject: [PATCH 5/7] missing file --- R-package/man/lgb_shared_params.Rd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index e5288e0450ae..38845bed1a98 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -6,9 +6,10 @@ \arguments{ \item{callbacks}{List of callback functions that are applied at each iteration.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} \item{early_stopping_rounds}{int. Activates early stopping. 
When this parameter is non-null, training will stop if the evaluation of any metric on any validation set From 08d8a344ad77e366932933aaa95fdefead9f3c92 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 21 Apr 2021 00:13:00 -0400 Subject: [PATCH 6/7] fix error --- R-package/R/lgb.Dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 14731838b8c5..2e1cd3b26b9c 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -527,7 +527,7 @@ Dataset <- R6::R6Class( return(x) } data[ - , (categorical_feature) := lapply(.SD, function(x) encode_categ) + , (categorical_feature) := lapply(.SD, encode_categ) , .SDcols = categorical_feature ] } else { From 9fb2ffa2a78899d0c93c6cb1b53718b1ba49c307 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 21 Apr 2021 19:05:41 -0400 Subject: [PATCH 7/7] revert corrections --- R-package/R/lgb.Dataset.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 2e1cd3b26b9c..a28b50d4591c 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -195,7 +195,7 @@ Dataset <- R6::R6Class( cnames <- colnames(private$raw_data) } - # set feature names if they don't exist + # set feature names if they not exist if (is.null(private$colnames) && !is.null(cnames)) { private$colnames <- as.character(cnames) } @@ -287,7 +287,7 @@ Dataset <- R6::R6Class( ) } else if (methods::is(private$raw_data, "dgCMatrix")) { - if (length(private$raw_data@p) > .Machine$integer.max) { + if (length(private$raw_data@p) > 2147483647L) { stop("Cannot support large CSC matrix") } # Are we using a dgCMatrix (sparsed matrix column compressed)
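
Usage sketch (not part of any patch above): assuming a build of the R package with this patch series applied, the data.frame interface it introduces would be exercised as below. This mirrors the mtcars-based test added in [PATCH 1/7]; min_data = 1L is only needed because mtcars is tiny.

library(lightgbm)
data(mtcars)
y <- mtcars$mpg
X <- mtcars[, -1L]
# character / factor columns are detected automatically and encoded as
# categorical features; no 'categorical_feature' argument is needed
X[["cyl"]] <- paste0("cyl", X[["cyl"]])
model <- lightgbm(
  data = X
  , label = y
  , params = list(objective = "regression", min_data = 1L)
  , verbose = -1L
)
# predict() accepts a data.frame directly: columns are re-ordered and factor
# levels re-encoded to match what the Dataset recorded at training time
pred <- predict(model, X[, rev(names(X))])
# when 'data' is a data.frame, the label may also be given as a column name
dtrain <- lgb.Dataset(mtcars, label = "mpg")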