From e398823a9aeddd4afd66c3fbaf0550b5a286416f Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 21:35:15 -0400 Subject: [PATCH 1/7] accept data frames as inputs --- R-package/R/lgb.Booster.R | 28 ++++ R-package/R/lgb.Dataset.R | 209 ++++++++++++++++++++++++-- R-package/R/lgb.cv.R | 20 ++- R-package/R/lightgbm.R | 39 +++-- R-package/man/lgb.Dataset.Rd | 34 ++++- R-package/man/lgb.cv.Rd | 22 ++- R-package/man/lgb.train.Rd | 7 +- R-package/man/lightgbm.Rd | 22 ++- R-package/tests/testthat/test_basic.R | 116 ++++++++++++-- 9 files changed, 440 insertions(+), 57 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index dc912477cd76..552f04e777bb 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -62,6 +62,14 @@ Booster <- R6::R6Class( private$num_dataset <- 1L private$init_predictor <- train_set$.__enclos_env__$private$predictor + # For processing predictions on data frames + if (train_set$get_is_from_data_frame()) { + private$is_from_data_frame <- TRUE + private$colnames <- train_set$get_colnames() + private$categorical_feature <- train_set$get_categorical_feature() + private$factor_levels <- train_set$get_factor_levels() + } + # Check if predictor is existing if (!is.null(private$init_predictor)) { @@ -524,6 +532,21 @@ Booster <- R6::R6Class( start_iteration <- 0L } + # Process data frame if required + if (is.data.frame(data)) { + if (private$is_from_data_frame) { + data <- Dataset$public_methods$process_data_frame_columns( + data, + private$colnames, + private$categorical_feature, + private$factor_levels + ) + } else { + data <- as.matrix(data) + mode(data) <- "double" + } + } + # Predict on new data predictor <- Predictor$new(private$handle, ...) return( @@ -575,6 +598,11 @@ Booster <- R6::R6Class( higher_better_inner_eval = NULL, set_objective_to_none = FALSE, train_set_version = 0L, + # For processing predictions on data frames + is_from_data_frame = FALSE, + colnames = NULL, + categorical_feature = NULL, + factor_levels = NULL, # Predict data inner_predict = function(idx) { diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index c8ffb837080b..7cdc967fe4e2 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -32,6 +32,10 @@ Dataset <- R6::R6Class( free_raw_data = TRUE, used_indices = NULL, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ...) { # validate inputs early to avoid unnecessary computation @@ -42,28 +46,86 @@ Dataset <- R6::R6Class( stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor")) } + # Create known attributes list + if (!is.null(label)) info[["label"]] <- label + if (!is.null(weight)) info[["weight"]] <- weight + if (!is.null(init_score)) info[["init_score"]] <- init_score + if (!is.null(group)) info[["group"]] <- group + # Check for additional parameters additional_params <- list(...) 
- # Create known attributes list - INFO_KEYS <- c("label", "weight", "init_score", "group") - # Check if attribute key is in the known attribute list for (key in names(additional_params)) { - # Key existing - if (key %in% INFO_KEYS) { + # Store as param + params[[key]] <- additional_params[[key]] - # Store as info - info[[key]] <- additional_params[[key]] + } - } else { + # If it's a data.frame, will keep track of the categorical encodings + if (inherits(data, "data.frame")) { + + if (!nrow(data) || !ncol(data)) + stop("'data' is empty.") + + if (is.null(reference)) { + + # Factors are taken directly in data frames, so should not be supplied + if (!is.null(categorical_feature)) + stop("Cannot pass 'categorical_feature' for data.frame. Categorical features should be factor columns.") + + # Column names will also be taken directly + if (!is.null(colnames)) + stop("Cannot pass 'colnames' for data.frame. Column names will be taken from it directly.") + colnames <- names(data) + + # First check if the column types are all numeric or categorical + supported_coltypes <- c("numeric", "integer", "logical", "character", "factor", "POSIXct", "Date") + coltype_is_unsupported <- sapply(data, function(x) !inherits(x, supported_coltypes)) + if (any(coltype_is_unsupported)) + stop("'data' contains unsupported column types.") + + # Ordered factors are not supported, so it will warn if there's any + has_ordered_factor <- sapply(data, is.ordered) + if (any(has_ordered_factor)) + warning("Warning: ordered factors are not supported, will interpret them as unordered.") - # Store as param - params[[key]] <- additional_params[[key]] + # For faster conversions between types + data <- data.table::as.data.table(data) + # Now see if there are any categorical columns that will be encoded + cols_char <- sapply(data, is.character) + if (any(cols_char)) { + names_cols_char <- names(data)[cols_char] + data[, (names_cols_char) := lapply(.SD, factor), .SDcols=names_cols_char] + } + cols_factor <- sapply(data, is.factor) + if (any(cols_factor)) { + categorical_feature <- names(data)[cols_factor] + data[, (categorical_feature) := lapply(.SD, factor), .SDcols=categorical_feature] + private$factor_levels <- lapply(data[, categorical_feature, with=FALSE], levels) + data[ + , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x))-1) + , .SDcols=categorical_feature + ] + } + + # Finally, convert all columns to numeric and turn it into a matrix + data <- as.matrix(data[, lapply(.SD, as.numeric)]) + + } else { + + # When passing a reference, will take the columns and categorical encodings from it instead + data <- self$process_data_frame_columns( + data, + reference$get_colnames(), + reference$get_categorical_feature(), + reference$get_factor_levels() + ) } + private$is_from_data_frame <- TRUE } # Check for matrix format @@ -127,7 +189,7 @@ Dataset <- R6::R6Class( cnames <- colnames(private$raw_data) } - # set feature names if not exist + # set feature names if they don't exist if (is.null(private$colnames) && !is.null(cnames)) { private$colnames <- as.character(cnames) } @@ -219,7 +281,7 @@ Dataset <- R6::R6Class( ) } else if (methods::is(private$raw_data, "dgCMatrix")) { - if (length(private$raw_data@p) > 2147483647L) { + if (length(private$raw_data@p) > .Machine$integer.max) { stop("Cannot support large CSC matrix") } # Are we using a dgCMatrix (sparsed matrix column compressed) @@ -426,6 +488,43 @@ Dataset <- R6::R6Class( }, + # Get levels used to encode factor variables in data frames + 
get_factor_levels = function() { + return(private$factor_levels) + }, + + get_categorical_feature = function() { + return(private$categorical_feature) + }, + + get_is_from_data_frame = function() { + return(private$is_from_data_frame) + }, + + process_data_frame_columns = function(data, colnames, categorical_feature, factor_levels) { + data <- as.data.table(data) + if (!is.null(colnames)) + data <- data[, colnames, with=FALSE] + if (!is.null(factor_levels)) { + data[ + , (categorical_feature) + := mapply( + function(col, levs) factor(col, levs), + .SD, factor_levels, SIMPLIFY=FALSE + ) + , .SDcols=categorical_feature + ] + data[ + , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x)) - 1) + , .SDcols=categorical_feature + ] + } else { + if (any(sapply(data, function(x) is.character(x) || is.factor(x)))) + stop("'data' contains categorical columns, but 'reference' did not have encodings for them.") + } + return(as.matrix(data[, lapply(.SD, as.numeric)])) + }, + # Get information getinfo = function(name) { @@ -674,6 +773,8 @@ Dataset <- R6::R6Class( reference = NULL, colnames = NULL, categorical_feature = NULL, + factor_levels = NULL, + is_from_data_frame = FALSE, predictor = NULL, free_raw_data = TRUE, used_indices = NULL, @@ -721,6 +822,48 @@ Dataset <- R6::R6Class( self$finalize() return(invisible(self)) + }, + + substitute_from_df_cols = function(data, label, weight, init_score, + label_name, weight_name, init_score_name, + env_where_to_substitute) { + + + check_is_df_col = function(var, var_name, data) { + var_name <- head(as.character(var_name), 1) + if (inherits(data, "data.frame") && NROW(var_name) && var_name != "NULL") { + if (var_name %in% names(data)) { + var <- data[[var_name]] + data <- as.data.table(data)[, setdiff(names(data), var_name), with=FALSE] + } else if (is.character(var) && NROW(var) == 1L && var %in% names(data)) { + var <- data[[var]] + data <- as.data.table(data)[, setdiff(names(data), var), with=FALSE] + } + } + return(list(var, data)) + } + + label_name <- head(as.character(label_name), 1) + weight_name <- head(as.character(weight_name), 1) + init_score_name <- head(as.character(init_score_name), 1) + + temp <- check_is_df_col(label, label_name, data) + label <- temp[[1L]] + data <- temp[[2L]] + + temp <- check_is_df_col(weight, weight_name, data) + weight <- temp[[1L]] + data <- temp[[2L]] + + temp <- check_is_df_col(init_score, init_score_name, data) + init_score <- temp[[1L]] + data <- temp[[2L]] + + env_where_to_substitute$data <- data + env_where_to_substitute$label <- label + env_where_to_substitute$weight <- weight + env_where_to_substitute$init_score <- init_score + return(NULL) } ) @@ -729,14 +872,22 @@ Dataset <- R6::R6Class( #' @title Construct \code{lgb.Dataset} object #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename +#' @param data a \code{matrix} object, a \code{data.frame} object, a \code{dgCMatrix} object, +#' or a character representing a filename. +#' +#' If passing a `data.frame`, will assume that columns are numeric if they are of types +#' numeric, integer, logical, Date, or POSIXct; and will assume they are categorical if +#' they are of types factor or character (ordered factors are taken as unordered). +#' Other column types are not supported. #' @param params a list of parameters. 
See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ #' The "Dataset Parameters" section of the documentation} for a list of parameters #' and valid values. #' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning #' continuous features into histograms. If you want to apply the same bin boundaries from an existing -#' dataset to new \code{data}, pass that existing Dataset to this argument. +#' dataset to new \code{data}, pass that existing Dataset to this argument. If the reference passed +#' was constructed from a `data.frame`, will also take its column names, column order, column types, +#' and levels of factor columns. #' @param colnames names of columns #' @param categorical_feature categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. @@ -747,6 +898,20 @@ Dataset <- R6::R6Class( #' cannot be changed after it has been constructed. If you'd prefer to be able to #' change the Dataset object after construction, set \code{free_raw_data = FALSE}. #' @param info a list of information of the \code{lgb.Dataset} object +#' @param label Label of the data (target variable). Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight Weight for each instance/observation. Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param init_score Init score for Dataset. Should be a numeric vector. +#' If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param group Group/query data, as integer vector. Only used in the learning-to-rank task. +#' sum(group) = nrow(data). +#' For example, if you have a 100-document dataset with `group = c(10, 20, 40, 10, 10, 10)`, +#' that means that you have 6 groups, where the first 10 records are in the first group, +#' records 11-30 are in the second group, records 31-70 are in the third group, etc. #' @param ... other information to pass to \code{info} or parameters pass to \code{params} #' #' @return constructed dataset @@ -769,7 +934,19 @@ lgb.Dataset <- function(data, categorical_feature = NULL, free_raw_data = TRUE, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ...) { + # Take variables from column names if appropriate + if (is.data.frame(data)) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, init_score, + substitute(label), substitute(weight), substitute(init_score), + environment() + ) + } # Create new dataset return( @@ -783,6 +960,10 @@ lgb.Dataset <- function(data, , free_raw_data = free_raw_data , used_indices = NULL , info = info + , label = label + , weight = weight + , init_score = init_score + , group = group , ... )) ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 7d39c4420f0b..4aea4133cf85 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -23,8 +23,12 @@ CVBooster <- R6::R6Class( #' @description Cross validation logic used by LightGBM #' @inheritParams lgb_shared_params #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. If not NULL, will set to dataset +#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight vector of response values. If not NULL, will set to dataset. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified @@ -32,10 +36,13 @@ CVBooster <- R6::R6Class( #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. -#' @param colnames feature names, if not null, will use this to overwrite the names in dataset +#' @param colnames feature names, if not null, will use this to overwrite the names in dataset. +#' Not supported for `data.frame` inputs. #' @param categorical_feature categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. #' \code{c(1L, 10L)} to say "the first and tenth columns"). +#' Not supported for `data.frame` inputs as for them it will determine this automatically +#' according to the column type (see the documentation of \link{lgb.Dataset} for details). #' @param callbacks List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets @@ -99,6 +106,13 @@ lgb.cv <- function(params = list() # If 'data' is not an lgb.Dataset, try to construct one using 'label' if (!lgb.is.Dataset(x = data)) { + if (inherits(data, "data.frame")) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, NULL, + substitute(label), substitute(weight), NULL, + environment() + ) + } if (is.null(label)) { stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") } diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index e2df9063ed26..03473ff8fe24 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -2,9 +2,10 @@ #' @title Shared parameter docs #' @description Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm} #' @param callbacks List of callback functions that are applied at each iteration. -#' @param data a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -#' may allow you to pass other types of data like \code{matrix} and then separately supply -#' \code{label} as a keyword argument. +#' @param data a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +#' and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +#' separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +#' for more details. #' @param early_stopping_rounds int. Activates early stopping. 
When this parameter is non-null, #' training will stop if the evaluation of any metric on any validation set #' fails to improve for \code{early_stopping_rounds} consecutive boosting rounds. @@ -73,8 +74,12 @@ NULL #' @title Train a LightGBM model #' @description Simple interface for training a LightGBM model. #' @inheritParams lgb_shared_params -#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} -#' @param weight vector of response values. If not NULL, will set to dataset +#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. +#' @param weight vector of response values. If not NULL, will set to dataset. +#' If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +#' variable or as a name. #' @param save_name File name to use when writing the trained model to disk. Should end in ".model". #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' \itemize{ @@ -84,10 +89,13 @@ NULL #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} #' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} #' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} -#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} +#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset. +#' Not supported for `data.frame` inputs.} #' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature #' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to -#' say "the first and tenth columns").} +#' say "the first and tenth columns"). +#' Not supported for `data.frame` inputs as for them it will determine this automatically +#' according to the column type (see the documentation of \link{lgb.Dataset} for details).} #' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets} #' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} @@ -120,14 +128,21 @@ lightgbm <- function(data, stop("nrounds should be greater than zero") } - # Set data to a temporary variable - dtrain <- data - # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually - if (!lgb.is.Dataset(x = dtrain)) { - dtrain <- lgb.Dataset(data = data, label = label, weight = weight) + if (!lgb.is.Dataset(x = data)) { + if (inherits(data, "data.frame")) { + Dataset$private_methods$substitute_from_df_cols( + data, label, weight, NULL, + substitute(label), substitute(weight), NULL, + environment() + ) + } + data <- lgb.Dataset(data = data, label = label, weight = weight) } + # Set data to a temporary variable + dtrain <- data + train_args <- list( "params" = params , "data" = dtrain diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4a5abcf78f2c..3afa565d5b6a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -12,11 +12,21 @@ lgb.Dataset( categorical_feature = NULL, free_raw_data = TRUE, info = list(), + label = NULL, + weight = NULL, + init_score = NULL, + group = NULL, ... 
) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} +\item{data}{a \code{matrix} object, a \code{data.frame} object, a \code{dgCMatrix} object, +or a character representing a filename. + +If passing a `data.frame`, will assume that columns are numeric if they are of types +numeric, integer, logical, Date, or POSIXct; and will assume they are categorical if +they are of types factor or character (ordered factors are taken as unordered). +Other column types are not supported.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ @@ -25,7 +35,9 @@ and valid values.} \item{reference}{reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning continuous features into histograms. If you want to apply the same bin boundaries from an existing -dataset to new \code{data}, pass that existing Dataset to this argument.} +dataset to new \code{data}, pass that existing Dataset to this argument. If the reference passed +was constructed from a `data.frame`, will also take its column names, column order, column types, +and levels of factor columns.} \item{colnames}{names of columns} @@ -41,6 +53,24 @@ change the Dataset object after construction, set \code{free_raw_data = FALSE}.} \item{info}{a list of information of the \code{lgb.Dataset} object} +\item{label}{Label of the data (target variable). Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{weight}{Weight for each instance/observation. Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{init_score}{Init score for Dataset. Should be a numeric vector. +If `data` is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} + +\item{group}{Group/query data, as integer vector. Only used in the learning-to-rank task. +sum(group) = nrow(data). +For example, if you have a 100-document dataset with `group = c(10, 20, 40, 10, 10, 10)`, +that means that you have 6 groups, where the first 10 records are in the first group, +records 11-30 are in the second group, records 31-70 are in the third group, etc.} + \item{...}{other information to pass to \code{info} or parameters pass to \code{params}} } \value{ diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index ec606d880ac6..70bd258bc90c 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -32,17 +32,22 @@ lgb.cv( \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. 
See the documentation of \link{lgb.Dataset} +for more details.} \item{nrounds}{number of training rounds} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} -\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} +\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{weight}{vector of response values. If not NULL, will set to dataset. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} \item{obj}{objective function, can be character or custom objective function. Examples include \code{regression}, \code{regression_l1}, \code{huber}, @@ -99,11 +104,14 @@ the \code{nfold} and \code{stratified} parameters are ignored.} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} -\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset} +\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset. +Not supported for `data.frame` inputs.} \item{categorical_feature}{categorical features. This can either be a character vector of feature names or an integer vector with the indices of the features (e.g. -\code{c(1L, 10L)} to say "the first and tenth columns").} +\code{c(1L, 10L)} to say "the first and tenth columns"). +Not supported for `data.frame` inputs as for them it will determine this automatically +according to the column type (see the documentation of \link{lgb.Dataset} for details).} \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, training will stop if the evaluation of any metric on any validation set diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 40c7135d3b26..105682fe75a5 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -27,9 +27,10 @@ lgb.train( \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} \item{nrounds}{number of training rounds} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 6512dbc6b23a..6d5793a0d566 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -20,13 +20,18 @@ lightgbm( ) } \arguments{ -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. 
Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} -\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} +\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} -\item{weight}{vector of response values. If not NULL, will set to dataset} +\item{weight}{vector of response values. If not NULL, will set to dataset. +If \code{data} is a `data.frame`, can also specify it as a column name, passed either as a character +variable or as a name.} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{ the "Parameters" section of the documentation} for a list of parameters and valid values.} @@ -57,10 +62,13 @@ set to the iteration number of the best iteration.} \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} - \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset} + \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset. + Not supported for `data.frame` inputs.} \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to - say "the first and tenth columns").} + say "the first and tenth columns"). + Not supported for `data.frame` inputs as for them it will determine this automatically + according to the column type (see the documentation of \link{lgb.Dataset} for details).} \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets} \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.} diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index c762e778602b..b6cdf2e3ba78 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -13,7 +13,6 @@ train <- agaricus.train test <- agaricus.test TOLERANCE <- 1e-6 -set.seed(708L) # [description] Every time this function is called, it adds 0.1 # to an accumulator then returns the current value. 
@@ -50,18 +49,22 @@ CONSTANT_METRIC_VALUE <- 0.2 } # sample datasets to test early stopping +set.seed(708L) DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) , label = rnorm(100L) ) +set.seed(708L) DVALID_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) , label = rnorm(50L) ) +set.seed(708L) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 120L, replace = TRUE) ) +set.seed(708L) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 37L, replace = TRUE) @@ -1040,8 +1043,8 @@ test_that("lgb.train() works when a mixture of functions and strings are passed # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9165341) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8400348) < TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( @@ -1091,10 +1094,10 @@ test_that("lgb.train() works when a list of strings or a character vector is pas # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5405405) < TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.699359) < TOLERANCE) } } }) @@ -1126,8 +1129,8 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5405405) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.699359) < TOLERANCE) }) test_that("lgb.train() works when you give a function for eval", { @@ -1537,8 +1540,8 @@ test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5161012) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7007832) < TOLERANCE) # all boosters should have been created expect_length(bst$boosters, nfolds) @@ -1675,6 +1678,101 @@ test_that("early stopping works with lgb.cv()", { ) }) +test_that("lgb.train() works correctly with data frames", { + data(mtcars) + y <- mtcars$mpg + X <- mtcars[,-1] + # adding fake categorical features + X[["cyl"]] <- paste0("cyl", X[["cyl"]]) + X[["gear"]] <- paste0("gear", 
X[["gear"]]) + X[["carb"]] <- paste0("carb", X[["carb"]]) + + # fitting a model + model <- lightgbm(data=X, label=y, + params=list(objective="regression", min_data=1), + verbose=-1) + pred <- predict(model, X) + + # checking that the columns are re-ordered if needed + X <- X[, rev(names(X))] + pred_new <- predict(model, X) + expect_equal(pred, pred_new) + + # now try altering the categorical encodings + X[["cyl"]] <- factor(X[["cyl"]], rev(unique(X[["cyl"]]))) + X[["gear"]] <- factor(X[["gear"]], rev(unique(X[["gear"]]))) + X[["carb"]] <- factor(X[["carb"]], rev(unique(X[["carb"]]))) + + # check that predictions are still the same + pred_new <- predict(model, X) + expect_equal(pred, pred_new) + + # now alter it in an incompatible way + X[["cyl"]] <- seq(1, nrow(X)) + X[["gear"]] <- seq(1, nrow(X)) + X[["carb"]] <- seq(1, nrow(X)) + + # check that the results were altered + pred_new <- predict(model, X) + diff <- pred - pred_new + diff <- diff %*% diff + expect_true(diff > .Machine$double.eps) + + # check that the results match when using other functions + X_lgb <- lgb.Dataset(mtcars[,-1]) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred <- predict(model_new, mtcars[,-1]) + pred_new <- predict(model_new, as.matrix(mtcars[,-1])) + expect_equal(pred, pred_new) + + pred_new <- model_new$predict(mtcars[,-1]) + expect_equal(pred, pred_new) + + # check that it throws an error when there's mising columns + expect_error(predict(model, mtcars[, 3:4])) + + # check that it accepts data frames even when fitting to matrices + X <- mtcars[,-1] + model_new <- lightgbm(data=as.matrix(X), label=y, + params=list(objective="regression", min_data=1), + verbose=-1) + pred <- predict(model_new, as.matrix(X)) + pred_new <- predict(model_new, X) + expect_equal(pred, pred_new) + + # verify that labels and weights can be passed as column names + X_lgb <- lgb.Dataset(mtcars[,-1]) + model <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred <- predict(model, mtcars[,-1]) + + X_lgb_new <- lgb.Dataset(mtcars, label=mpg) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + pred_new <- predict(model, mtcars[,-1]) + expect_equal(pred, pred_new) + X_lgb_new <- lgb.Dataset(mtcars, label="mpg") + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + + X_lgb_new <- lgb.Dataset(mtcars, label=mpg, weight=rep(1, nrow(mtcars))) + model_new <- lgb.train(params=list(objective="regression", min_data=1), + data=X_lgb, verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) + + model_new <- lightgbm(params=list(objective="regression", min_data=1), + data=mtcars, label=mpg, weight=rep(1, nrow(mtcars)), + verbose=-1) + pred_new <- predict(model, mtcars) + expect_equal(pred, pred_new) +}) + context("linear learner") test_that("lgb.train() fit on linearly-relatead data improves when using linear learners", { From 7bd4ed6b94aaac2210b7f26dd09df282359cc4fd Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 22:54:35 -0400 Subject: [PATCH 2/7] solve linter complains --- R-package/R/lgb.Dataset.R | 34 ++++++------- R-package/tests/testthat/test_basic.R | 72 +++++++++++++-------------- 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/R-package/R/lgb.Dataset.R 
b/R-package/R/lgb.Dataset.R index 7cdc967fe4e2..a2d5cf2ed1ed 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -98,16 +98,16 @@ Dataset <- R6::R6Class( cols_char <- sapply(data, is.character) if (any(cols_char)) { names_cols_char <- names(data)[cols_char] - data[, (names_cols_char) := lapply(.SD, factor), .SDcols=names_cols_char] + data[, (names_cols_char) := lapply(.SD, factor), .SDcols = names_cols_char] } cols_factor <- sapply(data, is.factor) if (any(cols_factor)) { categorical_feature <- names(data)[cols_factor] - data[, (categorical_feature) := lapply(.SD, factor), .SDcols=categorical_feature] - private$factor_levels <- lapply(data[, categorical_feature, with=FALSE], levels) + data[, (categorical_feature) := lapply(.SD, factor), .SDcols = categorical_feature] + private$factor_levels <- lapply(data[, categorical_feature, with = FALSE], levels) data[ - , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x))-1) - , .SDcols=categorical_feature + , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , .SDcols = categorical_feature ] } @@ -504,19 +504,19 @@ Dataset <- R6::R6Class( process_data_frame_columns = function(data, colnames, categorical_feature, factor_levels) { data <- as.data.table(data) if (!is.null(colnames)) - data <- data[, colnames, with=FALSE] + data <- data[, colnames, with = FALSE] if (!is.null(factor_levels)) { data[ , (categorical_feature) := mapply( function(col, levs) factor(col, levs), - .SD, factor_levels, SIMPLIFY=FALSE + .SD, factor_levels, SIMPLIFY = FALSE ) - , .SDcols=categorical_feature + , .SDcols = categorical_feature ] data[ - , (categorical_feature) := lapply(.SD, function(x) ifelse(is.na(x), 0, as.numeric(x)) - 1) - , .SDcols=categorical_feature + , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , .SDcols = categorical_feature ] } else { if (any(sapply(data, function(x) is.character(x) || is.factor(x)))) @@ -829,23 +829,23 @@ Dataset <- R6::R6Class( env_where_to_substitute) { - check_is_df_col = function(var, var_name, data) { - var_name <- head(as.character(var_name), 1) + check_is_df_col <- function(var, var_name, data) { + var_name <- head(as.character(var_name), 1L) if (inherits(data, "data.frame") && NROW(var_name) && var_name != "NULL") { if (var_name %in% names(data)) { var <- data[[var_name]] - data <- as.data.table(data)[, setdiff(names(data), var_name), with=FALSE] + data <- as.data.table(data)[, setdiff(names(data), var_name), with = FALSE] } else if (is.character(var) && NROW(var) == 1L && var %in% names(data)) { var <- data[[var]] - data <- as.data.table(data)[, setdiff(names(data), var), with=FALSE] + data <- as.data.table(data)[, setdiff(names(data), var), with = FALSE] } } return(list(var, data)) } - label_name <- head(as.character(label_name), 1) - weight_name <- head(as.character(weight_name), 1) - init_score_name <- head(as.character(init_score_name), 1) + label_name <- head(as.character(label_name), 1L) + weight_name <- head(as.character(weight_name), 1L) + init_score_name <- head(as.character(init_score_name), 1L) temp <- check_is_df_col(label, label_name, data) label <- temp[[1L]] diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b6cdf2e3ba78..f33e79ff5d19 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1681,16 +1681,16 @@ test_that("early stopping works with 
lgb.cv()", { test_that("lgb.train() works correctly with data frames", { data(mtcars) y <- mtcars$mpg - X <- mtcars[,-1] + X <- mtcars[, -1L] # adding fake categorical features X[["cyl"]] <- paste0("cyl", X[["cyl"]]) X[["gear"]] <- paste0("gear", X[["gear"]]) X[["carb"]] <- paste0("carb", X[["carb"]]) # fitting a model - model <- lightgbm(data=X, label=y, - params=list(objective="regression", min_data=1), - verbose=-1) + model <- lightgbm(data = X, label = y, + params = list(objective = "regression", min_data = 1L), + verbose = -1L) pred <- predict(model, X) # checking that the columns are re-ordered if needed @@ -1708,9 +1708,9 @@ test_that("lgb.train() works correctly with data frames", { expect_equal(pred, pred_new) # now alter it in an incompatible way - X[["cyl"]] <- seq(1, nrow(X)) - X[["gear"]] <- seq(1, nrow(X)) - X[["carb"]] <- seq(1, nrow(X)) + X[["cyl"]] <- seq(1.0, nrow(X)) + X[["gear"]] <- seq(1.0, nrow(X)) + X[["carb"]] <- seq(1.0, nrow(X)) # check that the results were altered pred_new <- predict(model, X) @@ -1719,56 +1719,56 @@ test_that("lgb.train() works correctly with data frames", { expect_true(diff > .Machine$double.eps) # check that the results match when using other functions - X_lgb <- lgb.Dataset(mtcars[,-1]) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) - pred <- predict(model_new, mtcars[,-1]) - pred_new <- predict(model_new, as.matrix(mtcars[,-1])) + X_lgb <- lgb.Dataset(mtcars[, -1L]) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) + pred <- predict(model_new, mtcars[ ,-1L]) + pred_new <- predict(model_new, as.matrix(mtcars[, -1L])) expect_equal(pred, pred_new) - pred_new <- model_new$predict(mtcars[,-1]) + pred_new <- model_new$predict(mtcars[, -1L]) expect_equal(pred, pred_new) # check that it throws an error when there's mising columns - expect_error(predict(model, mtcars[, 3:4])) + expect_error(predict(model, mtcars[, 3L:4L])) # check that it accepts data frames even when fitting to matrices - X <- mtcars[,-1] - model_new <- lightgbm(data=as.matrix(X), label=y, - params=list(objective="regression", min_data=1), - verbose=-1) + X <- mtcars[, -1L] + model_new <- lightgbm(data = as.matrix(X), label = y, + params = list(objective = "regression", min_data = 1L), + verbose = -1L) pred <- predict(model_new, as.matrix(X)) pred_new <- predict(model_new, X) expect_equal(pred, pred_new) # verify that labels and weights can be passed as column names - X_lgb <- lgb.Dataset(mtcars[,-1]) - model <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) - pred <- predict(model, mtcars[,-1]) - - X_lgb_new <- lgb.Dataset(mtcars, label=mpg) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb <- lgb.Dataset(mtcars[, -1L]) + model <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) + pred <- predict(model, mtcars[, -1L]) + + X_lgb_new <- lgb.Dataset(mtcars, label = mpg) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - pred_new <- predict(model, mtcars[,-1]) + pred_new <- predict(model, mtcars[, -1L]) expect_equal(pred, pred_new) - X_lgb_new <- lgb.Dataset(mtcars, label="mpg") - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb_new <- 
lgb.Dataset(mtcars, label = "mpg") + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - X_lgb_new <- lgb.Dataset(mtcars, label=mpg, weight=rep(1, nrow(mtcars))) - model_new <- lgb.train(params=list(objective="regression", min_data=1), - data=X_lgb, verbose=-1) + X_lgb_new <- lgb.Dataset(mtcars, label = mpg, weight = rep(1.0, nrow(mtcars))) + model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), + data = X_lgb, verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) - model_new <- lightgbm(params=list(objective="regression", min_data=1), - data=mtcars, label=mpg, weight=rep(1, nrow(mtcars)), - verbose=-1) + model_new <- lightgbm(params = list(objective = "regression", min_data = 1L), + data = mtcars, label = mpg, weight = rep(1.0, nrow(mtcars)), + verbose = -1L) pred_new <- predict(model, mtcars) expect_equal(pred, pred_new) }) From f90304a837bca49506e93d0b03032f6d9e2ec143 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:05:31 -0400 Subject: [PATCH 3/7] attempt at solving issues with random seed reproducibility in different OSes --- R-package/tests/testthat/test_basic.R | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index f33e79ff5d19..a813c20fd544 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -50,24 +50,32 @@ CONSTANT_METRIC_VALUE <- 0.2 # sample datasets to test early stopping set.seed(708L) +data <- as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) +label <- rnorm(100L) DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) - , label = rnorm(100L) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) +label <- rnorm(50L) DVALID_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) - , label = rnorm(50L) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) +label <- sample(c(0L, 1L), size = 120L, replace = TRUE) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 120L, replace = TRUE) + data = data + , label = label ) set.seed(708L) +data <- as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) +label <- sample(c(0L, 1L), size = 37L, replace = TRUE) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 37L, replace = TRUE) + data = data + , label = label ) test_that("train and predict binary classification", { From a911c37aa023d4dbbef6b9a030a87438d1e4ea49 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:09:51 -0400 Subject: [PATCH 4/7] more linter complains --- R-package/R/lgb.Dataset.R | 16 ++++++++++++++-- R-package/tests/testthat/test_basic.R | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index a2d5cf2ed1ed..14731838b8c5 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -105,8 +105,14 @@ Dataset <- R6::R6Class( categorical_feature <- names(data)[cols_factor] data[, (categorical_feature) := lapply(.SD, factor), .SDcols = categorical_feature] private$factor_levels <- 
lapply(data[, categorical_feature, with = FALSE], levels) + encode_categ <- function(x) { + x <- as.numeric(x) + x[is.na(x)] <- 0.0 + x <- x - 1.0 + return(x) + } data[ - , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , (categorical_feature) := lapply(.SD, encode_categ) , .SDcols = categorical_feature ] } @@ -514,8 +520,14 @@ Dataset <- R6::R6Class( ) , .SDcols = categorical_feature ] + encode_categ <- function(x) { + x <- as.numeric(x) + x[is.na(x)] <- 0.0 + x <- x - 1.0 + return(x) + } data[ - , (categorical_feature) := lapply(.SD, function(x) {x <- as.numeric(x); x[is.na(x)] <- 0.0; return(x-1.0)}) + , (categorical_feature) := lapply(.SD, function(x) encode_categ) , .SDcols = categorical_feature ] } else { diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index a813c20fd544..399fbc285942 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1730,7 +1730,7 @@ test_that("lgb.train() works correctly with data frames", { X_lgb <- lgb.Dataset(mtcars[, -1L]) model_new <- lgb.train(params = list(objective = "regression", min_data = 1L), data = X_lgb, verbose = -1L) - pred <- predict(model_new, mtcars[ ,-1L]) + pred <- predict(model_new, mtcars[, -1L]) pred_new <- predict(model_new, as.matrix(mtcars[, -1L])) expect_equal(pred, pred_new) From d6671dfc2bd1a131ccdae47f1945d1624fbde0bb Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 20 Apr 2021 23:25:32 -0400 Subject: [PATCH 5/7] missing file --- R-package/man/lgb_shared_params.Rd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index e5288e0450ae..38845bed1a98 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -6,9 +6,10 @@ \arguments{ \item{callbacks}{List of callback functions that are applied at each iteration.} -\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, -may allow you to pass other types of data like \code{matrix} and then separately supply -\code{label} as a keyword argument.} +\item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}} +and \link{lightgbm}, may allow you to pass other types of data like \code{matrix} and then +separately supply \code{label} as a keyword argument. See the documentation of \link{lgb.Dataset} +for more details.} \item{early_stopping_rounds}{int. Activates early stopping. 
When this parameter is non-null, training will stop if the evaluation of any metric on any validation set From 08d8a344ad77e366932933aaa95fdefead9f3c92 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 21 Apr 2021 00:13:00 -0400 Subject: [PATCH 6/7] fix error --- R-package/R/lgb.Dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 14731838b8c5..2e1cd3b26b9c 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -527,7 +527,7 @@ Dataset <- R6::R6Class( return(x) } data[ - , (categorical_feature) := lapply(.SD, function(x) encode_categ) + , (categorical_feature) := lapply(.SD, encode_categ) , .SDcols = categorical_feature ] } else { From 9fb2ffa2a78899d0c93c6cb1b53718b1ba49c307 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 21 Apr 2021 19:05:41 -0400 Subject: [PATCH 7/7] revert corrections --- R-package/R/lgb.Dataset.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 2e1cd3b26b9c..a28b50d4591c 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -195,7 +195,7 @@ Dataset <- R6::R6Class( cnames <- colnames(private$raw_data) } - # set feature names if they don't exist + # set feature names if they not exist if (is.null(private$colnames) && !is.null(cnames)) { private$colnames <- as.character(cnames) } @@ -287,7 +287,7 @@ Dataset <- R6::R6Class( ) } else if (methods::is(private$raw_data, "dgCMatrix")) { - if (length(private$raw_data@p) > .Machine$integer.max) { + if (length(private$raw_data@p) > 2147483647L) { stop("Cannot support large CSC matrix") } # Are we using a dgCMatrix (sparsed matrix column compressed)
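
Usage sketch (not part of any patch above): assuming a build of the R package with this patch series applied, the data.frame interface it introduces would be exercised as below. This mirrors the mtcars-based test added in [PATCH 1/7]; min_data = 1L is only needed because mtcars is tiny.

library(lightgbm)
data(mtcars)
y <- mtcars$mpg
X <- mtcars[, -1L]
# character / factor columns are detected automatically and encoded as
# categorical features; no 'categorical_feature' argument is needed
X[["cyl"]] <- paste0("cyl", X[["cyl"]])
model <- lightgbm(
  data = X
  , label = y
  , params = list(objective = "regression", min_data = 1L)
  , verbose = -1L
)
# predict() accepts a data.frame directly: columns are re-ordered and factor
# levels re-encoded to match what the Dataset recorded at training time
pred <- predict(model, X[, rev(names(X))])
# when 'data' is a data.frame, the label may also be given as a column name
dtrain <- lgb.Dataset(mtcars, label = "mpg")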