[python] [R-package] refine the parameters for Dataset #2594

Merged: 57 commits, Feb 19, 2020

Commits (57)
edcf1e5
reset
guolinke Nov 26, 2019
f5e3583
fix a bug
guolinke Nov 26, 2019
da0ded8
fix test
guolinke Nov 26, 2019
63c87a1
Update c_api.h
guolinke Nov 26, 2019
a861360
support to no filter features by min_data
guolinke Nov 27, 2019
9b2eeff
add warning in reset config
guolinke Nov 27, 2019
49cf341
refine warnings for override dataset's parameter
guolinke Nov 27, 2019
5af7986
some cleans
guolinke Nov 27, 2019
e78bc21
clean code
guolinke Nov 28, 2019
1813614
clean code
guolinke Nov 28, 2019
94aca30
fixed conflict
StrikerRUS Nov 30, 2019
3fe37d5
refine C API function doxygen comments
StrikerRUS Nov 30, 2019
bb6b420
refined new param description
StrikerRUS Nov 30, 2019
472e4b9
refined doxygen comments for R API function
StrikerRUS Nov 30, 2019
85eddc5
removed stuff related to int8
StrikerRUS Nov 30, 2019
9c89a04
break long line in warning message
StrikerRUS Nov 30, 2019
ec6aab4
removed tests which results cannot be validated anymore
StrikerRUS Dec 1, 2019
d53c7e6
added test for warnings about unchangeable params
StrikerRUS Dec 1, 2019
5604f30
Merge branch 'master' into parameter-refine
StrikerRUS Dec 1, 2019
4f63b4b
write parameter from dataset to booster
guolinke Dec 3, 2019
d9778ec
consider free_raw_data.
guolinke Dec 3, 2019
d530089
fix params
guolinke Dec 4, 2019
080bd41
fix bug
guolinke Dec 4, 2019
7521efd
implementing R
guolinke Dec 4, 2019
c0cae4b
fix typo
guolinke Dec 4, 2019
b3fbb66
filter params in R
guolinke Dec 4, 2019
3d2f1d1
fix R
guolinke Dec 4, 2019
028c8fd
not min_data
guolinke Dec 4, 2019
d73e2d7
Merge remote-tracking branch 'origin/master' into parameter-refine
StrikerRUS Dec 5, 2019
6e44d98
refined tests
StrikerRUS Dec 6, 2019
f026ba8
fixed linting
StrikerRUS Dec 6, 2019
a403e3c
refine
guolinke Dec 6, 2019
fc73881
pilint
guolinke Dec 6, 2019
0d56dfe
add docstring
guolinke Dec 6, 2019
7514039
fix docstring
guolinke Dec 6, 2019
b787d3e
R lint
guolinke Dec 6, 2019
555d24f
updated description for C API function
StrikerRUS Dec 8, 2019
4b9b068
use param aliases in Python
StrikerRUS Dec 8, 2019
7428244
fixed typo
StrikerRUS Dec 8, 2019
7649d77
fixed typo
StrikerRUS Dec 8, 2019
94fac53
added more params to test
StrikerRUS Dec 8, 2019
f1a4035
removed debug print
StrikerRUS Dec 8, 2019
51eb0ff
fix dataset construct place
guolinke Dec 9, 2019
b27dcae
Merge remote-tracking branch 'origin/master' into parameter-refine
StrikerRUS Dec 17, 2019
2dbc6e2
Merge branch 'master' into parameter-refine
StrikerRUS Dec 19, 2019
d920fe5
Merge branch 'master' into parameter-refine
guolinke Jan 14, 2020
17b2f7d
Merge branch 'master' into parameter-refine
guolinke Feb 2, 2020
b74579b
fix merge bug
guolinke Feb 2, 2020
a29b17e
Update feature_histogram.hpp
guolinke Feb 3, 2020
9d9b870
Merge remote-tracking branch 'origin/master' into parameter-refine
guolinke Feb 3, 2020
82824e0
add is_sparse back
guolinke Feb 3, 2020
fb8364d
remove unused parameters
guolinke Feb 3, 2020
7680f05
fix lint
guolinke Feb 3, 2020
448781c
add data random seed
guolinke Feb 3, 2020
5df6b05
fixed conflicts
StrikerRUS Feb 8, 2020
330e3f1
update
StrikerRUS Feb 9, 2020
5b5f4e3
[R-package] centrallized Dataset parameter aliases and added tests on…
jameslamb Feb 18, 2020
Changes from all commits

79 changes: 77 additions & 2 deletions R-package/R/aliases.R
@@ -1,12 +1,86 @@
# Central location for parameter aliases.
# See https://lightgbm.readthedocs.io/en/latest/Parameters.html#core-parameters

# [description] List of respected parameter aliases specific to lgb.Dataset. Wrapped in a function to
# take advantage of lazy evaluation (so it doesn't matter what order
# R sources files during installation).
# [return] A named list, where each key is a parameter relevant to lgb.Dataset and each value is a character
# vector of corresponding aliases.
.DATASET_PARAMETERS <- function() {
return(list(
"bin_construct_sample_cnt" = c(
"bin_construct_sample_cnt"
, "subsample_for_bin"
)
, "categorical_feature" = c(
"categorical_feature"
, "cat_feature"
, "categorical_column"
, "cat_column"
)
, "seed" = c(
"seed"
, "data_random_seed"
, "feature_fraction_seed"
)
, "enable_bundle" = c(
"enable_bundle"
, "is_endable_bundle"
, "bundle"
)
, "enable_sparse" = c(
"enable_sparse"
, "is_sparse"
, "sparse"
)
, "feature_pre_filter" = "feature_pre_filter"
, "forcedbins_filename" = "forcedbins_filename"
, "group_column" = c(
"group_column"
, "group_id"
, "query_column"
, "query"
, "query_id"
)
, "header" = c(
"header"
, "has_header"
)
, "ignore_column" = c(
"ignore_column"
, "ignore_feature"
, "blacklist"
)
, "label_column" = c(
"label_column"
, "label"
)
, "max_bin" = "max_bin"
, "max_bin_by_feature" = "max_bin_by_feature"
, "pre_partition" = c(
"pre_parition"
, "is_pre_partition"
)
, "two_round" = c(
"two_round"
, "two_round_loading"
, "use_two_round_loading"
)
, "use_missing" = "use_missing"
, "weight_column" = c(
"weight_column"
, "weight"
)
, "zero_as_missing" = "zero_as_missing"
))
}

# [description] List of respected parameter aliases. Wrapped in a function to take advantage of
# lazy evaluation (so it doesn't matter what order R sources files during installation).
# [return] A named list, where each key is a main LightGBM parameter and each value is a character
# vector of corresponding aliases.
.PARAMETER_ALIASES <- function() {
return(list(
learning_params <- list(
"boosting" = c(
"boosting"
, "boost"
@@ -29,5 +103,6 @@
, "num_boost_round"
, "n_estimators"
)
))
)
return(c(learning_params, .DATASET_PARAMETERS()))
}
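
As a quick illustration (not part of the diff above), the alias table returned by .DATASET_PARAMETERS() can be used to resolve a user-supplied name to its canonical Dataset parameter; resolve_dataset_param() below is a hypothetical helper written only to show the intended lookup.

# Hypothetical helper, for illustration only: map a user-supplied parameter
# name to its canonical Dataset parameter via the alias table defined above.
resolve_dataset_param <- function(name) {
    aliases <- .DATASET_PARAMETERS()
    for (main_name in names(aliases)) {
        if (name %in% aliases[[main_name]]) {
            return(main_name)
        }
    }
    return(NA_character_)  # not a Dataset parameter
}

resolve_dataset_param("subsample_for_bin")  # returns "bin_construct_sample_cnt"
resolve_dataset_param("learning_rate")      # returns NA_character_
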
8 changes: 4 additions & 4 deletions R-package/R/lgb.Booster.R
@@ -31,25 +31,25 @@ Booster <- R6::R6Class(

# Create parameters and handle
params <- append(params, list(...))
params_str <- lgb.params2str(params)
handle <- 0.0

# Attempts to create a handle for the dataset
try({

# Check if training dataset is not null
if (!is.null(train_set)) {

# Check if training dataset is lgb.Dataset or not
if (!lgb.check.r6.class(train_set, "lgb.Dataset")) {
stop("lgb.Booster: Can only use lgb.Dataset as training data")
}

train_set_handle <- train_set$.__enclos_env__$private$get_handle()
params <- modifyList(params, train_set$get_params())
params_str <- lgb.params2str(params)
# Store booster handle
handle <- lgb.call(
"LGBM_BoosterCreate_R"
, ret = handle
, train_set$.__enclos_env__$private$get_handle()
, train_set_handle
, params_str
)

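A short sketch (not from the PR; train_matrix and train_label are hypothetical placeholders) of what the change above enables: parameters stored on the Dataset are merged into the Booster's parameters before the booster handle is created, so training respects the settings the Dataset was constructed with.

# Sketch: max_bin recorded on the Dataset is picked up by the Booster at
# construction time through the modifyList() merge shown above.
dtrain <- lgb.Dataset(train_matrix, label = train_label, params = list(max_bin = 15L))
bst <- lgb.train(
    params = list(objective = "regression")
    , data = dtrain
    , nrounds = 5L
)
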
44 changes: 35 additions & 9 deletions R-package/R/lgb.Dataset.R
@@ -530,22 +530,48 @@ Dataset <- R6::R6Class(

# Update parameters
update_params = function(params) {

# Parameter updating
if (!lgb.is.null.handle(private$handle)) {
lgb.call(
"LGBM_DatasetUpdateParam_R"
, ret = NULL
, private$handle
if (length(params) == 0L) {
return(invisible(self))
}
if (lgb.is.null.handle(private$handle)) {
private$params <- modifyList(private$params, params)
} else {
call_state <- 0L
call_state <- .Call(
"LGBM_DatasetUpdateParamChecking_R"
, lgb.params2str(private$params)
, lgb.params2str(params)
, call_state
, PACKAGE = "lib_lightgbm"
)
return(invisible(self))
call_state <- as.integer(call_state)
if (call_state != 0L) {

# raise error if raw data is freed
if (is.null(private$raw_data)) {
lgb.last_error()
}

# Overwrite params
private$params <- modifyList(private$params, params)
self$finalize()
}
}
private$params <- modifyList(private$params, params)
return(invisible(self))

},

get_params = function() {
dataset_params <- unname(unlist(.DATASET_PARAMETERS()))
ret <- list()
for (param_key in names(private$params)) {
if (param_key %in% dataset_params) {
ret[[param_key]] <- private$params[[param_key]]
}
}
return(ret)
},

# Set categorical feature parameter
set_categorical_feature = function(categorical_feature) {

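The following usage sketch (assuming the agaricus data bundled with the package) shows the intended behavior of the two methods added above: get_params() returns only Dataset-relevant parameters, and update_params() merges new values, reconstructing the Dataset from raw data when a constructed Dataset needs incompatible changes.

# Usage sketch; assumes the agaricus dataset shipped with the package.
library(lightgbm)
data(agaricus.train, package = "lightgbm")

dtrain <- lgb.Dataset(
    agaricus.train$data
    , label = agaricus.train$label
    , params = list(max_bin = 63L, is_sparse = TRUE)
)

# Returns only Dataset-relevant parameters (aliases such as is_sparse included)
dtrain$get_params()

# Merges new values; if the Dataset was already constructed and the C API
# rejects the change, it is rebuilt from raw data (or an error is raised
# when the raw data has already been freed)
dtrain$update_params(params = list(data_random_seed = 708L))
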
60 changes: 31 additions & 29 deletions R-package/R/utils.R
@@ -19,6 +19,36 @@ lgb.encode.char <- function(arr, len) {

}

lgb.last_error <- function() {
[Review comment from a collaborator]: I like this change! I'm surprised we didn't have a function like this before, actually.

# Perform text error buffering
buf_len <- 200L
act_len <- 0L
err_msg <- raw(buf_len)
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)

# Check error buffer
if (act_len > buf_len) {
buf_len <- act_len
err_msg <- raw(buf_len)
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
}

# Return error
stop("api error: ", lgb.encode.char(err_msg, act_len))
}

lgb.call <- function(fun_name, ret, ...) {
# Set call state to a zero value
call_state <- 0L
@@ -43,35 +73,7 @@ lgb.call <- function(fun_name, ret, ...) {
call_state <- as.integer(call_state)
# Check for call state value post call
if (call_state != 0L) {

# Perform text error buffering
buf_len <- 200L
act_len <- 0L
err_msg <- raw(buf_len)
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)

# Check error buffer
if (act_len > buf_len) {
buf_len <- act_len
err_msg <- raw(buf_len)
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
}

# Return error
stop("api error: ", lgb.encode.char(err_msg, act_len))

lgb.last_error()
}

return(ret)
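For context, a minimal sketch (not part of the diff; "LGBM_SomeRoutine_R" is a made-up routine name) of how lgb.call() and the new lgb.last_error() helper are meant to interact: any non-zero call_state now funnels through one place that reads the message from LGBM_GetLastError_R and raises it as an R error.

# Minimal sketch; "LGBM_SomeRoutine_R" is hypothetical and used only to show
# how errors surfaced through lgb.last_error() can be caught by callers.
result <- tryCatch(
    lgb.call("LGBM_SomeRoutine_R", ret = NULL)
    , error = function(e) {
        # the message text originates from LGBM_GetLastError_R via lgb.last_error()
        message("LightGBM call failed: ", conditionMessage(e))
        NULL
    }
)
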
77 changes: 77 additions & 0 deletions R-package/tests/testthat/test_dataset.R
@@ -126,3 +126,80 @@ test_that("Dataset$new() should throw an error if 'predictor' is provided but of
)
}, regexp = "predictor must be a", fixed = TRUE)
})

test_that("Dataset$get_params() successfully returns parameters if you passed them", {
# note that this list uses one "main" parameter (feature_pre_filter) and one that
# is an alias (is_sparse), to check that aliases are handled correctly
params <- list(
"feature_pre_filter" = TRUE
, "is_sparse" = FALSE
)
ds <- lgb.Dataset(
test_data
, label = test_label
, params = params
)
returned_params <- ds$get_params()
expect_true(methods::is(returned_params, "list"))
expect_identical(length(params), length(returned_params))
expect_identical(sort(names(params)), sort(names(returned_params)))
for (param_name in names(params)) {
expect_identical(params[[param_name]], returned_params[[param_name]])
}
})

test_that("Dataset$get_params() ignores irrelevant parameters", {
params <- list(
"feature_pre_filter" = TRUE
, "is_sparse" = FALSE
, "nonsense_parameter" = c(1.0, 2.0, 5.0)
)
ds <- lgb.Dataset(
test_data
, label = test_label
, params = params
)
returned_params <- ds$get_params()
expect_false("nonsense_parameter" %in% names(returned_params))
})

test_that("Dataset$update_parameters() does nothing for empty inputs", {
ds <- lgb.Dataset(
test_data
, label = test_label
)
initial_params <- ds$get_params()
expect_identical(initial_params, list())

# update_params() should return "self" so it can be chained
res <- ds$update_params(
params = list()
)
expect_true(lgb.is.Dataset(res))

new_params <- ds$get_params()
expect_identical(new_params, initial_params)
})

test_that("Dataset$update_params() works correctly for recognized Dataset parameters", {
ds <- lgb.Dataset(
test_data
, label = test_label
)
initial_params <- ds$get_params()
expect_identical(initial_params, list())

new_params <- list(
"data_random_seed" = 708L
, "enable_bundle" = FALSE
)
res <- ds$update_params(
params = new_params
)
expect_true(lgb.is.Dataset(res))

updated_params <- ds$get_params()
for (param_name in names(new_params)) {
expect_identical(new_params[[param_name]], updated_params[[param_name]])
}
})
7 changes: 6 additions & 1 deletion R-package/tests/testthat/test_parameters.R
@@ -44,13 +44,18 @@ test_that("Feature penalties work properly", {
expect_length(var_gain[[length(var_gain)]], 0L)
})

test_that(".PARAMETER_ALIASES() returns a named list", {
context("parameter aliases")

test_that(".PARAMETER_ALIASES() returns a named list of character vectors, where names are unique", {
param_aliases <- .PARAMETER_ALIASES()
expect_true(is.list(param_aliases))
expect_true(is.character(names(param_aliases)))
expect_true(is.character(param_aliases[["boosting"]]))
expect_true(is.character(param_aliases[["early_stopping_round"]]))
expect_true(is.character(param_aliases[["num_iterations"]]))
expect_true(length(names(param_aliases)) == length(param_aliases))
expect_true(all(sapply(param_aliases, is.character)))
expect_true(length(unique(names(param_aliases))) == length(param_aliases))
})

test_that("training should warn if you use 'dart' boosting, specified with 'boosting' or aliases", {
8 changes: 8 additions & 0 deletions docs/Parameters.rst
@@ -537,6 +537,14 @@ IO Parameters

- use this to avoid one-data-one-bin (potential over-fitting)

- ``feature_pre_filter`` :raw-html:`<a id="feature_pre_filter" title="Permalink to this parameter" href="#feature_pre_filter">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool

- set this to ``true`` to pre-filter the unsplittable features by ``min_data_in_leaf``

- as the Dataset object is initialized only once and cannot be changed afterwards, you may need to set this to ``false`` when tuning ``min_data_in_leaf``; otherwise, features are pre-filtered by the initial ``min_data_in_leaf`` value unless you reconstruct the Dataset object

- **Note**: setting this to ``false`` may slow down the training

- ``bin_construct_sample_cnt`` :raw-html:`<a id="bin_construct_sample_cnt" title="Permalink to this parameter" href="#bin_construct_sample_cnt">&#x1F517;&#xFE0E;</a>`, default = ``200000``, type = int, aliases: ``subsample_for_bin``, constraints: ``bin_construct_sample_cnt > 0``

- number of data points sampled to construct histogram bins
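To make the ``feature_pre_filter`` guidance above concrete, here is a small R sketch (train_matrix and train_label are hypothetical placeholders) of building the Dataset once with pre-filtering disabled so that min_data_in_leaf can vary across trials.

# Sketch: build the Dataset once with feature_pre_filter disabled, then reuse
# it while tuning min_data_in_leaf.
dtrain <- lgb.Dataset(
    train_matrix
    , label = train_label
    , params = list(feature_pre_filter = FALSE)
)

for (min_data in c(5L, 20L, 100L)) {
    bst <- lgb.train(
        params = list(objective = "binary", min_data_in_leaf = min_data)
        , data = dtrain
        , nrounds = 10L
    )
}
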
3 changes: 2 additions & 1 deletion include/LightGBM/bin.h
@@ -142,12 +142,13 @@ class BinMapper {
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param pre_filter
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
* \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);

/*!