diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore new file mode 100644 index 000000000000..443a918af322 --- /dev/null +++ b/R-package/.Rbuildignore @@ -0,0 +1,3 @@ +^docs$ +^_pkgdown\.yml$ +^pkgdown$ diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index c87776150d95..2f8d0c6f9fcf 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -2,9 +2,12 @@ Package: lightgbm Type: Package Title: Light Gradient Boosting Machine Version: 2.1.0 -Date: 2018-01-25 -Author: Guolin Ke -Maintainer: Guolin Ke +Date: 2018-01-27 +Authors@R: c( + person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")), + person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")), + person("Yachen", "Yan", role = c("ctb")) + ) Description: LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages: 1. Faster training speed and higher efficiency. @@ -28,10 +31,10 @@ Suggests: stringi (>= 0.5.2) Depends: R (>= 3.0), - R6 (>= 2.0) -Imports: + R6 (>= 2.0), methods, - Matrix (>= 1.1-0), + Matrix (>= 1.1-0) +Imports: data.table (>= 1.9.6), magrittr (>= 1.5), jsonlite (>= 1.0) diff --git a/R-package/R/lgb.unloader.R b/R-package/R/lgb.unloader.R index 0624e7847eec..991ba5c0ed3e 100644 --- a/R-package/R/lgb.unloader.R +++ b/R-package/R/lgb.unloader.R @@ -26,11 +26,14 @@ #' min_data = 1, #' learning_rate = 1, #' early_stopping_rounds = 10) -#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) +#' +#' # Disabled the following line as it crashes the documentation generator +#' # lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) #' rm(model, dtrain, dtest) # Not needed if wipe = TRUE #' gc() # Not needed if wipe = TRUE #' -#' library(lightgbm) +#' # Disabled the following line as it crashes the documentation generator +#' # library(lightgbm) #' # Do whatever you want again with LightGBM without object clashing #' } #' diff --git a/R-package/README.md b/R-package/README.md index f13cc576cdbc..503c3d1c1605 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -128,12 +128,12 @@ You may also read [Microsoft/LightGBM#912](https://github.com/Microsoft/LightGBM Examples -------- -Please visit [demo](demo): - -* [Basic walkthrough of wrappers](demo/basic_walkthrough.R) -* [Boosting from existing prediction](demo/boost_from_prediction.R) -* [Early Stopping](demo/early_stopping.R) -* [Cross Validation](demo/cross_validation.R) -* [Multiclass Training/Prediction](demo/multiclass.R) -* [Leaf (in)Stability](demo/leaf_stability.R) -* [Weight-Parameter Adjustment Relationship](demo/weight_param.R) +Please visit [demo](https://github.com/Microsoft/LightGBM/tree/master/R-package/demo): + +* [Basic walkthrough of wrappers](articles/basic_walkthrough.html) +* [Boosting from existing prediction](articles/boost_from_prediction.html) +* [Early Stopping](articles/early_stopping.html) +* [Cross Validation](articles/cross_validation.html) +* [Multiclass Training/Prediction](articles/multiclass.html) +* [Leaf (in)Stability](articles/leaf_stability.html) +* [Weight-Parameter Adjustment Relationship](articles/weight_param.html) diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 9eb5d45a58f1..dea564f3ec8c 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -5,6 +5,6 @@ categorical_feature_rules Categorical Feature Preparation with Rules cross_validation Cross Validation early_stopping Early Stop in training 
efficient_many_training Efficiency for Many Model Trainings -multiclass Multiclass training/prediction leaf_stability Leaf (in)Stability example +multiclass Multiclass training/prediction weight_param Weight-Parameter adjustment relationship diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index a6f6c0639b5c..b1362e7af28b 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -22,6 +22,7 @@ bst <- lightgbm(data = train$data, num_leaves = 4, learning_rate = 1, nrounds = 2, + nthread = 1, objective = "binary") # Alternatively, you can put in dense matrix, i.e. basic R-matrix @@ -31,6 +32,7 @@ bst <- lightgbm(data = as.matrix(train$data), num_leaves = 4, learning_rate = 1, nrounds = 2, + nthread = 1, objective = "binary") # You can also put in lgb.Dataset object, which stores label, data and other meta datas needed for advanced features @@ -41,6 +43,7 @@ bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2, + nthread = 1, objective = "binary") # Verbose = 0,1,2 @@ -49,6 +52,7 @@ bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2, + nthread = 1, objective = "binary", verbose = 0) @@ -57,7 +61,7 @@ bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2, - nthread = 2, + nthread = 1, objective = "binary", verbose = 1) @@ -66,7 +70,7 @@ bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2, - nthread = 2, + nthread = 1, objective = "binary", verbose = 2) @@ -109,7 +113,7 @@ bst <- lgb.train(data = dtrain, learning_rate = 1, nrounds = 2, valids = valids, - nthread = 2, + nthread = 1, objective = "binary") # We can change evaluation metrics, or use multiple evaluation metrics @@ -120,7 +124,7 @@ bst <- lgb.train(data = dtrain, nrounds = 2, valids = valids, eval = c("binary_error", "binary_logloss"), - nthread = 2, + nthread = 1, objective = "binary") # lgb.Dataset can also be saved using lgb.Dataset.save @@ -133,7 +137,7 @@ bst <- lgb.train(data = dtrain2, learning_rate = 1, nrounds = 2, valids = valids, - nthread = 2, + nthread = 1, objective = "binary") # information can be extracted from lgb.Dataset using getinfo diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index bd2b30e892c4..3429fb717b48 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -15,7 +15,7 @@ print("Start running example to start from a initial prediction") # Train lightgbm for 1 round param <- list(num_leaves = 4, learning_rate = 1, - nthread = 2, + nthread = 1, objective = "binary") bst <- lgb.train(param, dtrain, 1, valids = valids) @@ -32,4 +32,5 @@ print("This is result of boost from initial prediction") bst <- lgb.train(params = param, data = dtrain, nrounds = 5, + nthread = 1, valids = valids) diff --git a/R-package/demo/categorical_features_prepare.R b/R-package/demo/categorical_features_prepare.R index 236e88784042..576d845ae550 100644 --- a/R-package/demo/categorical_features_prepare.R +++ b/R-package/demo/categorical_features_prepare.R @@ -67,9 +67,9 @@ lgb_data <- lgb.Dataset(data = my_data, # We can now train a model model <- lgb.train(list(objective = "binary", metric = "l2", + nthread = 1, min_data = 1, learning_rate = 0.1, - min_data = 0, min_hessian = 1, max_depth = 2), lgb_data, diff --git a/R-package/demo/categorical_features_rules.R b/R-package/demo/categorical_features_rules.R index 6dd913600178..6db2ba4d7298 100644 --- 
a/R-package/demo/categorical_features_rules.R +++ b/R-package/demo/categorical_features_rules.R @@ -76,9 +76,9 @@ dtest <- lgb.Dataset(data = my_data_test, # We can now train a model model <- lgb.train(list(objective = "binary", metric = "l2", + nthread = 1, min_data = 1, learning_rate = 0.1, - min_data = 0, min_hessian = 1, max_depth = 2, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)), diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index 90a965004068..8dbb76cae445 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -8,6 +8,7 @@ dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) nrounds <- 2 param <- list(num_leaves = 4, learning_rate = 1, + nthread = 1, objective = "binary") print("Running cross validation") diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index 7e213cf478eb..26cd1378baf4 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -12,7 +12,8 @@ dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) # Note: what we are getting is margin value in prediction # You must know what you are doing param <- list(num_leaves = 4, - learning_rate = 1) + learning_rate = 1, + nthread = 1) valids <- list(eval = dtest) num_round <- 20 @@ -45,4 +46,5 @@ bst <- lgb.train(param, valids, objective = logregobj, eval = evalerror, - early_stopping_round = 3) + early_stopping_round = 3, + nthread = 1) diff --git a/R-package/demo/efficient_many_training.R b/R-package/demo/efficient_many_training.R index 6a63797e4ae8..28e22b5c02e9 100644 --- a/R-package/demo/efficient_many_training.R +++ b/R-package/demo/efficient_many_training.R @@ -8,13 +8,17 @@ # Just doing reset=TRUE will already improve things: OS reports 4.6GB. # Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip." +# 2018-01-21 example patch: use the "small" switch to make it bigger. 
+ +small <- TRUE + # Load library library(lightgbm) # Generate fictive data of size 1M x 100 set.seed(11111) -x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100) -y_data <- rnorm(n = 1000000, mean = 0, sd = 5) +x_data <- matrix(rnorm(n = ifelse(small, 1000000, 100000000), mean = 0, sd = 100), nrow = ifelse(small, 10000, 1000000), ncol = 100) +y_data <- rnorm(n = ifelse(small, 10000, 1000000), mean = 0, sd = 5) # Create lgb.Dataset for training data <- lgb.Dataset(x_data, label = y_data) @@ -24,9 +28,10 @@ data$construct() # It MUST remain constant (if not increasing very slightly) gbm <- list() -for (i in 1:1000) { - print(i) - gbm[[i]] <- lgb.train(params = list(objective = "regression"), +for (i in 1:(ifelse(small, 100, 1000))) { + cat(format(Sys.time(), "%a %b %d %Y %X"), ": ", i, "\n", sep = "") + gbm[[i]] <- lgb.train(params = list(objective = "regression", + nthread = 1), data = data, 1, reset_data = TRUE) diff --git a/R-package/demo/leaf_stability.R b/R-package/demo/leaf_stability.R index 8d7982ff4dff..6a1eed3ac76b 100644 --- a/R-package/demo/leaf_stability.R +++ b/R-package/demo/leaf_stability.R @@ -25,7 +25,8 @@ model <- lgb.train(params, learning_rate = 0.1, bagging_fraction = 0.1, bagging_freq = 1, - bagging_seed = 1) + bagging_seed = 1, + nthread = 1) # We create a data.frame with the following structure: # X = average leaf of the observation throughout all trees @@ -63,7 +64,8 @@ model2 <- lgb.train(params, 100, valids, min_data = 1, - learning_rate = 1) + learning_rate = 1, + nthread = 1) # We create the data structure, but for model2 new_data2 <- data.frame(X = rowMeans(predict(model2, @@ -98,7 +100,8 @@ model3 <- lgb.train(params, 1000, valids, min_data = 1, - learning_rate = 1) + learning_rate = 1, + nthread = 1) # We create the data structure, but for model3 new_data3 <- data.frame(X = rowMeans(predict(model3, diff --git a/R-package/demo/multiclass.R b/R-package/demo/multiclass.R index f1d14691ab89..bcca549a0f1d 100644 --- a/R-package/demo/multiclass.R +++ b/R-package/demo/multiclass.R @@ -25,7 +25,8 @@ model <- lgb.train(params, valids, min_data = 1, learning_rate = 1, - early_stopping_rounds = 10) + early_stopping_rounds = 10, + nthread = 1) # We can predict on test data, outputs a 90-length vector # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3... @@ -41,7 +42,8 @@ model <- lgb.train(list(), early_stopping_rounds = 10, objective = "multiclass", metric = "multi_error", - num_class = 3) + num_class = 3, + nthread = 1) # We can predict on test data, identical my_preds <- predict(model, test[, 1:4]) diff --git a/R-package/docs/LICENSE.html b/R-package/docs/LICENSE.html new file mode 100644 index 000000000000..b60c54a7c806 --- /dev/null +++ b/R-package/docs/LICENSE.html @@ -0,0 +1,167 @@ + + + + + + + + +License • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
The MIT License (MIT)
+
+Copyright (c) Microsoft Corporation 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/R-package/docs/articles/basic_walkthrough.html b/R-package/docs/articles/basic_walkthrough.html
new file mode 100644
index 000000000000..a42af49fe78e
--- /dev/null
+++ b/R-package/docs/articles/basic_walkthrough.html
@@ -0,0 +1,297 @@
+Basic Walkthrough • lightgbm
require(lightgbm)
+
## Loading required package: lightgbm
+
## Loading required package: R6
+
require(methods)
+
+# We load in the agaricus dataset
+# In this example, we are aiming to predict whether a mushroom is edible
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+train <- agaricus.train
+test <- agaricus.test
+
+# The loaded data is stored in a sparseMatrix, and the label is a numeric vector in {0,1}
+class(train$label)
+
## [1] "numeric"
+
class(train$data)
+
## [1] "dgCMatrix"
+## attr(,"package")
+## [1] "Matrix"
+
#--------------------Basic Training using lightgbm----------------
+# This is the basic usage of lightgbm: you can put a matrix in the data field
+# Note: we are putting in a sparse matrix here; lightgbm naturally handles sparse input
+# Use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
+print("Training lightgbm with sparseMatrix")
+
## [1] "Training lightgbm with sparseMatrix"
+
bst <- lightgbm(data = train$data,
+                label = train$label,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary")
+
## Loading required package: Matrix
+
## [1]: train's binary_logloss:0.198843 
+## [2]: train's binary_logloss:0.11168
+
# Alternatively, you can put in dense matrix, i.e. basic R-matrix
+print("Training lightgbm with Matrix")
+
## [1] "Training lightgbm with Matrix"
+
bst <- lightgbm(data = as.matrix(train$data),
+                label = train$label,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary")
+
## [1]: train's binary_logloss:0.198843 
+## [2]: train's binary_logloss:0.11168
+
+# You can also put in an lgb.Dataset object, which stores the label, data and other metadata needed for advanced features
+print("Training lightgbm with lgb.Dataset")
+
## [1] "Training lightgbm with lgb.Dataset"
+
dtrain <- lgb.Dataset(data = train$data,
+                      label = train$label)
+bst <- lightgbm(data = dtrain,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary")
+
## [1]: train's binary_logloss:0.198843 
+## [2]: train's binary_logloss:0.11168
+
# Verbose = 0,1,2
+print("Train lightgbm with verbose 0, no message")
+
## [1] "Train lightgbm with verbose 0, no message"
+
bst <- lightgbm(data = dtrain,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary",
+                verbose = 0)
+
+print("Train lightgbm with verbose 1, print evaluation metric")
+
## [1] "Train lightgbm with verbose 1, print evaluation metric"
+
bst <- lightgbm(data = dtrain,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary",
+                verbose = 1)
+
## [1]: train's binary_logloss:0.198843 
+## [2]: train's binary_logloss:0.11168
+
print("Train lightgbm with verbose 2, also print information about tree")
+
## [1] "Train lightgbm with verbose 2, also print information about tree"
+
bst <- lightgbm(data = dtrain,
+                num_leaves = 4,
+                learning_rate = 1,
+                nrounds = 2,
+                nthread = 1,
+                objective = "binary",
+                verbose = 2)
+
## [1]: train's binary_logloss:0.198843 
+## [2]: train's binary_logloss:0.11168
+
# You can also specify data as a file path to a LibSVM/TSV/CSV format input
+# Since we do not have this file with us, the following line is just for illustration
+# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
+
+#--------------------Basic prediction using lightgbm--------------
+# You can do prediction using the following line
+# You can put in Matrix, sparseMatrix, or lgb.Dataset
+pred <- predict(bst, test$data)
+err <- mean(as.numeric(pred > 0.5) != test$label)
+print(paste("test-error=", err))
+
## [1] "test-error= 0.0217256362507759"
+
#--------------------Save and load models-------------------------
+# Save model to binary local file
+lgb.save(bst, "lightgbm.model")
+
+# Load binary model to R
+bst2 <- lgb.load("lightgbm.model")
+pred2 <- predict(bst2, test$data)
+
+# pred2 should be identical to pred
+print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
+
## [1] "sum(abs(pred2-pred))= 0"
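As a hedged aside (not part of the original demo): the package also ships saveRDS.lgb.Booster()/readRDS.lgb.Booster() wrappers. Assuming they are available in this version, the same round trip can go through an RDS file, as in this sketch.

# Hedged sketch: persist the booster through an RDS file instead of the binary model file
saveRDS.lgb.Booster(bst, "lightgbm.rds")
bst3 <- readRDS.lgb.Booster("lightgbm.rds")
pred3 <- predict(bst3, test$data)
print(paste("sum(abs(pred3-pred2))=", sum(abs(pred3 - pred2))))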
+
#--------------------Advanced features ---------------------------
+# To use advanced features, we need to put data in lgb.Dataset
+dtrain <- lgb.Dataset(data = train$data, label = train$label, free_raw_data = FALSE)
+dtest <- lgb.Dataset(data = test$data, label = test$label, free_raw_data = FALSE)
+
+#--------------------Using validation set-------------------------
+# valids is a list of lgb.Dataset objects, each of them tagged with a name
+valids <- list(train = dtrain, test = dtest)
+
+# To train with valids, use lgb.train, which contains more advanced features
+# valids allows us to monitor the evaluation result on all data in the list
+print("Train lightgbm using lgb.train with valids")
+
## [1] "Train lightgbm using lgb.train with valids"
+
bst <- lgb.train(data = dtrain,
+                 num_leaves = 4,
+                 learning_rate = 1,
+                 nrounds = 2,
+                 valids = valids,
+                 nthread = 1,
+                 objective = "binary")
+
## [1]: train's binary_logloss:0.198843 test's binary_logloss:0.204992 
+## [2]: train's binary_logloss:0.11168  test's binary_logloss:0.113243
+
# We can change evaluation metrics, or use multiple evaluation metrics
+print("Train lightgbm using lgb.train with valids, watch logloss and error")
+
## [1] "Train lightgbm using lgb.train with valids, watch logloss and error"
+
bst <- lgb.train(data = dtrain,
+                 num_leaves = 4,
+                 learning_rate = 1,
+                 nrounds = 2,
+                 valids = valids,
+                 eval = c("binary_error", "binary_logloss"),
+                 nthread = 1,
+                 objective = "binary")
+
## [1]: train's binary_error:0.0304007  train's binary_logloss:0.198843 test's binary_error:0.0335196   test's binary_logloss:0.204992 
+## [2]: train's binary_error:0.0222632  train's binary_logloss:0.11168  test's binary_error:0.0217256   test's binary_logloss:0.113243
+
# lgb.Dataset can also be saved using lgb.Dataset.save
+lgb.Dataset.save(dtrain, "dtrain.buffer")
+
+# To load it in, simply call lgb.Dataset
+dtrain2 <- lgb.Dataset("dtrain.buffer")
+bst <- lgb.train(data = dtrain2,
+                 num_leaves = 4,
+                 learning_rate = 1,
+                 nrounds = 2,
+                 valids = valids,
+                 nthread = 1,
+                 objective = "binary")
+
## [1]: train's binary_logloss:0.198843 test's binary_logloss:0.204992 
+## [2]: train's binary_logloss:0.11168  test's binary_logloss:0.113243
+
# Information can be extracted from lgb.Dataset using getinfo
+label <- getinfo(dtest, "label")
+pred <- predict(bst, test$data)
+err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
+print(paste("test-error=", err))
+
## [1] "test-error= 0.0217256362507759"
diff --git a/R-package/docs/articles/boost_from_prediction.html b/R-package/docs/articles/boost_from_prediction.html
new file mode 100644
index 000000000000..5181de7c0482
--- /dev/null
+++ b/R-package/docs/articles/boost_from_prediction.html
@@ -0,0 +1,172 @@
+Boosting from existing prediction • lightgbm
require(lightgbm)
+
## Loading required package: lightgbm
+
## Loading required package: R6
+
require(methods)
+
+# Load in the agaricus dataset
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
+dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
+
+valids <- list(eval = dtest, train = dtrain)
+#--------------------Advanced features ---------------------------
+# advanced: start from an initial base prediction
+print("Start running example to start from an initial prediction")
+
## [1] "Start running example to start from an initial prediction"
+
# Train lightgbm for 1 round
+param <- list(num_leaves = 4,
+              learning_rate = 1,
+              nthread = 1,
+              objective = "binary")
+bst <- lgb.train(param, dtrain, 1, valids = valids)
+
## Loading required package: Matrix
+
## [1]: train's binary_logloss:0.198843 eval's binary_logloss:0.204992
+
+# Note: we need the margin value instead of the transformed prediction when setting init_score
+ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
+ptest  <- predict(bst, agaricus.test$data, rawscore = TRUE)
+
+# set the init_score property of dtrain and dtest
+# base margin is the base prediction we will boost from
+setinfo(dtrain, "init_score", ptrain)
+setinfo(dtest, "init_score", ptest)
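A quick sanity check (not part of the original demo), assuming getinfo() reads back the same "init_score" field that setinfo() writes:

# Hedged sketch: confirm the stored init_score matches the margins computed above
stopifnot(isTRUE(all.equal(getinfo(dtrain, "init_score"), ptrain)))
stopifnot(isTRUE(all.equal(getinfo(dtest, "init_score"), ptest)))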
+
+print("This is the result of boosting from the initial prediction")
+
## [1] "This is the result of boosting from the initial prediction"
+
bst <- lgb.train(params = param,
+                 data = dtrain,
+                 nrounds = 5,
+                 nthread = 1,
+                 valids = valids)
+
## [1]: train's binary_logloss:0.11168  eval's binary_logloss:0.113243 
+## [2]: train's binary_logloss:0.0481094    eval's binary_logloss:0.0476983 
+## [3]: train's binary_logloss:0.0279468    eval's binary_logloss:0.0234973 
+## [4]: train's binary_logloss:0.0174926    eval's binary_logloss:0.0137969 
+## [5]: train's binary_logloss:0.0624553    eval's binary_logloss:0.105701
diff --git a/R-package/docs/articles/categorical_features_prepare.html b/R-package/docs/articles/categorical_features_prepare.html
new file mode 100644
index 000000000000..d34523847cc4
--- /dev/null
+++ b/R-package/docs/articles/categorical_features_prepare.html
@@ -0,0 +1,344 @@
+Categorical Feature Preparation • lightgbm
# Here we are going to try training a model with categorical features
+
+# Load libraries
+library(data.table)
+library(lightgbm)
+
## Loading required package: R6
+
# Load data and look at the structure
+# 
+# Classes 'data.table' and 'data.frame':    4521 obs. of  17 variables:
+# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+# $ job      : chr  "unemployed" "services" "management" "management" ...
+# $ marital  : chr  "married" "married" "single" "married" ...
+# $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
+# $ default  : chr  "no" "no" "no" "no" ...
+# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+# $ housing  : chr  "no" "yes" "yes" "yes" ...
+# $ loan     : chr  "no" "yes" "no" "yes" ...
+# $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr  "oct" "may" "apr" "jun" ...
+# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr  "no" "no" "no" "no" ...
+data(bank, package = "lightgbm")
+str(bank)
+
## Classes 'data.table' and 'data.frame':   4521 obs. of  17 variables:
+##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+##  $ job      : chr  "unemployed" "services" "management" "management" ...
+##  $ marital  : chr  "married" "married" "single" "married" ...
+##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
+##  $ default  : chr  "no" "no" "no" "no" ...
+##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+##  $ housing  : chr  "no" "yes" "yes" "yes" ...
+##  $ loan     : chr  "no" "yes" "no" "yes" ...
+##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
+##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+##  $ month    : chr  "oct" "may" "apr" "jun" ...
+##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
+##  $ y        : chr  "no" "no" "no" "no" ...
+##  - attr(*, ".internal.selfref")=<externalptr>
+
# We must now transform the data to fit in LightGBM
+# For this task, we use lgb.prepare
+# The function transforms the data into a format LightGBM can fit
+# 
+# Classes 'data.table' and 'data.frame':    4521 obs. of  17 variables:
+# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+# $ job      : chr  "unemployed" "services" "management" "management" ...
+# $ marital  : chr  "married" "married" "single" "married" ...
+# $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
+# $ default  : chr  "no" "no" "no" "no" ...
+# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+# $ housing  : chr  "no" "yes" "yes" "yes" ...
+# $ loan     : chr  "no" "yes" "no" "yes" ...
+# $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr  "oct" "may" "apr" "jun" ...
+# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr  "no" "no" "no" "no" ...
+bank <- lgb.prepare(data = bank)
+str(bank)
+
## Classes 'data.table' and 'data.frame':   4521 obs. of  17 variables:
+##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+##  $ job      : num  11 8 5 5 2 5 7 10 3 8 ...
+##  $ marital  : num  2 2 3 2 2 3 2 2 2 2 ...
+##  $ education: num  1 2 3 3 2 3 3 2 3 1 ...
+##  $ default  : num  1 1 1 1 1 1 1 1 1 1 ...
+##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+##  $ housing  : num  1 2 2 2 2 1 2 2 2 2 ...
+##  $ loan     : num  1 2 1 2 1 1 1 1 1 2 ...
+##  $ contact  : num  1 1 1 3 3 1 1 1 3 1 ...
+##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+##  $ month    : num  11 9 1 7 9 4 9 9 9 1 ...
+##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+##  $ poutcome : num  4 1 1 4 4 1 2 4 4 1 ...
+##  $ y        : num  1 1 1 1 1 1 1 1 1 1 ...
+##  - attr(*, ".internal.selfref")=<externalptr>
+
+# Subtract 1 from the label because it must be between 0 and 1
+bank$y <- bank$y - 1
+
+# Data input to LightGBM must be a matrix, without the label
+my_data <- as.matrix(bank[, 1:16, with = FALSE])
+
+# Creating the LightGBM dataset with categorical features
+# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
+lgb_data <- lgb.Dataset(data = my_data,
+                        label = bank$y,
+                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
+
+# We can now train a model
+model <- lgb.train(list(objective = "binary",
+                        metric = "l2",
+                        nthread = 1,
+                        min_data = 1,
+                        learning_rate = 0.1,
+                        min_hessian = 1,
+                        max_depth = 2),
+                   lgb_data,
+                   100,
+                   valids = list(train = lgb_data))
+
## [1]: train's l2:0.217736 
+## [2]: train's l2:0.191678 
+## [3]: train's l2:0.170597 
+## [4]: train's l2:0.153499 
+## [5]: train's l2:0.139624 
+## [6]: train's l2:0.128409 
+## [7]: train's l2:0.119125 
+## [8]: train's l2:0.111578 
+## [9]: train's l2:0.10539 
+## [10]:    train's l2:0.100182 
+## [11]:    train's l2:0.0959854 
+## [12]:    train's l2:0.0925037 
+## [13]:    train's l2:0.0894105 
+## [14]:    train's l2:0.0870018 
+## [15]:    train's l2:0.0850329 
+## [16]:    train's l2:0.0832162 
+## [17]:    train's l2:0.0817969 
+## [18]:    train's l2:0.0805792 
+## [19]:    train's l2:0.0794556 
+## [20]:    train's l2:0.078615 
+## [21]:    train's l2:0.0778916 
+## [22]:    train's l2:0.0770496 
+## [23]:    train's l2:0.0764825 
+## [24]:    train's l2:0.0760338 
+## [25]:    train's l2:0.075266 
+## [26]:    train's l2:0.0748611 
+## [27]:    train's l2:0.0744663 
+## [28]:    train's l2:0.074148 
+## [29]:    train's l2:0.0738167 
+## [30]:    train's l2:0.0732869 
+## [31]:    train's l2:0.0730472 
+## [32]:    train's l2:0.0725745 
+## [33]:    train's l2:0.0721689 
+## [34]:    train's l2:0.0719837 
+## [35]:    train's l2:0.0716568 
+## [36]:    train's l2:0.0714165 
+## [37]:    train's l2:0.0712889 
+## [38]:    train's l2:0.0711586 
+## [39]:    train's l2:0.0708194 
+## [40]:    train's l2:0.0705857 
+## [41]:    train's l2:0.0702503 
+## [42]:    train's l2:0.0700202 
+## [43]:    train's l2:0.0698951 
+## [44]:    train's l2:0.0696913 
+## [45]:    train's l2:0.0695681 
+## [46]:    train's l2:0.0694261 
+## [47]:    train's l2:0.0692232 
+## [48]:    train's l2:0.0690527 
+## [49]:    train's l2:0.0689585 
+## [50]:    train's l2:0.0687556 
+## [51]:    train's l2:0.0685764 
+## [52]:    train's l2:0.0685009 
+## [53]:    train's l2:0.0683634 
+## [54]:    train's l2:0.0681724 
+## [55]:    train's l2:0.0679939 
+## [56]:    train's l2:0.0678073 
+## [57]:    train's l2:0.067689 
+## [58]:    train's l2:0.0675343 
+## [59]:    train's l2:0.0673871 
+## [60]:    train's l2:0.0672275 
+## [61]:    train's l2:0.0671656 
+## [62]:    train's l2:0.0670863 
+## [63]:    train's l2:0.0669049 
+## [64]:    train's l2:0.0667991 
+## [65]:    train's l2:0.0667414 
+## [66]:    train's l2:0.0665912 
+## [67]:    train's l2:0.0664871 
+## [68]:    train's l2:0.0663864 
+## [69]:    train's l2:0.0662678 
+## [70]:    train's l2:0.0662279 
+## [71]:    train's l2:0.0660886 
+## [72]:    train's l2:0.065937 
+## [73]:    train's l2:0.0659013 
+## [74]:    train's l2:0.0657467 
+## [75]:    train's l2:0.0655841 
+## [76]:    train's l2:0.0654689 
+## [77]:    train's l2:0.0653655 
+## [78]:    train's l2:0.0653227 
+## [79]:    train's l2:0.0651911 
+## [80]:    train's l2:0.0650897 
+## [81]:    train's l2:0.0650003 
+## [82]:    train's l2:0.0648883 
+## [83]:    train's l2:0.0647997 
+## [84]:    train's l2:0.0646982 
+## [85]:    train's l2:0.0646623 
+## [86]:    train's l2:0.0645255 
+## [87]:    train's l2:0.0644004 
+## [88]:    train's l2:0.0642746 
+## [89]:    train's l2:0.0641889 
+## [90]:    train's l2:0.0640877 
+## [91]:    train's l2:0.0638363 
+## [92]:    train's l2:0.0637582 
+## [93]:    train's l2:0.0637263 
+## [94]:    train's l2:0.0636626 
+## [95]:    train's l2:0.0635542 
+## [96]:    train's l2:0.0634245 
+## [97]:    train's l2:0.0633328 
+## [98]:    train's l2:0.063234 
+## [99]:    train's l2:0.0630126 
+## [100]:   train's l2:0.0629891
+
# Try to find split_feature: 2
+# If you find it, it means a categorical feature was used in the first tree
+lgb.dump(model, num_iteration = 1)
+
## [1] "{\"name\":\"tree\",\n\"version\":\"v2\",\n\"num_class\":1,\n\"num_tree_per_iteration\":1,\n\"label_index\":0,\n\"max_feature_idx\":15,\n\"feature_names\":[\"age\",\"job\",\"marital\",\"education\",\"default\",\"balance\",\"housing\",\"loan\",\"contact\",\"day\",\"month\",\"duration\",\"campaign\",\"pdays\",\"previous\",\"poutcome\"],\n\"tree_info\":[{\"tree_index\":0,\"num_leaves\":4,\n\"num_cat\":2,\n\"shrinkage\":0.10000000000000001,\n\"tree_structure\":{\n\"split_index\":0,\n\"split_feature\":11,\n\"split_gain\":229.80378723144531,\n\"threshold\":647.00000000000011,\n\"decision_type\":\"<=\",\n\"default_left\":true,\n\"missing_type\":\"None\",\n\"internal_value\":0,\n\"internal_count\":4521,\n\"left_child\":{\n\"split_index\":1,\n\"split_feature\":15,\n\"split_gain\":156.62019348144531,\n\"threshold\":\"3\",\n\"decision_type\":\"==\",\n\"default_left\":false,\n\"missing_type\":\"None\",\n\"internal_value\":-1.6706673067690831,\n\"internal_count\":4166,\n\"left_child\":{\n\"leaf_index\":0,\n\"leaf_value\":0.062068965517241385,\n\"leaf_count\":116\n},\n\"right_child\":{\n\"leaf_index\":2,\n\"leaf_value\":-0.17362962962962963,\n\"leaf_count\":4050\n}\n},\n\"right_child\":{\n\"split_index\":2,\n\"split_feature\":2,\n\"split_gain\":14.507294654846191,\n\"threshold\":\"2\",\n\"decision_type\":\"==\",\n\"default_left\":false,\n\"missing_type\":\"None\",\n\"internal_value\":0.0056338028169014088,\n\"internal_count\":355,\n\"left_child\":{\n\"leaf_index\":1,\n\"leaf_value\":-0.03482587064676617,\n\"leaf_count\":201\n},\n\"right_child\":{\n\"leaf_index\":3,\n\"leaf_value\":0.046753246753246755,\n\"leaf_count\":154\n}\n}\n}\n}]\n}\n"
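Rather than eyeballing the JSON string above, the sketch below (not part of the original demo) parses the dump with jsonlite, which the package already imports, and lists the split features and decision types of the first tree; the field names are assumed from the dump printed above, and "==" decisions mark categorical splits.

library(jsonlite)

# Hedged sketch: walk the first tree of the dump and collect its internal splits
dump_str <- lgb.dump(model, num_iteration = 1)
tree <- fromJSON(dump_str, simplifyVector = FALSE)$tree_info[[1]]$tree_structure

collect_splits <- function(node) {
  if (is.null(node$split_feature)) return(NULL)  # leaf node: nothing to report
  rbind(data.frame(split_feature = node$split_feature,
                   decision_type = node$decision_type,
                   stringsAsFactors = FALSE),
        collect_splits(node$left_child),
        collect_splits(node$right_child))
}

print(collect_splits(tree))  # look for split_feature 2 with decision_type "=="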
diff --git a/R-package/docs/articles/categorical_features_rules.html b/R-package/docs/articles/categorical_features_rules.html
new file mode 100644
index 000000000000..c1c3f16ecb6f
--- /dev/null
+++ b/R-package/docs/articles/categorical_features_rules.html
@@ -0,0 +1,354 @@
+Categorical Feature Preparation with Rule • lightgbm
# Here we are going to try training a model with categorical features
+
+# Load libraries
+library(data.table)
+library(lightgbm)
+
## Loading required package: R6
+
# Load data and look at the structure
+# 
+# Classes 'data.table' and 'data.frame':    4521 obs. of  17 variables:
+# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+# $ job      : chr  "unemployed" "services" "management" "management" ...
+# $ marital  : chr  "married" "married" "single" "married" ...
+# $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
+# $ default  : chr  "no" "no" "no" "no" ...
+# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+# $ housing  : chr  "no" "yes" "yes" "yes" ...
+# $ loan     : chr  "no" "yes" "no" "yes" ...
+# $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr  "oct" "may" "apr" "jun" ...
+# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr  "no" "no" "no" "no" ...
+data(bank, package = "lightgbm")
+str(bank)
+
## Classes 'data.table' and 'data.frame':   4521 obs. of  17 variables:
+##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
+##  $ job      : chr  "unemployed" "services" "management" "management" ...
+##  $ marital  : chr  "married" "married" "single" "married" ...
+##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
+##  $ default  : chr  "no" "no" "no" "no" ...
+##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
+##  $ housing  : chr  "no" "yes" "yes" "yes" ...
+##  $ loan     : chr  "no" "yes" "no" "yes" ...
+##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
+##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
+##  $ month    : chr  "oct" "may" "apr" "jun" ...
+##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
+##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
+##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
+##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
+##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
+##  $ y        : chr  "no" "no" "no" "no" ...
+##  - attr(*, ".internal.selfref")=<externalptr>
+
# We are dividing the dataset into two: one train, one validation
+bank_train <- bank[1:4000, ]
+bank_test <- bank[4001:4521, ]
+
+# We must now transform the data to fit in LightGBM
+# For this task, we use lgb.prepare_rules
+# The function transforms the data into a format LightGBM can fit
+# 
+# Classes 'data.table' and 'data.frame':    521 obs. of  17 variables:
+# $ age      : int  53 36 58 26 34 55 55 34 41 38 ...
+# $ job      : num  1 10 10 9 10 2 2 3 3 4 ...
+# $ marital  : num  1 2 1 3 3 2 2 2 1 1 ...
+# $ education: num  2 2 2 2 2 1 2 3 2 2 ...
+# $ default  : num  1 1 1 1 1 1 1 1 1 1 ...
+# $ balance  : int  26 191 -123 -147 179 1086 471 105 1588 70 ...
+# $ housing  : num  2 1 1 1 1 2 2 2 2 1 ...
+# $ loan     : num  1 1 1 1 1 1 1 1 2 1 ...
+# $ contact  : num  1 1 1 3 1 1 3 3 3 1 ...
+# $ day      : int  7 31 5 4 19 6 30 28 20 27 ...
+# $ month    : num  9 2 2 7 2 9 9 9 7 11 ...
+# $ duration : int  56 69 131 95 294 146 58 249 10 255 ...
+# $ campaign : int  1 1 2 2 3 1 2 2 8 3 ...
+# $ pdays    : int  359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
+# $ previous : int  1 0 0 0 0 2 0 0 0 1 ...
+# $ poutcome : num  1 4 4 4 4 1 4 4 4 3 ...
+# $ y        : num  1 1 1 1 1 1 1 1 1 2 ...
+bank_rules <- lgb.prepare_rules(data = bank_train)
+bank_train <- bank_rules$data
+bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
+str(bank_test)
+
## Classes 'data.table' and 'data.frame':   521 obs. of  17 variables:
+##  $ age      : int  53 36 58 26 34 55 55 34 41 38 ...
+##  $ job      : num  1 10 10 9 10 2 2 3 3 4 ...
+##  $ marital  : num  1 2 1 3 3 2 2 2 1 1 ...
+##  $ education: num  2 2 2 2 2 1 2 3 2 2 ...
+##  $ default  : num  1 1 1 1 1 1 1 1 1 1 ...
+##  $ balance  : int  26 191 -123 -147 179 1086 471 105 1588 70 ...
+##  $ housing  : num  2 1 1 1 1 2 2 2 2 1 ...
+##  $ loan     : num  1 1 1 1 1 1 1 1 2 1 ...
+##  $ contact  : num  1 1 1 3 1 1 3 3 3 1 ...
+##  $ day      : int  7 31 5 4 19 6 30 28 20 27 ...
+##  $ month    : num  9 2 2 7 2 9 9 9 7 11 ...
+##  $ duration : int  56 69 131 95 294 146 58 249 10 255 ...
+##  $ campaign : int  1 1 2 2 3 1 2 2 8 3 ...
+##  $ pdays    : int  359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
+##  $ previous : int  1 0 0 0 0 2 0 0 0 1 ...
+##  $ poutcome : num  1 4 4 4 4 1 4 4 4 3 ...
+##  $ y        : num  1 1 1 1 1 1 1 1 1 2 ...
+##  - attr(*, ".internal.selfref")=<externalptr>
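A brief aside (not in the original demo): the rules component returned by lgb.prepare_rules() can be stored and re-applied later so future data gets exactly the same category-to-code mapping; the sketch assumes the list layout (data, rules) used above.

# Hedged sketch: inspect one column's encoding rule and persist the rules for scoring time
str(bank_rules$rules[["job"]])
saveRDS(bank_rules$rules, "bank_rules.rds")
# Later, on fresh data (hypothetical `new_bank` table):
# new_bank <- lgb.prepare_rules(data = new_bank, rules = readRDS("bank_rules.rds"))$data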
+
+# Subtract 1 from the label because it must be between 0 and 1
+bank_train$y <- bank_train$y - 1
+bank_test$y <- bank_test$y - 1
+
+# Data input to LightGBM must be a matrix, without the label
+my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
+my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
+
+# Creating the LightGBM dataset with categorical features
+# The categorical features can be passed via the lgb.train parameters instead, so we do not have to repeat them for each dataset
+dtrain <- lgb.Dataset(data = my_data_train,
+                      label = bank_train$y)
+dtest <- lgb.Dataset(data = my_data_test,
+                     label = bank_test$y)
+
+# We can now train a model
+model <- lgb.train(list(objective = "binary",
+                        metric = "l2",
+                        nthread = 1,
+                        min_data = 1,
+                        learning_rate = 0.1,
+                        min_hessian = 1,
+                        max_depth = 2,
+                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)),
+                   dtrain,
+                   100,
+                   valids = list(train = dtrain, valid = dtest))
+
## [1]: train's l2:0.218547 valid's l2:0.219319 
+## [2]: train's l2:0.193199 valid's l2:0.194746 
+## [3]: train's l2:0.172641 valid's l2:0.17521 
+## [4]: train's l2:0.155786 valid's l2:0.15914 
+## [5]: train's l2:0.14213  valid's l2:0.145937 
+## [6]: train's l2:0.131005 valid's l2:0.135395 
+## [7]: train's l2:0.12174  valid's l2:0.126573 
+## [8]: train's l2:0.114305 valid's l2:0.119354 
+## [9]: train's l2:0.1081   valid's l2:0.113633 
+## [10]:    train's l2:0.102941 valid's l2:0.107969 
+## [11]:    train's l2:0.0987159    valid's l2:0.103675 
+## [12]:    train's l2:0.0952544    valid's l2:0.100598 
+## [13]:    train's l2:0.0923753    valid's l2:0.0973981 
+## [14]:    train's l2:0.0900202    valid's l2:0.0949834 
+## [15]:    train's l2:0.0879191    valid's l2:0.0931068 
+## [16]:    train's l2:0.0863022    valid's l2:0.0914335 
+## [17]:    train's l2:0.0849219    valid's l2:0.0901021 
+## [18]:    train's l2:0.0835936    valid's l2:0.0888896 
+## [19]:    train's l2:0.0826468    valid's l2:0.0878815 
+## [20]:    train's l2:0.081797 valid's l2:0.0866827 
+## [21]:    train's l2:0.0809775    valid's l2:0.0860839 
+## [22]:    train's l2:0.0803427    valid's l2:0.0854876 
+## [23]:    train's l2:0.0797951    valid's l2:0.0853016 
+## [24]:    train's l2:0.0791358    valid's l2:0.0850711 
+## [25]:    train's l2:0.0786815    valid's l2:0.0843546 
+## [26]:    train's l2:0.0782522    valid's l2:0.0841719 
+## [27]:    train's l2:0.077961 valid's l2:0.08392 
+## [28]:    train's l2:0.0776035    valid's l2:0.0835606 
+## [29]:    train's l2:0.0770722    valid's l2:0.0831014 
+## [30]:    train's l2:0.0768721    valid's l2:0.0829624 
+## [31]:    train's l2:0.0765371    valid's l2:0.0827473 
+## [32]:    train's l2:0.0763328    valid's l2:0.0826138 
+## [33]:    train's l2:0.0761409    valid's l2:0.0821803 
+## [34]:    train's l2:0.0758711    valid's l2:0.0820529 
+## [35]:    train's l2:0.0757279    valid's l2:0.0819736 
+## [36]:    train's l2:0.0755947    valid's l2:0.0818789 
+## [37]:    train's l2:0.0753168    valid's l2:0.0817608 
+## [38]:    train's l2:0.0751867    valid's l2:0.0814168 
+## [39]:    train's l2:0.0748237    valid's l2:0.0814926 
+## [40]:    train's l2:0.0745276    valid's l2:0.0810787 
+## [41]:    train's l2:0.0742475    valid's l2:0.0807388 
+## [42]:    train's l2:0.0740241    valid's l2:0.0806229 
+## [43]:    train's l2:0.0739098    valid's l2:0.0806254 
+## [44]:    train's l2:0.073664 valid's l2:0.080449 
+## [45]:    train's l2:0.0736108    valid's l2:0.0804152 
+## [46]:    train's l2:0.0733494    valid's l2:0.0804195 
+## [47]:    train's l2:0.0730964    valid's l2:0.0798332 
+## [48]:    train's l2:0.0730014    valid's l2:0.0799223 
+## [49]:    train's l2:0.0728088    valid's l2:0.0799888 
+## [50]:    train's l2:0.0725185    valid's l2:0.0799877 
+## [51]:    train's l2:0.0724482    valid's l2:0.0799518 
+## [52]:    train's l2:0.0722151    valid's l2:0.0800195 
+## [53]:    train's l2:0.0719814    valid's l2:0.079813 
+## [54]:    train's l2:0.0718549    valid's l2:0.0797692 
+## [55]:    train's l2:0.0716643    valid's l2:0.0796802 
+## [56]:    train's l2:0.0715891    valid's l2:0.0796772 
+## [57]:    train's l2:0.0713858    valid's l2:0.0797784 
+## [58]:    train's l2:0.0711216    valid's l2:0.0799199 
+## [59]:    train's l2:0.070894 valid's l2:0.0797905 
+## [60]:    train's l2:0.0705351    valid's l2:0.0794658 
+## [61]:    train's l2:0.0704668    valid's l2:0.0794671 
+## [62]:    train's l2:0.0703246    valid's l2:0.079374 
+## [63]:    train's l2:0.0701194    valid's l2:0.0789409 
+## [64]:    train's l2:0.0698633    valid's l2:0.078693 
+## [65]:    train's l2:0.0698044    valid's l2:0.0786952 
+## [66]:    train's l2:0.0696808    valid's l2:0.078649 
+## [67]:    train's l2:0.0695161    valid's l2:0.0782809 
+## [68]:    train's l2:0.0692527    valid's l2:0.0780414 
+## [69]:    train's l2:0.0687731    valid's l2:0.0774695 
+## [70]:    train's l2:0.0686329    valid's l2:0.0776007 
+## [71]:    train's l2:0.0684736    valid's l2:0.0774359 
+## [72]:    train's l2:0.0684289    valid's l2:0.0774821 
+## [73]:    train's l2:0.0682447    valid's l2:0.077642 
+## [74]:    train's l2:0.0678441    valid's l2:0.0771838 
+## [75]:    train's l2:0.067732 valid's l2:0.0771857 
+## [76]:    train's l2:0.067545 valid's l2:0.0769763 
+## [77]:    train's l2:0.0673388    valid's l2:0.0769291 
+## [78]:    train's l2:0.0672337    valid's l2:0.0768385 
+## [79]:    train's l2:0.0671938    valid's l2:0.0768484 
+## [80]:    train's l2:0.0670772    valid's l2:0.0767856 
+## [81]:    train's l2:0.0669279    valid's l2:0.0767355 
+## [82]:    train's l2:0.0668174    valid's l2:0.0768689 
+## [83]:    train's l2:0.0667102    valid's l2:0.0767268 
+## [84]:    train's l2:0.0665359    valid's l2:0.0767133 
+## [85]:    train's l2:0.0664498    valid's l2:0.0767154 
+## [86]:    train's l2:0.0661474    valid's l2:0.0763899 
+## [87]:    train's l2:0.0661185    valid's l2:0.0763743 
+## [88]:    train's l2:0.0660187    valid's l2:0.0765239 
+## [89]:    train's l2:0.065929 valid's l2:0.0764648 
+## [90]:    train's l2:0.0658001    valid's l2:0.0764113 
+## [91]:    train's l2:0.0656409    valid's l2:0.0763919 
+## [92]:    train's l2:0.0655088    valid's l2:0.0765255 
+## [93]:    train's l2:0.0652731    valid's l2:0.0762637 
+## [94]:    train's l2:0.0651455    valid's l2:0.076341 
+## [95]:    train's l2:0.0650616    valid's l2:0.0763153 
+## [96]:    train's l2:0.0649726    valid's l2:0.076439 
+## [97]:    train's l2:0.0649336    valid's l2:0.0763692 
+## [98]:    train's l2:0.0648688    valid's l2:0.0763461 
+## [99]:    train's l2:0.0647634    valid's l2:0.0763314 
+## [100]:   train's l2:0.0646546    valid's l2:0.0764209
+
# Try to find split_feature: 11
+# If you find it, it means a categorical feature was used in the first tree
+lgb.dump(model, num_iteration = 1)
+
## [1] "{\"name\":\"tree\",\n\"version\":\"v2\",\n\"num_class\":1,\n\"num_tree_per_iteration\":1,\n\"label_index\":0,\n\"max_feature_idx\":15,\n\"feature_names\":[\"age\",\"job\",\"marital\",\"education\",\"default\",\"balance\",\"housing\",\"loan\",\"contact\",\"day\",\"month\",\"duration\",\"campaign\",\"pdays\",\"previous\",\"poutcome\"],\n\"tree_info\":[{\"tree_index\":0,\"num_leaves\":4,\n\"num_cat\":0,\n\"shrinkage\":0.10000000000000001,\n\"tree_structure\":{\n\"split_index\":0,\n\"split_feature\":11,\n\"split_gain\":206.50007629394531,\n\"threshold\":645.00000000000011,\n\"decision_type\":\"<=\",\n\"default_left\":true,\n\"missing_type\":\"None\",\n\"internal_value\":0,\n\"internal_count\":4000,\n\"left_child\":{\n\"split_index\":1,\n\"split_feature\":11,\n\"split_gain\":63.834861755371094,\n\"threshold\":211.50000000000003,\n\"decision_type\":\"<=\",\n\"default_left\":true,\n\"missing_type\":\"None\",\n\"internal_value\":-1.6737127371273712,\n\"internal_count\":3690,\n\"left_child\":{\n\"leaf_index\":0,\n\"leaf_value\":-0.18818862935213754,\n\"leaf_count\":2269\n},\n\"right_child\":{\n\"leaf_index\":2,\n\"leaf_value\":-0.13413089373680506,\n\"leaf_count\":1421\n}\n},\n\"right_child\":{\n\"split_index\":2,\n\"split_feature\":8,\n\"split_gain\":7.1037116050720215,\n\"threshold\":1.5000000000000002,\n\"decision_type\":\"<=\",\n\"default_left\":true,\n\"missing_type\":\"None\",\n\"internal_value\":0.025806451612903226,\n\"internal_count\":310,\n\"left_child\":{\n\"leaf_index\":1,\n\"leaf_value\":0.024875621890547265,\n\"leaf_count\":201\n},\n\"right_child\":{\n\"leaf_index\":3,\n\"leaf_value\":-0.038532110091743121,\n\"leaf_count\":109\n}\n}\n}\n}]\n}\n"
diff --git a/R-package/docs/articles/cross_validation.html b/R-package/docs/articles/cross_validation.html
new file mode 100644
index 000000000000..532ce1106f2f
--- /dev/null
+++ b/R-package/docs/articles/cross_validation.html
@@ -0,0 +1,217 @@
+Cross Validation • lightgbm
require(lightgbm)
+
## Loading required package: lightgbm
+
## Loading required package: R6
+
# load in the agaricus dataset
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
+dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
+
+nrounds <- 2
+param <- list(num_leaves = 4,
+              learning_rate = 1,
+              nthread = 1,
+              objective = "binary")
+
+print("Running cross validation")
+
## [1] "Running cross validation"
+
# Do cross validation, this will print result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+lgb.cv(param,
+       dtrain,
+       nrounds,
+       nfold = 5,
+       eval = "binary_error")
+
## Loading required package: Matrix
+
## [1]: valid's binary_error:0.0304001+0.00491304 
+## [2]: valid's binary_error:0.0222626+0.00298825
+
## <lgb.CVBooster>
+##   Public:
+##     best_iter: -1
+##     best_score: -1
+##     boosters: list
+##     initialize: function (x) 
+##     record_evals: list
+##     reset_parameter: function (new_params)
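The CVBooster printed above keeps the fold results in record_evals; assuming the layout record_evals$valid$&lt;metric&gt;$eval holds the per-iteration means, a sketch (not part of the original demo) of reading them back:

# Hedged sketch: keep the CVBooster and extract the mean validation error per iteration
cv <- lgb.cv(param,
             dtrain,
             nrounds,
             nfold = 5,
             eval = "binary_error")
print(unlist(cv$record_evals$valid$binary_error$eval))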
+
print("Running cross validation, disable standard deviation display")
+
## [1] "Running cross validation, disable standard deviation display"
+
# do cross validation, this will print result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+lgb.cv(param,
+       dtrain,
+       nrounds,
+       nfold = 5,
+       eval = "binary_error",
+       showsd = FALSE)
+
## [1]: valid's binary_error:0.030401 
+## [2]: valid's binary_error:0.0222636
+
## <lgb.CVBooster>
+##   Public:
+##     best_iter: -1
+##     best_score: -1
+##     boosters: list
+##     initialize: function (x) 
+##     record_evals: list
+##     reset_parameter: function (new_params)
+
+# You can also do cross validation with a customized loss function
+print("Running cross validation, with customized loss function")
+
## [1] "Running cross validation, with customized loss function"
+
logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1 / (1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
+  return(list(name = "error", value = err, higher_better = FALSE))
+}
+
+# train with customized objective
+lgb.cv(params = param,
+       data = dtrain,
+       nrounds = nrounds,
+       obj = logregobj,
+       eval = evalerror,
+       nfold = 5)
+
## [1]: valid's error:0.0304002+0.00164458 
+## [2]: valid's error:0.0222625+0.00193436
+
## <lgb.CVBooster>
+##   Public:
+##     best_iter: -1
+##     best_score: -1
+##     boosters: list
+##     initialize: function (x) 
+##     record_evals: list
+##     reset_parameter: function (new_params)
diff --git a/R-package/docs/articles/dtrain.buffer b/R-package/docs/articles/dtrain.buffer
new file mode 100644
index 000000000000..836fab9305c3
Binary files /dev/null and b/R-package/docs/articles/dtrain.buffer differ
diff --git a/R-package/docs/articles/early_stopping.html b/R-package/docs/articles/early_stopping.html
new file mode 100644
index 000000000000..8460b5c93181
--- /dev/null
+++ b/R-package/docs/articles/early_stopping.html
@@ -0,0 +1,187 @@
+Early Stop in training • lightgbm
require(lightgbm)
+
## Loading required package: lightgbm
+
## Loading required package: R6
+
require(methods)
+
+# Load in the agaricus dataset
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+
+dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
+dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
+
+# Note: for a customized objective function, we leave the objective as default
+# Note: what we are getting is the margin value of the prediction
+# You must know what you are doing
+param <- list(num_leaves = 4,
+              learning_rate = 1,
+              nthread = 1)
+valids <- list(eval = dtest)
+num_round <- 20
+
+# User-defined objective function: given predictions, return the gradient and second-order gradient
+# This is log-likelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1 / (1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+
+# User-defined evaluation function: returns metric_name, result, higher_better
+# NOTE: when you use a customized loss function, the default prediction value is the margin
+# This may make the built-in evaluation metrics not function properly
+# For example, for logistic loss the prediction is the score before the logistic transformation
+# The built-in evaluation error assumes the input is after the logistic transformation
+# Keep this in mind when you customize, and you may need to write a customized evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels)
+  return(list(name = "error", value = err, higher_better = FALSE))
+}
+print("Start training with early Stopping setting")
+
## [1] "Start training with early Stopping setting"
+
bst <- lgb.train(param,
+                 dtrain,
+                 num_round,
+                 valids,
+                 objective = logregobj,
+                 eval = evalerror,
+                 early_stopping_round = 3,
+                 nthread = 1)
+
## Loading required package: Matrix
+
## [1]: eval's error:0.0335196 
+## [2]: eval's error:0.0217256 
+## [3]: eval's error:0.00558659 
+## [4]: eval's error:0 
+## [5]: eval's error:0 
+## [6]: eval's error:0.00620732 
+## [7]: eval's error:0
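Training above stopped after 7 of the 20 rounds. As a hedged follow-up (not in the original demo), assuming the booster exposes a best_iter field: because a customized objective was used, predict() returns margins, so probabilities have to be recovered with the logistic transform by hand.

# Hedged sketch: check where early stopping settled and convert margins to probabilities
print(bst$best_iter)
margins <- predict(bst, agaricus.test$data)
probs <- 1 / (1 + exp(-margins))  # manual logistic transform for the customized objective
print(head(probs))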
diff --git a/R-package/docs/articles/efficient_many_training.html b/R-package/docs/articles/efficient_many_training.html
new file mode 100644
index 000000000000..68cb2a778c4c
--- /dev/null
+++ b/R-package/docs/articles/efficient_many_training.html
@@ -0,0 +1,266 @@
+Efficiency for Many Model Trainings • lightgbm
# Efficient training means training without giving up too much RAM
+# In the case of many trainings (like 100+ models), RAM will be eaten very quickly
+# Therefore, it is essential to know a strategy to deal with such an issue
+
+# More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
+# Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
+# With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
+# Just doing reset=TRUE will already improve things: OS reports 4.6GB.
+# Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."
+
+# 2018-01-21 example patch: set the "small" switch to FALSE to run the full-size example.
+
+small <- TRUE
+
+# Load library
+library(lightgbm)
+
## Loading required package: R6
+
+# Generate fictitious data (1M x 100 when small = FALSE, 10K x 100 otherwise)
+set.seed(11111)
+x_data <- matrix(rnorm(n = ifelse(small, 1000000, 100000000), mean = 0, sd = 100), nrow = ifelse(small, 10000, 1000000), ncol = 100)
+y_data <- rnorm(n = ifelse(small, 10000, 1000000), mean = 0, sd = 5)
+
+# Create lgb.Dataset for training
+data <- lgb.Dataset(x_data, label = y_data)
+data$construct()
+
+# Loop through training many models (100 when small = TRUE, 1000 otherwise); please check your RAM in your task manager
+# It MUST remain constant (or increase only very slightly)
+gbm <- list()
+
+for (i in 1:(ifelse(small, 100, 1000))) {
+  cat(format(Sys.time(), "%a %b %d %Y %X"), ": ", i, "\n", sep = "")
+  gbm[[i]] <- lgb.train(params = list(objective = "regression",
+                                      nthread = 1),
+                        data = data,
+                        1,
+                        reset_data = TRUE)
+  gc(verbose = FALSE)
+}
+
## Tue Jan 23 2018 08:38:50 PM: 1
+## Tue Jan 23 2018 08:38:50 PM: 2
+## Tue Jan 23 2018 08:38:50 PM: 3
+## ...
+## Tue Jan 23 2018 08:38:56 PM: 99
+## Tue Jan 23 2018 08:38:56 PM: 100
+
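If even the list of trained boosters becomes too large to hold in memory, one further option is to persist each model to disk and keep only file paths in R. The sketch below is an illustration only (the file location and naming pattern are assumptions, not part of the original demo); it relies on lgb.save() and lgb.load() from the package.

+# Sketch: persist each booster instead of keeping them all in memory
+model_paths <- character(0)
+for (i in 1:10) {
+  bst <- lgb.train(params = list(objective = "regression", nthread = 1),
+                   data = data,
+                   1,
+                   reset_data = TRUE)
+  path <- file.path(tempdir(), paste0("lgb_model_", i, ".txt"))  # illustrative location
+  lgb.save(bst, path)      # write the booster to disk
+  model_paths[i] <- path   # keep only the path in RAM
+  rm(bst)
+  gc(verbose = FALSE)
+}
+# A saved booster can be reloaded later with lgb.load(model_paths[1])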
+
+ + + +
+ + + +
+ + + diff --git a/R-package/docs/articles/index.html b/R-package/docs/articles/index.html new file mode 100644 index 000000000000..7eb7f6617f9d --- /dev/null +++ b/R-package/docs/articles/index.html @@ -0,0 +1,158 @@ + + + + + + + + +Articles • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/R-package/docs/articles/leaf_stability.html b/R-package/docs/articles/leaf_stability.html new file mode 100644 index 000000000000..fb819cc1a1e3 --- /dev/null +++ b/R-package/docs/articles/leaf_stability.html @@ -0,0 +1,1415 @@ + + + + + + + +Leaf (in)Stability example • lightgbm + + + + + + +
+
+ + + +
+
+ + + + +
+
# We are going to look at how iterating for too long can make individual observation predictions unstable.
+# Obviously, we are in a controlled environment where the rules are real, so overfitting does no harm here.
+# Do not do this in a real scenario.
+
+# First, we load our libraries
+library(lightgbm)
+
## Loading required package: R6
+
library(ggplot2)
+
+# Second, we load our data
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
+
+# Third, we setup parameters and we train a model
+params <- list(objective = "regression", metric = "l2")
+valids <- list(test = dtest)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 0.1,
+                   bagging_fraction = 0.1,
+                   bagging_freq = 1,
+                   bagging_seed = 1,
+                   nthread = 1)
+
## Loading required package: Matrix
+
## [1]: test's l2:0.202293 
+## [2]: test's l2:0.164342 
+## [3]: test's l2:0.133283 
+## [4]: test's l2:0.1084 
+## [5]: test's l2:0.0877367 
+## [6]: test's l2:0.0710476 
+## [7]: test's l2:0.0575632 
+## [8]: test's l2:0.0466222 
+## [9]: test's l2:0.0379134 
+## [10]:    test's l2:0.0307555 
+## [11]:    test's l2:0.0248963 
+## [12]:    test's l2:0.0201392 
+## [13]:    test's l2:0.0163162 
+## [14]:    test's l2:0.0132696 
+## [15]:    test's l2:0.0107514 
+## [16]:    test's l2:0.00882604 
+## [17]:    test's l2:0.00724468 
+## [18]:    test's l2:0.00589563 
+## [19]:    test's l2:0.00478529 
+## [20]:    test's l2:0.00389577 
+## [21]:    test's l2:0.00317115 
+## [22]:    test's l2:0.00264432 
+## [23]:    test's l2:0.0021541 
+## [24]:    test's l2:0.00179906 
+## [25]:    test's l2:0.00150985 
+## [26]:    test's l2:0.00129508 
+## [27]:    test's l2:0.00108426 
+## [28]:    test's l2:0.000956892 
+## [29]:    test's l2:0.000856905 
+## [30]:    test's l2:0.000729517 
+## [31]:    test's l2:0.000613605 
+## [32]:    test's l2:0.000523628 
+## [33]:    test's l2:0.000449206 
+## [34]:    test's l2:0.000378496 
+## [35]:    test's l2:0.000312651 
+## [36]:    test's l2:0.000259974 
+## [37]:    test's l2:0.000222764 
+## [38]:    test's l2:0.000207567 
+## [39]:    test's l2:0.000201502 
+## [40]:    test's l2:0.000170301 
+## [41]:    test's l2:0.000155593 
+## [42]:    test's l2:0.000134624 
+## [43]:    test's l2:0.000120277 
+## [44]:    test's l2:0.000111736 
+## [45]:    test's l2:9.5131e-05 
+## [46]:    test's l2:9.03253e-05 
+## [47]:    test's l2:8.87863e-05 
+## [48]:    test's l2:8.38359e-05 
+## [49]:    test's l2:8.21905e-05 
+## [50]:    test's l2:7.20829e-05
+
# We create a data.frame with the following structure:
+# X = average leaf of the observation throughout all trees
+# Y = prediction probability (clamped to [1e-15, 1-1e-15])
+# Z = logloss
+# binned = binned quantile of average leaf
+new_data <- data.frame(X = rowMeans(predict(model,
+                                            agaricus.test$data,
+                                            predleaf = TRUE)),
+                       Y = pmin(pmax(predict(model,
+                                             agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
+new_data$binned <- .bincode(x = new_data$X,
+                            breaks = quantile(x = new_data$X,
+                                              probs = (1:9)/10),
+                            right = TRUE,
+                            include.lowest = TRUE)
+new_data$binned[is.na(new_data$binned)] <- 0
+new_data$binned <- as.factor(new_data$binned)
+
+# We can check the binned content
+table(new_data$binned)
+
## 
+##   0   1   2   3   4   5   6   7   8 
+## 292 188 164 147 159 166 180 138 177
+
# We can plot the binned content
+# On the second plot, we clearly notice that the lower the bin (i.e. the lower the average leaf value), the higher the loss
+# On the third plot, the density is smooth!
+ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
+

+
ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
+

+
ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+

+
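As a numeric complement to the plots above (a small sketch added here, not part of the original demo), the average logloss per leaf bin can also be tabulated directly.

+# Sketch: average logloss per leaf bin, the numeric counterpart of the boxplot above
+aggregate(Z ~ binned, data = new_data, FUN = mean)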
# Now, let's show with other parameters
+model2 <- lgb.train(params,
+                    dtrain,
+                    100,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    nthread = 1)
+
## [1]: test's l2:6.44165e-17 
+## [2]: test's l2:1.97215e-31 
+## [3]: test's l2:0 
+## ... 
+## [99]:    test's l2:0 
+## [100]:   test's l2:0
+
# We create the data structure, but for model2
+new_data2 <- data.frame(X = rowMeans(predict(model2,
+                                             agaricus.test$data,
+                                             predleaf = TRUE)),
+                        Y = pmin(pmax(predict(model2,
+                                              agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
+new_data2$binned <- .bincode(x = new_data2$X,
+                             breaks = quantile(x = new_data2$X,
+                                               probs = (1:9)/10),
+                             right = TRUE,
+                             include.lowest = TRUE)
+new_data2$binned[is.na(new_data2$binned)] <- 0
+new_data2$binned <- as.factor(new_data2$binned)
+
+# We can check the binned content
+table(new_data2$binned)
+
## 
+##   0   1   2   4   5   6   8 
+##  25 440 337  31 159 339 280
+
# We can plot the binned content
+# On the second plot, we clearly notice that the lower the bin (i.e. the lower the average leaf value), the higher the loss
+# On the third plot, the density is clearly not smooth! We are severely overfitting the data, but since the rules are real this is not an issue here
+# However, if the rules were not true, the loss would explode.
+ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
+

+
ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
+

+
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+

+
# Now, try with very severe overfitting
+model3 <- lgb.train(params,
+                    dtrain,
+                    1000,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    nthread = 1)
+
## [1]: test's l2:6.44165e-17 
+## [2]: test's l2:1.97215e-31 
+## [3]: test's l2:0 
+## ... 
+## [999]:   test's l2:0 
+## [1000]:  test's l2:0
+
# We create the data structure, but for model3
+new_data3 <- data.frame(X = rowMeans(predict(model3,
+                                             agaricus.test$data,
+                                             predleaf = TRUE)),
+                        Y = pmin(pmax(predict(model3,
+                                              agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
+new_data3$binned <- .bincode(x = new_data3$X,
+                             breaks = quantile(x = new_data3$X,
+                                               probs = (1:9)/10),
+                             right = TRUE,
+                             include.lowest = TRUE)
+new_data3$binned[is.na(new_data3$binned)] <- 0
+new_data3$binned <- as.factor(new_data3$binned)
+
+# We can check the binned content
+table(new_data3$binned)
+
## 
+##   0   1   2   4   5   6   8 
+##  25 440 337  31 159 339 280
+
# We can plot the binned content
+# On the third plot, the density is clearly not smooth! We are severely overfitting the data, but since the rules are real this is not an issue here.
+# However, if the rules were not true, the loss would explode. See the sudden spikes?
+ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+

+
# Compare with our second model: the difference is severe. This one is smooth.
+ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+

+
+
+ + + +
+ + + +
+ + + diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-1.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-1.png new file mode 100644 index 000000000000..092438548cd0 Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-1.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-2.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-2.png new file mode 100644 index 000000000000..db23e2739aaf Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-2.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-3.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-3.png new file mode 100644 index 000000000000..29c4695e64c2 Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-3.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-4.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-4.png new file mode 100644 index 000000000000..cfd98ba5aa33 Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-4.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-5.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-5.png new file mode 100644 index 000000000000..0360c54c0904 Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-5.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-6.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-6.png new file mode 100644 index 000000000000..6d7e4050ae0f Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-6.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-7.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-7.png new file mode 100644 index 000000000000..6d7e4050ae0f Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-7.png differ diff --git a/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-8.png b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-8.png new file mode 100644 index 000000000000..6d7e4050ae0f Binary files /dev/null and b/R-package/docs/articles/leaf_stability_files/figure-html/unnamed-chunk-1-8.png differ diff --git a/R-package/docs/articles/multiclass.html b/R-package/docs/articles/multiclass.html new file mode 100644 index 000000000000..f902314fbf77 --- /dev/null +++ b/R-package/docs/articles/multiclass.html @@ -0,0 +1,216 @@ + + + + + + + +Multiclass training/prediction • lightgbm + + + + + + +
+
+ + + +
+
+ + + + +
+
require(lightgbm)
+
## Loading required package: lightgbm
+
## Loading required package: R6
+
# We load the default iris dataset shipped with R
+data(iris)
+
+# We must convert factors to numeric labels
+# They must start from 0 to use multiclass
+# For instance: 0, 1, 2, 3, 4, 5...
+iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+
+# We cut the data set into 80% train and 20% validation
+# The last 10 samples of each class are used for validation
+
+train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
+test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+valids <- list(test = dtest)
+
+# Method 1 of training
+params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
+model <- lgb.train(params,
+                   dtrain,
+                   100,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10,
+                   nthread = 1)
+
## [1]: test's multi_error:0.0333333 
+## [2]: test's multi_error:0.0333333 
+## [3]: test's multi_error:0.0333333 
+## [4]: test's multi_error:0.0333333 
+## [5]: test's multi_error:0.0333333 
+## [6]: test's multi_error:0.0333333 
+## [7]: test's multi_error:0.0333333 
+## [8]: test's multi_error:0.0333333 
+## [9]: test's multi_error:0.0333333 
+## [10]:    test's multi_error:0.0333333 
+## [11]:    test's multi_error:0.0333333
+
# We can predict on test data; this outputs a vector of length 90 (30 observations x 3 classes)
+# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
+my_preds <- predict(model, test[, 1:4])
+
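As a rough sketch added for illustration (not part of the original demo), the flat vector can also be reshaped by hand and turned into 0-based class predictions; this assumes the ordering described above, with all class scores of one observation stored consecutively.

+# Sketch: reshape the flat vector manually and pick the most probable class
+prob_mat <- matrix(my_preds, ncol = 3, byrow = TRUE)  # 30 rows x 3 classes
+pred_class <- max.col(prob_mat) - 1                   # back to 0-based labels
+mean(pred_class != test[, 5])                         # misclassification rate on the validation set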
+# Method 2 of training, identical
+model <- lgb.train(list(),
+                   dtrain,
+                   100,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10,
+                   objective = "multiclass",
+                   metric = "multi_error",
+                   num_class = 3,
+                   nthread = 1)
+
## [1]: test's multi_error:0.0333333 
+## [2]: test's multi_error:0.0333333 
+## [3]: test's multi_error:0.0333333 
+## [4]: test's multi_error:0.0333333 
+## [5]: test's multi_error:0.0333333 
+## [6]: test's multi_error:0.0333333 
+## [7]: test's multi_error:0.0333333 
+## [8]: test's multi_error:0.0333333 
+## [9]: test's multi_error:0.0333333 
+## [10]:    test's multi_error:0.0333333 
+## [11]:    test's multi_error:0.0333333
+
# We can predict on test data, identical
+my_preds <- predict(model, test[, 1:4])
+
+# Use the reshape parameter to get a (30x3) matrix with the predictions:
+# class1 class2 class3
+#   obs1   obs1   obs1
+#   obs2   obs2   obs2
+#   ....   ....   ....
+my_preds <- predict(model, test[, 1:4], reshape = TRUE)
+
+# We can also get the predicted scores before the Sigmoid/Softmax application
+my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
+
+# Raw score predictions as matrix instead of vector
+my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+
+# We can also get the leaf index
+my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
+
+# Predict leaf index as matrix instead of vector
+my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
+
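Since the raw multiclass scores are returned before the Softmax, a small hedged sketch of applying the softmax by hand is shown below; it is only for illustration, as LightGBM already returns probabilities when rawscore is left at FALSE.

+# Sketch: turn raw multiclass scores back into probabilities with a softmax
+raw_mat <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+softmax <- function(x) exp(x - max(x)) / sum(exp(x - max(x)))  # numerically stable softmax
+prob_mat <- t(apply(raw_mat, 1, softmax))
+rowSums(prob_mat)  # each row now sums to 1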
+
+ + + +
+ + + +
+ + + diff --git a/R-package/docs/articles/weight_param.html b/R-package/docs/articles/weight_param.html new file mode 100644 index 000000000000..541d2e2010a5 --- /dev/null +++ b/R-package/docs/articles/weight_param.html @@ -0,0 +1,340 @@ + + + + + + + +Weight-Parameter adjustment relationship • lightgbm + + + + + + +
+
+ + + +
+
+ + + + +
+
# This demo R code demonstrates how hyperparameters must be adjusted
+# when scaling observation weights, so that learning remains possible
+# As with any optimizer, bad parameters can impair performance
+
+# Load library
+library(lightgbm)
+
## Loading required package: R6
+
# We will train a model under the following scenarios:
+# - Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
+# - Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
+# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
+
+# Setup small weights
+weights1 <- rep(1/100000, 6513)
+weights2 <- rep(1/100000, 1611)
+
+# Load data and create datasets
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label, weight = weights1)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label, weight = weights2)
+valids <- list(test = dtest)
+
+# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
+# It cannot learn because the regularization is too large!
+# min_sum_hessian alone is bigger than the sum of weights, so the model will never learn anything
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 10,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+
## Loading required package: Matrix
+
## [1]: test's l2:0.249665 
+## [2]: test's l2:0.482507 
+## [3]: test's l2:1.18021 
+## [4]: test's l2:2.34279 
+## [5]: test's l2:3.97022 
+## [6]: test's l2:6.06253 
+## [7]: test's l2:8.6197 
+## [8]: test's l2:11.6417 
+## [9]: test's l2:15.1286 
+## [10]:    test's l2:19.0804 
+## [11]:    test's l2:23.497
+
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(weight_loss) # Shows how poor the learning was: a straight line!
+

+
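To make the claim above concrete, it may help to compare the total weight in the training data against min_sum_hessian directly; the quick check below is a sketch added for illustration and is not part of the original demo.

+# Sketch: the total weight in the training data is far below min_sum_hessian = 10
+sum(weights1)       # 6513 * 1/100000 = 0.06513
+sum(weights1) < 10  # TRUE: no leaf can ever satisfy the required hessian sum, so no split is made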
# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
+# The adjustment simply consists of lowering min_sum_hessian from 10 to 1e-4 to match the small weights
+# Notice how it learns; there is no issue because we adjusted the regularization ourselves
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 1e-4,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+
## [1]: test's l2:0.015004 
+## [2]: test's l2:0.00733572 
+## [3]: test's l2:0.00376968 
+## [4]: test's l2:0.00269737 
+## [5]: test's l2:0.0017028 
+## [6]: test's l2:0.000754526 
+## [7]: test's l2:0.000612458 
+## [8]: test's l2:0.000433644 
+## [9]: test's l2:0.00037502 
+## [10]:    test's l2:0.000343423 
+## [11]:    test's l2:0.000317457 
+## [12]:    test's l2:6.40256e-05 
+## [13]:    test's l2:4.83092e-05 
+## [14]:    test's l2:0.000160781 
+## [15]:    test's l2:0.000102254 
+## [16]:    test's l2:6.40448e-05 
+## [17]:    test's l2:4.31159e-05 
+## [18]:    test's l2:3.45622e-05 
+## [19]:    test's l2:2.92339e-05 
+## [20]:    test's l2:2.67598e-05 
+## [21]:    test's l2:2.99818e-05 
+## [22]:    test's l2:2.8607e-05 
+## [23]:    test's l2:2.67324e-05 
+## [24]:    test's l2:2.48837e-05 
+## [25]:    test's l2:2.02386e-05 
+## [26]:    test's l2:1.93004e-05 
+## [27]:    test's l2:1.88033e-05 
+## [28]:    test's l2:1.37116e-05 
+## [29]:    test's l2:1.28615e-05 
+## [30]:    test's l2:1.45912e-05 
+## [31]:    test's l2:1.04062e-05 
+## [32]:    test's l2:8.28405e-06 
+## [33]:    test's l2:7.53409e-06 
+## [34]:    test's l2:6.80012e-06 
+## [35]:    test's l2:6.60489e-06 
+## [36]:    test's l2:6.27259e-06 
+## [37]:    test's l2:5.7322e-06 
+## [38]:    test's l2:5.22595e-06 
+## [39]:    test's l2:6.3065e-06 
+## [40]:    test's l2:5.44453e-06 
+## [41]:    test's l2:4.76803e-06 
+## [42]:    test's l2:4.57209e-06 
+## [43]:    test's l2:4.27042e-06 
+## [44]:    test's l2:4.39086e-06 
+## [45]:    test's l2:3.46275e-06 
+## [46]:    test's l2:3.18253e-06 
+## [47]:    test's l2:3.14085e-06 
+## [48]:    test's l2:3.01015e-06 
+## [49]:    test's l2:2.38811e-06 
+## [50]:    test's l2:2.22581e-06
+
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(small_weight_loss) # It learns!
+

+
# Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
+# For a clean comparison, we first wipe the environment and reload LightGBM
+lgb.unloader(wipe = TRUE)
+
+# And now, we are doing as usual
+library(lightgbm)
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
+valids <- list(test = dtest)
+
+# Setup parameters and run model...
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 10,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+
## [1]: test's l2:0.015004 
+## [2]: test's l2:0.00733572 
+## [3]: test's l2:0.00376968 
+## [4]: test's l2:0.00269737 
+## [5]: test's l2:0.00186089 
+## [6]: test's l2:0.00111794 
+## [7]: test's l2:0.00123277 
+## [8]: test's l2:0.00120069 
+## [9]: test's l2:0.00087376 
+## [10]:    test's l2:0.000777554 
+## [11]:    test's l2:0.000707629 
+## [12]:    test's l2:0.000526292 
+## [13]:    test's l2:0.000564568 
+## [14]:    test's l2:0.000520694 
+## [15]:    test's l2:0.000492974 
+## [16]:    test's l2:0.000509947 
+## [17]:    test's l2:0.000409418 
+## [18]:    test's l2:0.000436754 
+## [19]:    test's l2:0.000415116 
+## [20]:    test's l2:0.000398686 
+## [21]:    test's l2:0.000308853 
+## [22]:    test's l2:0.000333474 
+## [23]:    test's l2:0.000308576 
+## [24]:    test's l2:0.000293586 
+## [25]:    test's l2:0.000283788 
+## [26]:    test's l2:0.000228984 
+## [27]:    test's l2:0.000240609 
+## [28]:    test's l2:0.000227057 
+## [29]:    test's l2:0.000218103 
+## [30]:    test's l2:0.000176547 
+## [31]:    test's l2:0.000188157 
+## [32]:    test's l2:0.000176642 
+## [33]:    test's l2:0.000169261 
+## [34]:    test's l2:0.000137216 
+## [35]:    test's l2:0.000143953 
+## [36]:    test's l2:0.000135986 
+## [37]:    test's l2:0.000112708 
+## [38]:    test's l2:0.000117918 
+## [39]:    test's l2:0.000110817 
+## [40]:    test's l2:0.000106353 
+## [41]:    test's l2:0.000103378 
+## [42]:    test's l2:8.72045e-05 
+## [43]:    test's l2:9.12448e-05 
+## [44]:    test's l2:8.61785e-05 
+## [45]:    test's l2:8.30755e-05 
+## [46]:    test's l2:7.0557e-05 
+## [47]:    test's l2:7.39947e-05 
+## [48]:    test's l2:7.09341e-05 
+## [49]:    test's l2:6.19029e-05 
+## [50]:    test's l2:6.39761e-05
+
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(large_weight_loss) # It learns!
+

+
# Want to compare the two learning curves? They both converge.
+plot(small_weight_loss, large_weight_loss)
+curve(1*x, from = 0, to = 0.02, add = TRUE)
+

+
+
+ + + +
+ + + +
+ + + diff --git a/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-1.png b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-1.png new file mode 100644 index 000000000000..731a120e8f47 Binary files /dev/null and b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-1.png differ diff --git a/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-2.png b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-2.png new file mode 100644 index 000000000000..23d7ea3128f0 Binary files /dev/null and b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-2.png differ diff --git a/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-3.png b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-3.png new file mode 100644 index 000000000000..26c5e413145b Binary files /dev/null and b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-3.png differ diff --git a/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-4.png b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-4.png new file mode 100644 index 000000000000..869fbb791846 Binary files /dev/null and b/R-package/docs/articles/weight_param_files/figure-html/unnamed-chunk-1-4.png differ diff --git a/R-package/docs/authors.html b/R-package/docs/authors.html new file mode 100644 index 000000000000..06e4c4b5e661 --- /dev/null +++ b/R-package/docs/authors.html @@ -0,0 +1,158 @@ + + + + + + + + +Authors • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/R-package/docs/index.html b/R-package/docs/index.html new file mode 100644 index 000000000000..4045c939b64e --- /dev/null +++ b/R-package/docs/index.html @@ -0,0 +1,357 @@
+Light Gradient Boosting Machine • lightgbm

+LightGBM, Light Gradient Boosting Machine

+

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

  • Faster training speed and higher efficiency
  • Lower memory usage
  • Better accuracy
  • Parallel and GPU learning supported
  • Capable of handling large-scale data

For more details, please refer to Features.

+

Comparison experiments on public datasets show that LightGBM can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. What’s more, the parallel experiments show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings.

+
+

+News

+

08/15/2017 : Optimal split for categorical features.

+

07/13/2017 : Gitter is available.

+

06/20/2017 : Python-package is on PyPI now.

+

06/09/2017 : LightGBM Slack team is available.

+

05/03/2017 : LightGBM v2 stable release.

+

04/10/2017 : LightGBM supports GPU-accelerated tree learning now. Please read our GPU Tutorial and Performance Comparison.

+

02/20/2017 : Update to LightGBM v2.

+

02/12/2017: LightGBM v1 stable release.

+

01/08/2017 : Release R-package beta version, welcome to have a try and provide feedback.

+

12/05/2016 : Categorical Features as input directly (without one-hot coding).

+

12/02/2016 : Release Python-package beta version, welcome to have a try and provide feedback.

+

More detailed update logs : Key Events.

+
+
+

+External (unofficial) Repositories

+

Julia Package: https://github.com/Allardvm/LightGBM.jl

+

JPMML: https://github.com/jpmml/jpmml-lightgbm

+
+
+

+Get Started and Documentation

+

Install by following the guide for the command line program, Python-package or R-package. Then please see the Quick Start guide.

+

Our primary documentation is at https://lightgbm.readthedocs.io/ and is generated from this repository.

+

Next you may want to read:

+ +

Documentation for contributors:

+ +
+
+

+Support

+ +
+
+

+How to Contribute

+

LightGBM has been developed and used by many active community members. Your help is very valuable to make it better for everyone.

+
  • Check out call for contributions to see what can be improved, or open an issue if you want something.
  • Contribute to the tests to make it more reliable.
  • Contribute to the documents to make it clearer for everyone.
  • Contribute to the examples to share your experience with other users.
  • Open an issue if you meet problems during development.
+
+
+

+Microsoft Open Source Code of Conduct

+

This project has adopted the Microsoft Open Source Code of Conduct. For more information see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.

+
+
+

+Reference Paper

+

Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. LightGBM: A Highly Efficient Gradient Boosting Decision Tree. In Advances in Neural Information Processing Systems (NIPS), pp. 3149-3157. 2017.

+
+
+

+Installation

+
+

+Preparation

+

You need to install git and CMake first.

+

Note: 32-bit R/Rtools is not supported.

+
+

+Windows Preparation

+

Installing Rtools is mandatory, and only the 64-bit version is supported. You need to add the Rtools MinGW64 folder to PATH if this was not done automatically during installation (a session-level sketch follows).
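As an illustration only, one way to make the toolchain visible to the current R session is to prepend the folder to PATH from R. The folder below is an assumption based on a default Rtools layout and may differ on your machine:

# Hypothetical example: prepend the Rtools MinGW64 folder to PATH for this R session.
# Adjust "C:/Rtools/mingw_64/bin" to wherever Rtools is actually installed.
Sys.setenv(PATH = paste("C:/Rtools/mingw_64/bin", Sys.getenv("PATH"), sep = ";"))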

+

On Windows, the default compiler is Visual Studio (or MSBuild), with an automatic fallback to Rtools or any available MinGW64 (x86_64-posix-seh) toolchain; this means that if you have only Rtools and CMake, it will still compile fine.

+

To force the usage of Rtools / MinGW, you can set use_mingw to TRUE in R-package/src/install.libs.R.
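A minimal sketch of that edit (the exact position of the flag in install.libs.R may vary between versions):

# In R-package/src/install.libs.R: switch the compiler flag before installing
use_mingw <- TRUE   # force Rtools / MinGW64 instead of Visual Studio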

+

Users who want to install with GPU support or to choose a specific compiler should check the end of this document for installation using a helper package (Laurae2/lgbdl).

+

Warning for Windows users: Visual Studio is recommended for its better multi-threading efficiency on many-core systems. For very simple systems (dual-core computers or worse), MinGW64 is recommended for maximum performance. If you do not know what to choose, use Visual Studio, the default compiler. Do not use MinGW on Windows on many-core systems: it may be up to 10x slower than Visual Studio.

+
+
+

+macOS Preparation

+

gcc with OpenMP support must be installed first. Refer to Installation-Guide for installing gcc with OpenMP support.

+
+
+
+

+Install

+

Install LightGBM R-package with the following command:

+
git clone --recursive https://github.com/Microsoft/LightGBM
+cd LightGBM/R-package
+# export CXX=g++-7 CC=gcc-7 # for macOS
+R CMD INSTALL --build . --no-multiarch
+

Or build a self-contained R package which can be installed afterwards:

+
git clone --recursive https://github.com/Microsoft/LightGBM
+cd LightGBM/R-package
+Rscript build_package.R
+# export CXX=g++-7 CC=gcc-7 # for macOS
+R CMD INSTALL lightgbm_2.0.4.tar.gz --no-multiarch
+

Note: to build with Visual Studio/MSBuild on Windows, you should use Windows CMD or PowerShell.

+

Windows users may need to run with administrator rights (either R or the command prompt, depending on the way you are installing this package). Linux users might require the appropriate user write permissions for packages.

+

Set use_gpu to TRUE in R-package/src/install.libs.R to enable the build with GPU support (a sketch follows). You will need to install Boost and OpenCL first: installation details can be found in Installation-Guide.
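Analogously to the MinGW flag above, a minimal sketch of the GPU switch (the exact line in install.libs.R may differ between versions):

# In R-package/src/install.libs.R: enable the GPU build (requires Boost and OpenCL)
use_gpu <- TRUE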

+

You can also install directly from R using the repository with devtools:

+
library(devtools)
+options(devtools.install.args = "--no-multiarch") # if you have 64-bit R only, you can skip this
+install_github("Microsoft/LightGBM", subdir = "R-package")
+

If you are using a precompiled dll/lib locally, you can move the dll/lib into the LightGBM root folder, modify the 2nd line of LightGBM/R-package/src/install.libs.R (change use_precompile <- FALSE to use_precompile <- TRUE), and install the R-package as usual.

+

When your package installation is done, you can check quickly if your LightGBM R package is working by running the following:

+
library(lightgbm)
+data(agaricus.train, package='lightgbm')
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label=train$label)
+params <- list(objective="regression", metric="l2")
+model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
+
+
+
+

+Installation with Precompiled dll/lib from R Using GitHub

+

You can install LightGBM R-package from GitHub with devtools thanks to a helper package for LightGBM.

+
+

+Prerequisites

+

You will need:

+
  • Precompiled LightGBM dll/lib
  • MinGW / Visual Studio / gcc (depending on your OS and your needs) with make in PATH environment variable
  • git in PATH environment variable
  • CMake in PATH environment variable
  • lgbdl R-package, which can be installed using devtools::install_github("Laurae2/lgbdl")
  • Rtools if using Windows

In addition, if you are using a Visual Studio precompiled DLL and do not have Visual Studio installed (if you do have it installed, ignore the warning below):

+
  • Visual Studio 2015/2017 precompiled DLL: download and install Visual Studio Runtime for 2015/2017 (you will get an error about MSVCP140.dll missing otherwise)

Once you have all of this set up, you can use lgb.dl from the lgbdl package to install LightGBM from the repository.

+

For instance, you can install the R package from the LightGBM master commit on GitHub with Visual Studio using the following from R:

+
lgb.dl(commit = "master",
+       compiler = "vs",
+       repo = "https://github.com/Microsoft/LightGBM")
+

You may also install using a precompiled dll/lib using the following from R:

+
lgb.dl(commit = "master",
+       libdll = "C:\\LightGBM\\windows\\x64\\DLL\\lib_lightgbm.dll", # YOUR PRECOMPILED DLL
+       repo = "https://github.com/Microsoft/LightGBM")
+

You may also install LightGBM online with GPU support (using Visual Studio as an example here) with the following from R:

+
lgb.dl(commit = "master",
+       compiler = "vs", # Remove this for MinGW + GPU installation
+       repo = "https://github.com/Microsoft/LightGBM",
+       use_gpu = TRUE)
+

For more details about options, please check Laurae2/lgbdl R-package.

+

You may also read Microsoft/LightGBM#912 for a visual example for LightGBM installation in Windows with Visual Studio.

+
+
+ + + diff --git a/R-package/docs/jquery.sticky-kit.min.js b/R-package/docs/jquery.sticky-kit.min.js new file mode 100644 index 000000000000..e2a3c6de9e8e --- /dev/null +++ b/R-package/docs/jquery.sticky-kit.min.js @@ -0,0 +1,9 @@ +/* + Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net +*/ +(function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); +if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, +u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), +a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", +y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n + + + + + diff --git a/R-package/docs/pkgdown.css b/R-package/docs/pkgdown.css new file mode 100644 index 000000000000..209ce57febdc --- /dev/null +++ b/R-package/docs/pkgdown.css @@ -0,0 +1,163 @@ +/* Sticker footer */ +body > .container { + display: flex; + padding-top: 60px; + min-height: calc(100vh); + flex-direction: column; +} + +body > .container .row { + flex: 1; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +img { + max-width: 100%; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + margin-left: -30px; + display:inline-block; + width: 30px; + height: 30px; + visibility: hidden; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +.hasAnchor:hover a.anchor { + visibility: visible; +} + +@media (max-width: 767px) { + .hasAnchor:hover a.anchor { + visibility: hidden; + } +} + + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, .contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -60px; +} + +/* Static header placement on mobile devices */ +@media (max-width: 767px) { + .navbar-fixed-top { + position: absolute; + } + .navbar { + padding: 0; + } +} + + +/* Sidebar --------------------------*/ + +#sidebar { + margin-top: 30px; +} +#sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#sidebar h2:first-child { + 
margin-top: 0; +} + +#sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} +.ref-index h2 {font-size: 20px;} + +.ref-index td {vertical-align: top;} +.ref-index .alias {width: 40%;} +.ref-index .title {width: 60%;} + +.ref-index .alias {width: 40%;} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top;} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre { + word-wrap: normal; + word-break: normal; + border: 1px solid #eee; +} + +pre, code { + background-color: #f8f8f8; + color: #333; +} + +pre .img { + margin: 5px 0; +} + +pre .img img { + background-color: #fff; + display: block; + height: auto; +} + +code a, pre a { + color: #375f84; +} + +.fl {color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.message { color: black; font-weight: bolder;} +.error { color: orange; font-weight: bolder;} +.warning { color: #6A0366; font-weight: bolder;} + diff --git a/R-package/docs/pkgdown.js b/R-package/docs/pkgdown.js new file mode 100644 index 000000000000..4b8171328904 --- /dev/null +++ b/R-package/docs/pkgdown.js @@ -0,0 +1,45 @@ +$(function() { + $("#sidebar").stick_in_parent({offset_top: 40}); + $('body').scrollspy({ + target: '#sidebar', + offset: 60 + }); + + var cur_path = paths(location.pathname); + $("#navbar ul li a").each(function(index, value) { + if (value.text == "Home") + return; + if (value.getAttribute("href") === "#") + return; + + var path = paths(value.pathname); + if (is_prefix(cur_path, path)) { + // Add class to parent
  • , and enclosing
  • if in dropdown + var menu_anchor = $(value); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); +}); + +function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); +} + +function is_prefix(needle, haystack) { + if (needle.length > haystack.lengh) + return(false); + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(false); + } + + return(true); +} diff --git a/R-package/docs/reference/agaricus.test.html b/R-package/docs/reference/agaricus.test.html new file mode 100644 index 000000000000..ae95ec35fd33 --- /dev/null +++ b/R-package/docs/reference/agaricus.test.html @@ -0,0 +1,180 @@ + + + + + + + + +Test part from Mushroom Data Set — agaricus.test • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    This data set is originally from the Mushroom data set, +UCI Machine Learning Repository.

    + + +
    data(agaricus.test)
    + +

    Format

    + +

    A list containing a label vector, and a dgCMatrix object with 1611 +rows and 126 variables

    + +

    Details

    + +

    This data set includes the following fields:

      +
    • label the label for each record

    • +
    • data a sparse Matrix of dgCMatrix class, with 126 columns.

    • +
    + +

    References

    + +

    https://archive.ics.uci.edu/ml/datasets/Mushroom

    +

    Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +School of Information and Computer Science.

    + + +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/agaricus.train.html b/R-package/docs/reference/agaricus.train.html new file mode 100644 index 000000000000..23246709cbf1 --- /dev/null +++ b/R-package/docs/reference/agaricus.train.html @@ -0,0 +1,180 @@ + + + + + + + + +Training part from Mushroom Data Set — agaricus.train • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    This data set is originally from the Mushroom data set, +UCI Machine Learning Repository.

    + + +
    data(agaricus.train)
    + +

    Format

    + +

    A list containing a label vector, and a dgCMatrix object with 6513 +rows and 127 variables

    + +

    Details

    + +

    This data set includes the following fields:

      +
    • label the label for each record

    • +
    • data a sparse Matrix of dgCMatrix class, with 126 columns.

    • +
    + +

    References

    + +

    https://archive.ics.uci.edu/ml/datasets/Mushroom

    +

    Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +School of Information and Computer Science.

    + + +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/bank.html b/R-package/docs/reference/bank.html new file mode 100644 index 000000000000..803cd87a7dcf --- /dev/null +++ b/R-package/docs/reference/bank.html @@ -0,0 +1,176 @@ + + + + + + + + +Bank Marketing Data Set — bank • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    This data set is originally from the Bank Marketing data set, +UCI Machine Learning Repository.

    + + +
    data(bank)
    + +

    Format

    + +

    A data.table with 4521 rows and 17 variables

    + +

    Details

    + +

    It contains only the following: bank.csv, with 10% of the examples randomly selected from the full data set (an older version of this data set with fewer inputs).

    + +

    References

    + +

    http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

    +

    S. Moro, P. Cortez and P. Rita. (2014) +A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems

    + + +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/data.bin b/R-package/docs/reference/data.bin new file mode 100644 index 000000000000..836fab9305c3 Binary files /dev/null and b/R-package/docs/reference/data.bin differ diff --git a/R-package/docs/reference/dim.html b/R-package/docs/reference/dim.html new file mode 100644 index 000000000000..0f85511b01ec --- /dev/null +++ b/R-package/docs/reference/dim.html @@ -0,0 +1,194 @@ + + + + + + + + +Dimensions of an lgb.Dataset — dim.lgb.Dataset • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Returns a vector of numbers of rows and of columns in an lgb.Dataset.

    + + +
    # S3 method for lgb.Dataset
    +dim(x, ...)
    + +

    Arguments

    + + + + + + + + + + +
    x

    Object of class lgb.Dataset

    ...

    other parameters

    + +

    Value

    + +

    a vector of numbers of rows and of columns

    + +

    Details

    + +

    Note: since nrow and ncol internally use dim, they can also +be directly used with an lgb.Dataset object.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) + +stopifnot(nrow(dtrain) == nrow(train$data)) +stopifnot(ncol(dtrain) == ncol(train$data)) +stopifnot(all(dim(dtrain) == dim(train$data)))
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/dimnames.lgb.Dataset.html b/R-package/docs/reference/dimnames.lgb.Dataset.html new file mode 100644 index 000000000000..ea727b454a88 --- /dev/null +++ b/R-package/docs/reference/dimnames.lgb.Dataset.html @@ -0,0 +1,348 @@ + + + + + + + + +Handling of column names of <code>lgb.Dataset</code> — dimnames.lgb.Dataset • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Only column names are supported for lgb.Dataset: setting row names has no effect, and the returned row names are NULL.

    + + +
    # S3 method for lgb.Dataset
    +dimnames(x)
    +
    +# S3 method for lgb.Dataset
    +dimnames(x) <- value
    + +

    Arguments

    + + + + + + + + + + +
    x

    object of class lgb.Dataset

    value

    a list of two elements: the first one is ignored +and the second one is column names

    + +

    Details

    + +

    Generic dimnames methods are used by colnames. +Since row names are irrelevant, it is recommended to use colnames directly.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.construct(dtrain)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    dimnames(dtrain)
    #> [[1]] +#> NULL +#> +#> [[2]] +#> [1] "cap-shape=bell" "cap-shape=conical" +#> [3] "cap-shape=convex" "cap-shape=flat" +#> [5] "cap-shape=knobbed" "cap-shape=sunken" +#> [7] "cap-surface=fibrous" "cap-surface=grooves" +#> [9] "cap-surface=scaly" "cap-surface=smooth" +#> [11] "cap-color=brown" "cap-color=buff" +#> [13] "cap-color=cinnamon" "cap-color=gray" +#> [15] "cap-color=green" "cap-color=pink" +#> [17] "cap-color=purple" "cap-color=red" +#> [19] "cap-color=white" "cap-color=yellow" +#> [21] "bruises?=bruises" "bruises?=no" +#> [23] "odor=almond" "odor=anise" +#> [25] "odor=creosote" "odor=fishy" +#> [27] "odor=foul" "odor=musty" +#> [29] "odor=none" "odor=pungent" +#> [31] "odor=spicy" "gill-attachment=attached" +#> [33] "gill-attachment=descending" "gill-attachment=free" +#> [35] "gill-attachment=notched" "gill-spacing=close" +#> [37] "gill-spacing=crowded" "gill-spacing=distant" +#> [39] "gill-size=broad" "gill-size=narrow" +#> [41] "gill-color=black" "gill-color=brown" +#> [43] "gill-color=buff" "gill-color=chocolate" +#> [45] "gill-color=gray" "gill-color=green" +#> [47] "gill-color=orange" "gill-color=pink" +#> [49] "gill-color=purple" "gill-color=red" +#> [51] "gill-color=white" "gill-color=yellow" +#> [53] "stalk-shape=enlarging" "stalk-shape=tapering" +#> [55] "stalk-root=bulbous" "stalk-root=club" +#> [57] "stalk-root=cup" "stalk-root=equal" +#> [59] "stalk-root=rhizomorphs" "stalk-root=rooted" +#> [61] "stalk-root=missing" "stalk-surface-above-ring=fibrous" +#> [63] "stalk-surface-above-ring=scaly" "stalk-surface-above-ring=silky" +#> [65] "stalk-surface-above-ring=smooth" "stalk-surface-below-ring=fibrous" +#> [67] "stalk-surface-below-ring=scaly" "stalk-surface-below-ring=silky" +#> [69] "stalk-surface-below-ring=smooth" "stalk-color-above-ring=brown" +#> [71] "stalk-color-above-ring=buff" "stalk-color-above-ring=cinnamon" +#> [73] "stalk-color-above-ring=gray" "stalk-color-above-ring=orange" +#> [75] "stalk-color-above-ring=pink" "stalk-color-above-ring=red" +#> [77] "stalk-color-above-ring=white" "stalk-color-above-ring=yellow" +#> [79] "stalk-color-below-ring=brown" "stalk-color-below-ring=buff" +#> [81] "stalk-color-below-ring=cinnamon" "stalk-color-below-ring=gray" +#> [83] "stalk-color-below-ring=orange" "stalk-color-below-ring=pink" +#> [85] "stalk-color-below-ring=red" "stalk-color-below-ring=white" +#> [87] "stalk-color-below-ring=yellow" "veil-type=partial" +#> [89] "veil-type=universal" "veil-color=brown" +#> [91] "veil-color=orange" "veil-color=white" +#> [93] "veil-color=yellow" "ring-number=none" +#> [95] "ring-number=one" "ring-number=two" +#> [97] "ring-type=cobwebby" "ring-type=evanescent" +#> [99] "ring-type=flaring" "ring-type=large" +#> [101] "ring-type=none" "ring-type=pendant" +#> [103] "ring-type=sheathing" "ring-type=zone" +#> [105] "spore-print-color=black" "spore-print-color=brown" +#> [107] "spore-print-color=buff" "spore-print-color=chocolate" +#> [109] "spore-print-color=green" "spore-print-color=orange" +#> [111] "spore-print-color=purple" "spore-print-color=white" +#> [113] "spore-print-color=yellow" "population=abundant" +#> [115] "population=clustered" "population=numerous" +#> [117] "population=scattered" "population=several" +#> [119] "population=solitary" "habitat=grasses" +#> [121] "habitat=leaves" "habitat=meadows" +#> [123] "habitat=paths" "habitat=urban" +#> [125] "habitat=waste" "habitat=woods" +#>
    colnames(dtrain)
    #> [1] "cap-shape=bell" "cap-shape=conical" +#> [3] "cap-shape=convex" "cap-shape=flat" +#> [5] "cap-shape=knobbed" "cap-shape=sunken" +#> [7] "cap-surface=fibrous" "cap-surface=grooves" +#> [9] "cap-surface=scaly" "cap-surface=smooth" +#> [11] "cap-color=brown" "cap-color=buff" +#> [13] "cap-color=cinnamon" "cap-color=gray" +#> [15] "cap-color=green" "cap-color=pink" +#> [17] "cap-color=purple" "cap-color=red" +#> [19] "cap-color=white" "cap-color=yellow" +#> [21] "bruises?=bruises" "bruises?=no" +#> [23] "odor=almond" "odor=anise" +#> [25] "odor=creosote" "odor=fishy" +#> [27] "odor=foul" "odor=musty" +#> [29] "odor=none" "odor=pungent" +#> [31] "odor=spicy" "gill-attachment=attached" +#> [33] "gill-attachment=descending" "gill-attachment=free" +#> [35] "gill-attachment=notched" "gill-spacing=close" +#> [37] "gill-spacing=crowded" "gill-spacing=distant" +#> [39] "gill-size=broad" "gill-size=narrow" +#> [41] "gill-color=black" "gill-color=brown" +#> [43] "gill-color=buff" "gill-color=chocolate" +#> [45] "gill-color=gray" "gill-color=green" +#> [47] "gill-color=orange" "gill-color=pink" +#> [49] "gill-color=purple" "gill-color=red" +#> [51] "gill-color=white" "gill-color=yellow" +#> [53] "stalk-shape=enlarging" "stalk-shape=tapering" +#> [55] "stalk-root=bulbous" "stalk-root=club" +#> [57] "stalk-root=cup" "stalk-root=equal" +#> [59] "stalk-root=rhizomorphs" "stalk-root=rooted" +#> [61] "stalk-root=missing" "stalk-surface-above-ring=fibrous" +#> [63] "stalk-surface-above-ring=scaly" "stalk-surface-above-ring=silky" +#> [65] "stalk-surface-above-ring=smooth" "stalk-surface-below-ring=fibrous" +#> [67] "stalk-surface-below-ring=scaly" "stalk-surface-below-ring=silky" +#> [69] "stalk-surface-below-ring=smooth" "stalk-color-above-ring=brown" +#> [71] "stalk-color-above-ring=buff" "stalk-color-above-ring=cinnamon" +#> [73] "stalk-color-above-ring=gray" "stalk-color-above-ring=orange" +#> [75] "stalk-color-above-ring=pink" "stalk-color-above-ring=red" +#> [77] "stalk-color-above-ring=white" "stalk-color-above-ring=yellow" +#> [79] "stalk-color-below-ring=brown" "stalk-color-below-ring=buff" +#> [81] "stalk-color-below-ring=cinnamon" "stalk-color-below-ring=gray" +#> [83] "stalk-color-below-ring=orange" "stalk-color-below-ring=pink" +#> [85] "stalk-color-below-ring=red" "stalk-color-below-ring=white" +#> [87] "stalk-color-below-ring=yellow" "veil-type=partial" +#> [89] "veil-type=universal" "veil-color=brown" +#> [91] "veil-color=orange" "veil-color=white" +#> [93] "veil-color=yellow" "ring-number=none" +#> [95] "ring-number=one" "ring-number=two" +#> [97] "ring-type=cobwebby" "ring-type=evanescent" +#> [99] "ring-type=flaring" "ring-type=large" +#> [101] "ring-type=none" "ring-type=pendant" +#> [103] "ring-type=sheathing" "ring-type=zone" +#> [105] "spore-print-color=black" "spore-print-color=brown" +#> [107] "spore-print-color=buff" "spore-print-color=chocolate" +#> [109] "spore-print-color=green" "spore-print-color=orange" +#> [111] "spore-print-color=purple" "spore-print-color=white" +#> [113] "spore-print-color=yellow" "population=abundant" +#> [115] "population=clustered" "population=numerous" +#> [117] "population=scattered" "population=several" +#> [119] "population=solitary" "habitat=grasses" +#> [121] "habitat=leaves" "habitat=meadows" +#> [123] "habitat=paths" "habitat=urban" +#> [125] "habitat=waste" "habitat=woods"
    colnames(dtrain) <- make.names(1:ncol(train$data)) +print(dtrain, verbose = TRUE)
    #> <lgb.Dataset> +#> Public: +#> construct: function () +#> create_valid: function (data, info = list(), ...) +#> dim: function () +#> finalize: function () +#> get_colnames: function () +#> getinfo: function (name) +#> initialize: function (data, params = list(), reference = NULL, colnames = NULL, +#> save_binary: function (fname) +#> set_categorical_feature: function (categorical_feature) +#> set_colnames: function (colnames) +#> set_reference: function (reference) +#> setinfo: function (name, info) +#> slice: function (idxset, ...) +#> update_params: function (params) +#> Private: +#> categorical_feature: NULL +#> colnames: X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X ... +#> free_raw_data: TRUE +#> get_handle: function () +#> handle: NULL +#> info: list +#> params: list +#> predictor: NULL +#> raw_data: dgCMatrix +#> reference: NULL +#> set_predictor: function (predictor) +#> used_indices: NULL
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/getinfo.html b/R-package/docs/reference/getinfo.html new file mode 100644 index 000000000000..5ccc1e734dde --- /dev/null +++ b/R-package/docs/reference/getinfo.html @@ -0,0 +1,206 @@ + + + + + + + + +Get information of an lgb.Dataset object — getinfo • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Get information of an lgb.Dataset object

    + + +
    getinfo(dataset, ...)
    +
    +# S3 method for lgb.Dataset
    +getinfo(dataset, name, ...)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    dataset

    Object of class lgb.Dataset

    ...

    other parameters

    name

    the name of the information field to get (see details)

    + +

    Value

    + +

    info data

    + +

    Details

    + +

    The name field can be one of the following:

    • label: the label LightGBM learns from;
    • weight: per-record weights used to rescale the loss;
    • group: group sizes (e.g. query sizes for ranking tasks);
    • init_score: the initial score, i.e. the base prediction LightGBM will boost from

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.construct(dtrain)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +labels <- lightgbm::getinfo(dtrain, "label") +lightgbm::setinfo(dtrain, "label", 1 - labels) + +labels2 <- lightgbm::getinfo(dtrain, "label") +stopifnot(all(labels2 == 1 - labels))
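# Additional sketch (not in the original example): the other fields listed under
# Details can be set and read the same way. The weight values here are made up
# purely for illustration.
w <- rep(1, length(labels))
w[labels == 1] <- 2                      # hypothetical: upweight positive rows
lightgbm::setinfo(dtrain, "weight", w)
head(lightgbm::getinfo(dtrain, "weight"))

# init_score: the base prediction LightGBM boosts from
p <- mean(labels)
lightgbm::setinfo(dtrain, "init_score", rep(log(p / (1 - p)), length(labels)))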
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/index.html b/R-package/docs/reference/index.html new file mode 100644 index 000000000000..9f9e16bf1c88 --- /dev/null +++ b/R-package/docs/reference/index.html @@ -0,0 +1,407 @@ + + + + + + + + +Function reference • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + +
    +

    Dataset

    +

    Datasets included with the R package

    +
    +

    data

    +

    Test part from Mushroom Data Set

    +

    data

    +

    Training part from Mushroom Data Set

    +

    data

    +

    Bank Marketing Data Set

    +

    Data Input / Output

    +

    Data I/O required for LightGBM

    +
    +

    dim

    +

    Dimensions of an lgb.Dataset

    +

    dimnames dimnames<-

    +

    Handling of column names of lgb.Dataset

    +

    getinfo

    +

    Get information of an lgb.Dataset object

    +

    setinfo

    +

    Set information of an lgb.Dataset object

    +

    slice

    +

    Slice a dataset

    +

    lgb.Dataset.construct

    +

    Construct Dataset explicitly

    +

    lgb.Dataset.create.valid

    +

    Construct validation data

    +

    lgb.Dataset

    +

    Construct lgb.Dataset object

    +

    lgb.Dataset.save

    +

    Save lgb.Dataset to a binary file

    +

    lgb.Dataset.set.categorical

    +

    Set categorical feature of lgb.Dataset

    +

    lgb.Dataset.set.reference

    +

    Set reference of lgb.Dataset

    +

    Machine Learning

    +

    Train models with LightGBM

    +
    +

    lgb.prepare

    +

    Data preparator for LightGBM datasets (numeric)

    +

    lgb.prepare2

    +

    Data preparator for LightGBM datasets (integer)

    +

    lgb.prepare_rules

    +

    Data preparator for LightGBM datasets with rules (numeric)

    +

    lgb.prepare_rules2

    +

    Data preparator for LightGBM datasets with rules (integer)

    +

    lgb.cv lgb.train lightgbm

    +

    Main CV logic for LightGBM

    +

    Saving / Loading Models

    +

    Save and Load LightGBM models

    +
    +

    lgb.dump

    +

    Dump LightGBM model to json

    +

    lgb.load

    +

    Load LightGBM model

    +

    lgb.model.dt.tree

    +

    Parse a LightGBM model json dump

    +

    lgb.save

    +

    Save LightGBM model

    +

    predict

    +

    Predict method for LightGBM model

    +

    readRDS.lgb.Booster

    +

    readRDS for lgb.Booster models

    +

    saveRDS.lgb.Booster

    +

    saveRDS for lgb.Booster models

    +

    Predictive Analysis

    +

    Analyze your predictions

    +
    +

    lgb.get.eval.result

    +

    Get record evaluation result from booster

    +

    lgb.importance

    +

    Compute feature importance in a model

    +

    lgb.interprete

    +

    Compute feature contribution of prediction

    +

    lgb.plot.importance

    +

    Plot feature importance as a bar graph

    +

    lgb.plot.interpretation

    +

    Plot feature contribution as a bar graph

    +

    Miscellaneous

    +

    Ungroupable functions to troubleshoot LightGBM

    +
    +

    lgb.unloader

    +

    LightGBM unloading error fix

    +
    +
    + + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.construct.html b/R-package/docs/reference/lgb.Dataset.construct.html new file mode 100644 index 000000000000..07cddef1377f --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.construct.html @@ -0,0 +1,173 @@ + + + + + + + + +Construct Dataset explicitly — lgb.Dataset.construct • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Construct Dataset explicitly

    + + +
    lgb.Dataset.construct(dataset)
    + +

    Arguments

    + + + + + + +
    dataset

    Object of class lgb.Dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.construct(dtrain)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.create.valid.html b/R-package/docs/reference/lgb.Dataset.create.valid.html new file mode 100644 index 000000000000..fc107aec7ab3 --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.create.valid.html @@ -0,0 +1,193 @@ + + + + + + + + +Construct validation data — lgb.Dataset.create.valid • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Construct validation data according to training data

    + + +
    lgb.Dataset.create.valid(dataset, data, info = list(), ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    lgb.Dataset object, training data

    data

    a matrix object, a dgCMatrix object or a character representing a filename

    info

    a list of information of the lgb.Dataset object

    ...

    other information to pass to info.

    + +

    Value

    + +

    constructed dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.data b/R-package/docs/reference/lgb.Dataset.data new file mode 100644 index 000000000000..836fab9305c3 Binary files /dev/null and b/R-package/docs/reference/lgb.Dataset.data differ diff --git a/R-package/docs/reference/lgb.Dataset.html b/R-package/docs/reference/lgb.Dataset.html new file mode 100644 index 000000000000..e73f6a6fdfa4 --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.html @@ -0,0 +1,210 @@ + + + + + + + + +Construct lgb.Dataset object — lgb.Dataset • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Construct lgb.Dataset object from dense matrix, sparse matrix +or local file (that was created previously by saving an lgb.Dataset).

    + + +
    lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
    +  categorical_feature = NULL, free_raw_data = TRUE, info = list(), ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    data

    a matrix object, a dgCMatrix object or a character representing a filename

    params

    a list of parameters

    reference

    reference dataset

    colnames

    names of columns

    categorical_feature

    categorical features

    free_raw_data

    TRUE if the raw data should be freed after constructing the lgb.Dataset

    info

    a list of information of the lgb.Dataset object

    ...

    other information to pass to info, or parameters to pass to params

    + +

    Value

    + +

    constructed dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.save(dtrain, "lgb.Dataset.data")
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    dtrain <- lgb.Dataset("lgb.Dataset.data") +lgb.Dataset.construct(dtrain)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromFile_R" not available for .Call() for package "lib_lightgbm"
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.save.html b/R-package/docs/reference/lgb.Dataset.save.html new file mode 100644 index 000000000000..f2ba8c8f2433 --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.save.html @@ -0,0 +1,184 @@ + + + + + + + + +Save <code>lgb.Dataset</code> to a binary file — lgb.Dataset.save • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Save lgb.Dataset to a binary file

    + + +
    lgb.Dataset.save(dataset, fname)
    + +

    Arguments

    + + + + + + + + + + +
    dataset

    object of class lgb.Dataset

    fname

    filename of the output file

    + +

    Value

    + +

    passed dataset

    + + +

    Examples

    +
    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.save(dtrain, "data.bin")
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.set.categorical.html b/R-package/docs/reference/lgb.Dataset.set.categorical.html new file mode 100644 index 000000000000..005ad1ae0b13 --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.set.categorical.html @@ -0,0 +1,184 @@ + + + + + + + + +Set categorical feature of <code>lgb.Dataset</code> — lgb.Dataset.set.categorical • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Set categorical feature of lgb.Dataset

    + + +
    lgb.Dataset.set.categorical(dataset, categorical_feature)
    + +

    Arguments

    + + + + + + + + + + +
    dataset

    object of class lgb.Dataset

    categorical_feature

    categorical features

    + +

    Value

    + +

    passed dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.save(dtrain, "lgb.Dataset.data")
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    dtrain <- lgb.Dataset("lgb.Dataset.data") +lgb.Dataset.set.categorical(dtrain, 1:2)
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.Dataset.set.reference.html b/R-package/docs/reference/lgb.Dataset.set.reference.html new file mode 100644 index 000000000000..cf60db01cd0d --- /dev/null +++ b/R-package/docs/reference/lgb.Dataset.set.reference.html @@ -0,0 +1,186 @@ + + + + + + + + +Set reference of <code>lgb.Dataset</code> — lgb.Dataset.set.reference • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    If you want to use validation data, you should set reference to training data

    + + +
    lgb.Dataset.set.reference(dataset, reference)
    + +

    Arguments

    + + + + + + + + + + +
    dataset

    object of class lgb.Dataset

    reference

    object of class lgb.Dataset

    + +

    Value

    + +

    passed dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package ="lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset(test$data, test = train$label) +lgb.Dataset.set.reference(dtest, dtrain)
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.dump.html b/R-package/docs/reference/lgb.dump.html new file mode 100644 index 000000000000..d9fd185005e0 --- /dev/null +++ b/R-package/docs/reference/lgb.dump.html @@ -0,0 +1,194 @@ + + + + + + + + +Dump LightGBM model to json — lgb.dump • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Dump LightGBM model to json

    + + +
    lgb.dump(booster, num_iteration = NULL)
    + +

    Arguments

    + + + + + + + + + + +
    booster

    Object of class lgb.Booster

    num_iteration

    number of iterations to predict with; NULL or <= 0 means use the best iteration

    + +

    Value

    + +

    json format of model

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    json_model <- lgb.dump(model)
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.get.eval.result.html b/R-package/docs/reference/lgb.get.eval.result.html new file mode 100644 index 000000000000..45340a43ae72 --- /dev/null +++ b/R-package/docs/reference/lgb.get.eval.result.html @@ -0,0 +1,207 @@ + + + + + + + + +Get record evaluation result from booster — lgb.get.eval.result • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Get record evaluation result from booster

    + + +
    lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
    +  is_err = FALSE)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    booster

    Object of class lgb.Booster

    data_name

    name of dataset

    eval_name

    name of evaluation

    iters

    iterations, NULL will return all

    is_err

    TRUE will return evaluation error instead

    + +

    Value

    + +

    vector of evaluation result

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    lgb.get.eval.result(model, "test", "l2")
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.importance.html b/R-package/docs/reference/lgb.importance.html new file mode 100644 index 000000000000..2c1f8e4639ce --- /dev/null +++ b/R-package/docs/reference/lgb.importance.html @@ -0,0 +1,194 @@ + + + + + + + + +Compute feature importance in a model — lgb.importance • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Creates a data.table of feature importances in a model.

    + + +
    lgb.importance(model, percentage = TRUE)
    + +

    Arguments

    + + + + + + + + + + +
    model

    object of class lgb.Booster.

    percentage

    whether to show importance in relative percentage.

    + +

    Value

    + +

    For a tree model, a data.table with the following columns:

    • Feature: Feature names in the model.
    • Gain: The total gain of this feature's splits.
    • Cover: The number of observations related to this feature.
    • Frequency: The number of times a feature is used in a split across all trees.

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) + +params = list(objective = "binary", + learning_rate = 0.01, num_leaves = 63, max_depth = -1, + min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) + model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +tree_imp1 <- lgb.importance(model, percentage = TRUE)
    #> Error in inherits(model, "lgb.Booster"): object 'model' not found
    tree_imp2 <- lgb.importance(model, percentage = FALSE)
    #> Error in lgb.importance(model, percentage = FALSE): object 'model' not found
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.interprete.html b/R-package/docs/reference/lgb.interprete.html new file mode 100644 index 000000000000..7815019e3f30 --- /dev/null +++ b/R-package/docs/reference/lgb.interprete.html @@ -0,0 +1,204 @@ + + + + + + + + +Compute feature contribution of prediction — lgb.interprete • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Computes feature contribution components of rawscore prediction.

    + + +
    lgb.interprete(model, data, idxset, num_iteration = NULL)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    model

    object of class lgb.Booster.

    data

    a matrix object or a dgCMatrix object.

    idxset

    an integer vector of indices of the rows needed.

    num_iteration

    number of iterations to predict with; NULL or <= 0 means use the best iteration.

    + +

    Value

    + +

    For regression, binary classification and lambdarank models, a list of data.table objects with the following columns:

    • Feature: Feature names in the model.
    • Contribution: The total contribution of this feature's splits.

    For multiclass classification, a list of data.table with the Feature column and Contribution columns to each class.

    + + +

    Examples

    +
    library(lightgbm) +Sigmoid <- function(x) 1 / (1 + exp(-x)) +Logit <- function(x) log(x / (1 - x)) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test + +params = list(objective = "binary", + learning_rate = 0.01, num_leaves = 63, max_depth = -1, + min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) + model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +tree_interpretation <- lgb.interprete(model, test$data, 1:5)
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.load.html b/R-package/docs/reference/lgb.load.html new file mode 100644 index 000000000000..e7ffe4f9903f --- /dev/null +++ b/R-package/docs/reference/lgb.load.html @@ -0,0 +1,196 @@ + + + + + + + + +Load LightGBM model — lgb.load • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Load a LightGBM model from a saved model file or from a string. lgb.load takes either a file path or a model string; if both are provided, it defaults to loading from the file.

    + + +
    lgb.load(filename = NULL, model_str = NULL)
    + +

    Arguments

    + + + + + + + + + + +
    filename

    path of model file

    model_str

    a string containing the model

    + +

    Value

    + +

    lgb.Booster

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    lgb.save(model, "model.txt")
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    load_booster <- lgb.load(filename = "model.txt")
    #> Error in lgb.load(filename = "model.txt"): lgb.load: file does not exist for supplied filename
    model_string <- model$save_model_to_string(NULL) # saves best iteration
    #> Error in eval(expr, envir, enclos): object 'model' not found
    load_booster_from_str <- lgb.load(model_str = model_string)
    #> Error in lgb.load(model_str = model_string): object 'model_string' not found
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/docs/reference/lgb.model.dt.tree.html b/R-package/docs/reference/lgb.model.dt.tree.html new file mode 100644 index 000000000000..44b306de7c53 --- /dev/null +++ b/R-package/docs/reference/lgb.model.dt.tree.html @@ -0,0 +1,203 @@ + + + + + + + + +Parse a LightGBM model json dump — lgb.model.dt.tree • lightgbm + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    +
    + + + +

    Parse a LightGBM model json dump into a data.table structure.

    + + +
    lgb.model.dt.tree(model, num_iteration = NULL)
    + +

    Arguments

    + + + + + + +
    model

    object of class lgb.Booster

    + +

    Value

    + +

    A data.table with detailed information about model trees' nodes and leafs.

    +

    The columns of the data.table are:

    • tree_index: ID of a tree in a model (integer)
    • split_index: ID of a node in a tree (integer)
    • split_feature: for a node, the feature name (character); for a leaf, it is simply labelled "NA"
    • node_parent: ID of the parent node for the current node (integer)
    • leaf_index: ID of a leaf in a tree (integer)
    • leaf_parent: ID of the parent node for the current leaf (integer)
    • split_gain: Split gain of a node
    • threshold: Splitting threshold value of a node
    • decision_type: Decision type of a node
    • default_left: Determines how to handle NA values: TRUE -> Left, FALSE -> Right
    • internal_value: Node value
    • internal_count: The number of observations collected by a node
    • leaf_value: Leaf value
    • leaf_count: The number of observations collected by a leaf

    Examples

    +
    library(lightgbm) + +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) + +params = list(objective = "binary", + learning_rate = 0.01, num_leaves = 63, max_depth = -1, + min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) + model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +tree_dt <- lgb.model.dt.tree(model)
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.plot.importance-2.png b/R-package/docs/reference/lgb.plot.importance-2.png new file mode 100644 index 000000000000..fa2235b02093 Binary files /dev/null and b/R-package/docs/reference/lgb.plot.importance-2.png differ diff --git a/R-package/docs/reference/lgb.plot.importance-3.png b/R-package/docs/reference/lgb.plot.importance-3.png new file mode 100644 index 000000000000..fa2235b02093 Binary files /dev/null and b/R-package/docs/reference/lgb.plot.importance-3.png differ diff --git a/R-package/docs/reference/lgb.plot.importance.html b/R-package/docs/reference/lgb.plot.importance.html new file mode 100644 index 000000000000..d30fe66a18cc --- /dev/null +++ b/R-package/docs/reference/lgb.plot.importance.html @@ -0,0 +1,208 @@ +Plot feature importance as a bar graph — lgb.plot.importance • lightgbm

    Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.

    + + +
    lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain",
    +  left_margin = 10, cex = NULL)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    tree_imp

    a data.table returned by lgb.importance.

    top_n

maximal number of top features to include in the plot.

measure

the name of the importance measure to plot, can be "Gain", "Cover" or "Frequency".

left_margin

(base R barplot) allows adjusting the left margin size to fit feature names.

    cex

    (base R barplot) passed as cex.names parameter to barplot.

    + +

    Value

    + +

    The lgb.plot.importance function creates a barplot +and silently returns a processed data.table with top_n features sorted by defined importance.

    + +

    Details

    + +

    The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature. +Features are shown ranked in a decreasing importance order.
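A short sketch of the typical call (assuming model is a trained lgb.Booster and tree_imp was computed with lgb.importance); the processed table is returned invisibly and can be captured:

tree_imp <- lgb.importance(model, percentage = TRUE)
top_features <- lgb.plot.importance(tree_imp, top_n = 5, measure = "Cover", left_margin = 12)
top_features  # the top_n rows, sorted by the chosen measure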

    + + +

    Examples

    +
    data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) + +params = list(objective = "binary", + learning_rate = 0.01, num_leaves = 63, max_depth = -1, + min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) + model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +tree_imp <- lgb.importance(model, percentage = TRUE)
    #> Error in lgb.importance(model, percentage = TRUE): object 'model' not found
    lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
    #> Error in nrow(tree_imp): object 'tree_imp' not found
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.plot.interpretation-2.png b/R-package/docs/reference/lgb.plot.interpretation-2.png new file mode 100644 index 000000000000..91174065aeb9 Binary files /dev/null and b/R-package/docs/reference/lgb.plot.interpretation-2.png differ diff --git a/R-package/docs/reference/lgb.plot.interpretation-3.png b/R-package/docs/reference/lgb.plot.interpretation-3.png new file mode 100644 index 000000000000..91174065aeb9 Binary files /dev/null and b/R-package/docs/reference/lgb.plot.interpretation-3.png differ diff --git a/R-package/docs/reference/lgb.plot.interpretation.html b/R-package/docs/reference/lgb.plot.interpretation.html new file mode 100644 index 000000000000..77dbb4675d35 --- /dev/null +++ b/R-package/docs/reference/lgb.plot.interpretation.html @@ -0,0 +1,213 @@ +Plot feature contribution as a bar graph — lgb.plot.interpretation • lightgbm

    Plot previously calculated feature contribution as a bar graph.

    + + +
    lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1,
    +  left_margin = 10, cex = NULL)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    tree_interpretation_dt

    a data.table returned by lgb.interprete.

    top_n

maximal number of top features to include in the plot.

cols

the number of columns in the plot layout; used only for multiclass classification feature contributions.

left_margin

(base R barplot) allows adjusting the left margin size to fit feature names.

    cex

    (base R barplot) passed as cex.names parameter to barplot.

    + +

    Value

    + +

    The lgb.plot.interpretation function creates a barplot.

    + +

    Details

    + +

    The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. +Features are shown ranked in a decreasing contribution order.

    + + +

    Examples

    +
    library(lightgbm) +Sigmoid <- function(x) {1 / (1 + exp(-x))} +Logit <- function(x) {log(x / (1 - x))} +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test + +params = list(objective = "binary", + learning_rate = 0.01, num_leaves = 63, max_depth = -1, + min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) + model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    model <- lgb.train(params, dtrain, 20)
    #> Error in .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm"): "LGBM_DatasetCreateFromCSC_R" not available for .Call() for package "lib_lightgbm"
    +tree_interpretation <- lgb.interprete(model, test$data, 1:5)
    #> Error in c("R6", name) %in% class(object): object 'model' not found
    lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10)
    #> Error in ncol(tree_interpretation_dt): object 'tree_interpretation' not found
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.prepare.html b/R-package/docs/reference/lgb.prepare.html new file mode 100644 index 000000000000..0cb0cfc0ed52 --- /dev/null +++ b/R-package/docs/reference/lgb.prepare.html @@ -0,0 +1,216 @@ +Data preparator for LightGBM datasets (numeric) — lgb.prepare • lightgbm

Attempts to prepare a clean dataset for use in a lgb.Dataset. Factors and characters are converted to numeric (not integer). Please use lgb.prepare_rules if you want to apply the same transformation to other datasets.

    + + +
    lgb.prepare(data)
    + +

    Arguments

    + + + + + + +
    data

    A data.frame or data.table to prepare.

    + +

    Value

    + +

    The cleaned dataset. It must be converted to a matrix format (as.matrix) for input in lgb.Dataset.
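A minimal sketch of that conversion chain (the object names clean_iris, mat and dtrain are illustrative):

clean_iris <- lgb.prepare(data = iris)
mat <- as.matrix(clean_iris)
# use the first four columns as features and Species (now numeric) as label
dtrain <- lgb.Dataset(data = mat[, 1:4], label = mat[, 5] - 1)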

    + + +

    Examples

    +
    library(lightgbm) +data(iris) + +str(iris)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... + +str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : num 1 1 1 1 1 1 1 1 1 1 ... + +# When lightgbm package is installed, and you do not want to load it +# You can still use the function! +lgb.unloader() +str(lightgbm::lgb.prepare(data = iris))
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : num 1 1 1 1 1 1 1 1 1 1 ... +
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.prepare2.html b/R-package/docs/reference/lgb.prepare2.html new file mode 100644 index 000000000000..b0a44a1d24aa --- /dev/null +++ b/R-package/docs/reference/lgb.prepare2.html @@ -0,0 +1,217 @@ +Data preparator for LightGBM datasets (integer) — lgb.prepare2 • lightgbm

Attempts to prepare a clean dataset for use in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use lgb.prepare_rules2 if you want to apply the same transformation to other datasets. This is useful if you specifically need an integer dataset instead of a numeric dataset. Note that some programs do not support integer-only input. Consider this a memory-halving technique which can be dangerous, especially for LightGBM.

    + + +
    lgb.prepare2(data)
    + +

    Arguments

    + + + + + + +
    data

    A data.frame or data.table to prepare.

    + +

    Value

    + +

    The cleaned dataset. It must be converted to a matrix format (as.matrix) for input in lgb.Dataset.

    + + +

    Examples

    +
    library(lightgbm) +data(iris) + +str(iris)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... + +str(lgb.prepare2(data = iris)) # Convert all factors/chars to integer
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : int 1 1 1 1 1 1 1 1 1 1 ... + +# When lightgbm package is installed, and you do not want to load it +# You can still use the function! +lgb.unloader() +str(lightgbm::lgb.prepare2(data = iris))
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : int 1 1 1 1 1 1 1 1 1 1 ... + +
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.prepare_rules.html b/R-package/docs/reference/lgb.prepare_rules.html new file mode 100644 index 000000000000..68fec978b0d7 --- /dev/null +++ b/R-package/docs/reference/lgb.prepare_rules.html @@ -0,0 +1,246 @@ +Data preparator for LightGBM datasets with rules (numeric) — lgb.prepare_rules • lightgbm

Attempts to prepare a clean dataset for use in a lgb.Dataset. Factors and characters are converted to numeric. In addition, the rules created are kept so you can convert other datasets with the same converter.

    + + +
    lgb.prepare_rules(data, rules = NULL)
    + +

    Arguments

    + + + + + + + + + + +
    data

    A data.frame or data.table to prepare.

    rules

    A set of rules from the data preparator, if already used.

    + +

    Value

    + +

    A list with the cleaned dataset (data) and the rules (rules). The data must be converted to a matrix format (as.matrix) for input in lgb.Dataset.

    + + +

    Examples

    +
    library(lightgbm) +data(iris) + +str(iris)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... + +new_iris <- lgb.prepare_rules(data = iris) # Autoconverter +str(new_iris$data)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : num 1 1 1 1 1 1 1 1 1 1 ... + +data(iris) # Erase iris dataset +iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
    #> Warning: invalid factor level, NA generated
    # Warning message: +# In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : +# invalid factor level, NA generated + +# Use conversion using known rules +# Unknown factors become 0, excellent for sparse datasets +newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules) + +# Unknown factor is now zero, perfect for sparse datasets +newer_iris$data[1, ] # Species became 0 as it is an unknown factor
    #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> 1 5.1 3.5 1.4 0.2 0
    # Sepal.Length Sepal.Width Petal.Length Petal.Width Species +# 1 5.1 3.5 1.4 0.2 0 + +newer_iris$data[1, 5] <- 1 # Put back real initial value + +# Is the newly created dataset equal? YES! +all.equal(new_iris$data, newer_iris$data)
    #> [1] TRUE
    # [1] TRUE + +# Can we test our own rules? +data(iris) # Erase iris dataset + +# We remapped values differently +personal_rules <- list(Species = c("setosa" = 3, + "versicolor" = 2, + "virginica" = 1)) +newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules) +str(newest_iris$data) # SUCCESS!
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : num 0 3 3 3 3 3 3 3 3 3 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : num 3 3 3 3 3 3 3 3 3 3 ... + +
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.prepare_rules2.html b/R-package/docs/reference/lgb.prepare_rules2.html new file mode 100644 index 000000000000..bfa1a169974a --- /dev/null +++ b/R-package/docs/reference/lgb.prepare_rules2.html @@ -0,0 +1,246 @@ +Data preparator for LightGBM datasets with rules (integer) — lgb.prepare_rules2 • lightgbm

Attempts to prepare a clean dataset for use in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, the rules created are kept so you can convert other datasets with the same converter. This is useful if you specifically need an integer dataset instead of a numeric dataset. Note that some programs do not support integer-only input. Consider this a memory-halving technique which can be dangerous, especially for LightGBM.

    + + +
    lgb.prepare_rules2(data, rules = NULL)
    + +

    Arguments

    + + + + + + + + + + +
    data

    A data.frame or data.table to prepare.

    rules

    A set of rules from the data preparator, if already used.

    + +

    Value

    + +

    A list with the cleaned dataset (data) and the rules (rules). The data must be converted to a matrix format (as.matrix) for input in lgb.Dataset.

    + + +

    Examples

    +
    library(lightgbm) +data(iris) + +str(iris)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ... + +new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter +str(new_iris$data)
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : int 1 1 1 1 1 1 1 1 1 1 ... + +data(iris) # Erase iris dataset +iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
    #> Warning: invalid factor level, NA generated
    # Warning message: +# In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, : +# invalid factor level, NA generated + +# Use conversion using known rules +# Unknown factors become 0, excellent for sparse datasets +newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules) + +# Unknown factor is now zero, perfect for sparse datasets +newer_iris$data[1, ] # Species became 0 as it is an unknown factor
    #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> 1 5.1 3.5 1.4 0.2 0
    # Sepal.Length Sepal.Width Petal.Length Petal.Width Species +# 1 5.1 3.5 1.4 0.2 0 + +newer_iris$data[1, 5] <- 1 # Put back real initial value + +# Is the newly created dataset equal? YES! +all.equal(new_iris$data, newer_iris$data)
    #> [1] TRUE
    # [1] TRUE + +# Can we test our own rules? +data(iris) # Erase iris dataset + +# We remapped values differently +personal_rules <- list(Species = c("setosa" = 3L, + "versicolor" = 2L, + "virginica" = 1L)) +newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) +str(newest_iris$data) # SUCCESS!
    #> 'data.frame': 150 obs. of 5 variables: +#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Species : int 0 3 3 3 3 3 3 3 3 3 ...
    # 'data.frame': 150 obs. of 5 variables: +# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... +# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... +# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... +# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +# $ Species : int 3 3 3 3 3 3 3 3 3 3 ... + +
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.save.html b/R-package/docs/reference/lgb.save.html new file mode 100644 index 000000000000..5195e250dfc6 --- /dev/null +++ b/R-package/docs/reference/lgb.save.html @@ -0,0 +1,210 @@ +Save LightGBM model — lgb.save • lightgbm

    Save LightGBM model

    + + +
    lgb.save(booster, filename, num_iteration = NULL)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    booster

    Object of class lgb.Booster

    filename

    saved filename

    num_iteration

number of iterations to save, NULL or <= 0 means use the best iteration

    + +

    Value

    + +

    lgb.Booster

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
    lgb.save(model, "model.txt")
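A small follow-up sketch, assuming the model saved above: the text file can be reloaded with lgb.load and used for prediction.

model2 <- lgb.load("model.txt")
preds <- predict(model2, test$data)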
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.train.html b/R-package/docs/reference/lgb.train.html new file mode 100644 index 000000000000..7d99714851d4 --- /dev/null +++ b/R-package/docs/reference/lgb.train.html @@ -0,0 +1,438 @@ +Main CV logic for LightGBM — lgb.cv • lightgbm

    Main CV logic for LightGBM

    +

    Main training logic for LightGBM

    +

Simple interface for training a lightgbm model. Its documentation is combined with lgb.train.

    + + +
    lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
    +  weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
    +  eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
    +  init_model = NULL, colnames = NULL, categorical_feature = NULL,
    +  early_stopping_rounds = NULL, callbacks = list(), ...)
    +
    +lgb.train(params = list(), data, nrounds = 10, valids = list(),
    +  obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
    +  init_model = NULL, colnames = NULL, categorical_feature = NULL,
    +  early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE,
    +  ...)
    +
    +lightgbm(data, label = NULL, weight = NULL, params = list(),
    +  nrounds = 10, verbose = 1, eval_freq = 1L,
    +  early_stopping_rounds = NULL, save_name = "lightgbm.model",
    +  init_model = NULL, callbacks = list(), ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    params

    List of parameters

    data

    a lgb.Dataset object, used for CV

    nrounds

    number of CV rounds

    nfold

    the original dataset is randomly partitioned into nfold equal size subsamples.

    label

    vector of response values. Should be provided only when data is an R-matrix.

    weight

vector of weights. If not NULL, it will be set on the dataset

obj

objective function, can be character or custom objective function. Examples include regression, regression_l1, huber, binary, lambdarank, multiclass

    eval

    evaluation function, can be (list of) character or custom eval function

    verbose

verbosity for output; if <= 0, it will also disable printing of evaluation during training

    record

    Boolean, TRUE will record iteration message to booster$record_evals

    eval_freq

evaluation output frequency, only effective when verbose > 0

    showsd

    boolean, whether to show standard deviation of cross validation

    stratified

    a boolean indicating whether sampling of folds should be stratified +by the values of outcome labels.

    folds

    list provides a possibility to use a list of pre-defined CV folds +(each element must be a vector of test fold's indices). When folds are supplied, +the nfold and stratified parameters are ignored.

    init_model

path to a model file or an lgb.Booster object; training will continue from this model

    colnames

    feature names, if not null, will use this to overwrite the names in dataset

    categorical_feature

    list of str or int +type int represents index, +type str represents feature names

    early_stopping_rounds

int. Activates early stopping. Requires at least one validation dataset and one metric. If there is more than one, all of them will be checked. Returns the model with (best_iter + early_stopping_rounds) iterations. If early stopping occurs, the model will have a 'best_iter' field

    callbacks

    list of callback functions +List of callback functions that are applied at each iteration.

    ...

other parameters, see Parameters.rst for more information

    valids

    a list of lgb.Dataset objects, used for validation

    reset_data

    Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets

    boosting

    boosting type. gbdt, dart

    num_leaves

    number of leaves in one tree. defaults to 127

    max_depth

Limit the max depth of the tree model. This is used to deal with overfitting when #data is small. The tree still grows leaf-wise.

num_threads

Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPUs use hyper-threading to generate 2 threads per CPU core).

    params

    List of parameters

    data

    a lgb.Dataset object, used for training

    nrounds

    number of training rounds

    obj

objective function, can be character or custom objective function. Examples include regression, regression_l1, huber, binary, lambdarank, multiclass

    boosting

    boosting type. gbdt, dart

    num_leaves

    number of leaves in one tree. defaults to 127

    max_depth

Limit the max depth of the tree model. This is used to deal with overfitting when #data is small. The tree still grows leaf-wise.

num_threads

Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPUs use hyper-threading to generate 2 threads per CPU core).

    eval

    evaluation function, can be (a list of) character or custom eval function

    verbose

verbosity for output; if <= 0, it will also disable printing of evaluation during training

    record

    Boolean, TRUE will record iteration message to booster$record_evals

    eval_freq

evaluation output frequency, only effective when verbose > 0

    init_model

path to a model file or an lgb.Booster object; training will continue from this model

    colnames

    feature names, if not null, will use this to overwrite the names in dataset

    categorical_feature

    list of str or int +type int represents index, +type str represents feature names

    early_stopping_rounds

int. Activates early stopping. Requires at least one validation dataset and one metric. If there is more than one, all of them will be checked. Returns the model with (best_iter + early_stopping_rounds) iterations. If early stopping occurs, the model will have a 'best_iter' field

    callbacks

    list of callback functions +List of callback functions that are applied at each iteration.

    ...

other parameters, see Parameters.rst for more information

    + +

    Value

    + +

    a trained model lgb.CVBooster.

    +

    a trained booster model lgb.Booster.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +params <- list(objective = "regression", metric = "l2") +model <- lgb.cv(params, + dtrain, + 10, + nfold = 5, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: valid's l2:0.000460829+0.000921659 +#> [2]: valid's l2:0.000460829+0.000921659 +#> [3]: valid's l2:0.000460829+0.000921659 +#> [4]: valid's l2:0.000460829+0.000921659 +#> [5]: valid's l2:0.000460829+0.000921659 +#> [6]: valid's l2:0.000460829+0.000921659 +#> [7]: valid's l2:0.000460829+0.000921659 +#> [8]: valid's l2:0.000460829+0.000921659 +#> [9]: valid's l2:0.000460829+0.000921659 +#> [10]: valid's l2:0.000460829+0.000921659
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
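A small follow-up sketch, assuming the early-stopped booster above: the 'best_iter' field mentioned under early_stopping_rounds can be used to predict with the best iteration.

best <- model$best_iter
preds <- predict(model, test$data, num_iteration = best)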
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/lgb.unloader.html b/R-package/docs/reference/lgb.unloader.html new file mode 100644 index 000000000000..a1fbd6454555 --- /dev/null +++ b/R-package/docs/reference/lgb.unloader.html @@ -0,0 +1,220 @@ +LightGBM unloading error fix — lgb.unloader • lightgbm

    Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.

    + + +
    lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    wipe

    Whether to wipe all lgb.Dataset and lgb.Booster from the global environment. Defaults to FALSE which means to not remove them.

    envir

    The environment to perform wiping on if wipe == TRUE. Defaults to .GlobalEnv which is the global environment.

restore

Whether to reload LightGBM immediately after detaching it from R. Defaults to TRUE, which means LightGBM is automatically reloaded once unloading is performed.

    + +

    Value

    + +

    NULL invisibly.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
    +# Disabled the following line as it crashes the documentation generator +# lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) +rm(model, dtrain, dtest) # Not needed if wipe = TRUE +gc() # Not needed if wipe = TRUE
    #> used (Mb) gc trigger (Mb) max used (Mb) +#> Ncells 1641119 87.7 2637877 140.9 2637877 140.9 +#> Vcells 3135706 24.0 5721718 43.7 4701429 35.9
    +# Disabled the following line as it crashes the documentation generator +# library(lightgbm) +# Do whatever you want again with LightGBM without object clashing +
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/predict.lgb.Booster.html b/R-package/docs/reference/predict.lgb.Booster.html new file mode 100644 index 000000000000..69d79ede7036 --- /dev/null +++ b/R-package/docs/reference/predict.lgb.Booster.html @@ -0,0 +1,237 @@ +Predict method for LightGBM model — predict.lgb.Booster • lightgbm

    Predicted values based on class lgb.Booster

    + + +
    # S3 method for lgb.Booster
    +predict(object, data, num_iteration = NULL,
    +  rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE,
    +  ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    object

    Object of class lgb.Booster

    data

    a matrix object, a dgCMatrix object or a character representing a filename

    num_iteration

number of iterations to predict with, NULL or <= 0 means use the best iteration

rawscore

whether the prediction should be returned in the form of the original untransformed sum of predictions from boosting iterations' results. E.g., setting rawscore = TRUE for logistic regression would result in predictions for log-odds instead of probabilities.

predleaf

whether to predict leaf indices instead.

header

only used for prediction from a text file. TRUE if the text file has a header

    reshape

    whether to reshape the vector of predictions to a matrix form when there are several +prediction outputs per case.

    + +

    Value

    + +

    For regression or binary classification, it returns a vector of length nrows(data). +For multiclass classification, either a num_class * nrows(data) vector or +a (nrows(data), num_class) dimension matrix is returned, depending on +the reshape value.

    +

    When predleaf = TRUE, the output is a matrix object with the +number of columns corresponding to the number of trees.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
    preds <- predict(model, test$data)
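Two further sketches of the prediction modes described above, using the same model and test data:

pred_raw <- predict(model, test$data, rawscore = TRUE)   # untransformed sums
pred_leaf <- predict(model, test$data, predleaf = TRUE)  # matrix of leaf indices
dim(pred_leaf)  # one column per tree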
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/readRDS.lgb.Booster.html b/R-package/docs/reference/readRDS.lgb.Booster.html new file mode 100644 index 000000000000..5433f5756221 --- /dev/null +++ b/R-package/docs/reference/readRDS.lgb.Booster.html @@ -0,0 +1,207 @@ +readRDS for lgb.Booster models — readRDS.lgb.Booster • lightgbm

Attempts to load a model using RDS.

    + + +
    readRDS.lgb.Booster(file = "", refhook = NULL)
    + +

    Arguments

    + + + + + + + + + + +
    file

    a connection or the name of the file where the R object is saved to or read from.

    refhook

    a hook function for handling reference objects.

    + +

    Value

    + +

    lgb.Booster.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
    saveRDS.lgb.Booster(model, "model.rds") +new_model <- readRDS.lgb.Booster("model.rds")
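A small follow-up sketch, assuming the objects above: the reloaded booster should give the same predictions as the original.

preds_orig <- predict(model, test$data)
preds_reloaded <- predict(new_model, test$data)
all.equal(preds_orig, preds_reloaded)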
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/saveRDS.lgb.Booster.html b/R-package/docs/reference/saveRDS.lgb.Booster.html new file mode 100644 index 000000000000..e7ae91fb8a46 --- /dev/null +++ b/R-package/docs/reference/saveRDS.lgb.Booster.html @@ -0,0 +1,227 @@ +saveRDS for lgb.Booster models — saveRDS.lgb.Booster • lightgbm

Attempts to save a model using RDS. Has an additional parameter (raw) which decides whether to save the raw model or not.

    + + +
    saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
    +  compress = TRUE, refhook = NULL, raw = TRUE)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    object

    R object to serialize.

    file

    a connection or the name of the file where the R object is saved to or read from.

    ascii

    a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.

    version

    the workspace format version to use. NULL specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.

    compress

    a logical specifying whether saving to a named file is to use "gzip" compression, or one of "gzip", "bzip2" or "xz" to indicate the type of compression to be used. Ignored if file is a connection.

    refhook

    a hook function for handling reference objects.

    raw

whether to save the model in a raw variable or not; it is recommended to leave this set to TRUE.

    + +

    Value

    + +

    NULL invisibly.

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +data(agaricus.test, package = "lightgbm") +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) +params <- list(objective = "regression", metric = "l2") +valids <- list(test = dtest) +model <- lgb.train(params, + dtrain, + 100, + valids, + min_data = 1, + learning_rate = 1, + early_stopping_rounds = 10)
    #> [1]: test's l2:6.44165e-17 +#> [2]: test's l2:1.97215e-31 +#> [3]: test's l2:0 +#> [4]: test's l2:0 +#> [5]: test's l2:0 +#> [6]: test's l2:0 +#> [7]: test's l2:0 +#> [8]: test's l2:0 +#> [9]: test's l2:0 +#> [10]: test's l2:0 +#> [11]: test's l2:0 +#> [12]: test's l2:0 +#> [13]: test's l2:0
    saveRDS.lgb.Booster(model, "model.rds")
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/setinfo.html b/R-package/docs/reference/setinfo.html new file mode 100644 index 000000000000..5d0fd11f34cc --- /dev/null +++ b/R-package/docs/reference/setinfo.html @@ -0,0 +1,211 @@ +Set information of an lgb.Dataset object — setinfo • lightgbm

    Set information of an lgb.Dataset object

    + + +
    setinfo(dataset, ...)
    +
    +# S3 method for lgb.Dataset
    +setinfo(dataset, name, info, ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    Object of class "lgb.Dataset"

    ...

    other parameters

    name

the name of the field to set

    info

    the specific field of information to set

    + +

    Value

    + +

    passed object

    + +

    Details

    + +

    The name field can be one of the following:

• label: the label that lightgbm learns from ;

• weight: to perform a weight rescale ;

• init_score: the initial score is the base prediction lightgbm will boost from ;

• group.
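A minimal sketch of setting one of the other fields (here weight), assuming a constructed lgb.Dataset named dtrain as in the example below:

labels <- lightgbm::getinfo(dtrain, "label")
lightgbm::setinfo(dtrain, "weight", rep(1, length(labels)))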

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) +lgb.Dataset.construct(dtrain) + +labels <- lightgbm::getinfo(dtrain, "label") +lightgbm::setinfo(dtrain, "label", 1 - labels) + +labels2 <- lightgbm::getinfo(dtrain, "label") +stopifnot(all.equal(labels2, 1 - labels))
    +
    +
    + +
    + + +
diff --git a/R-package/docs/reference/slice.html b/R-package/docs/reference/slice.html new file mode 100644 index 000000000000..18007f1fbd85 --- /dev/null +++ b/R-package/docs/reference/slice.html @@ -0,0 +1,193 @@ +Slice a dataset — slice • lightgbm

Get a new lgb.Dataset containing the specified rows of the original lgb.Dataset object

    + + +
    slice(dataset, ...)
    +
    +# S3 method for lgb.Dataset
    +slice(dataset, idxset, ...)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    dataset

    Object of class "lgb.Dataset"

    ...

    other parameters (currently not used)

    idxset

an integer vector of indices of the rows needed

    + +

    Value

    + +

    constructed sub dataset

    + + +

    Examples

    +
    library(lightgbm) +data(agaricus.train, package = "lightgbm") +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label = train$label) + +dsub <- lightgbm::slice(dtrain, 1:42) +labels <- lightgbm::getinfo(dsub, "label")
    #> Error in dataset$getinfo(name): Cannot perform getinfo before construct Dataset.
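A minimal sketch of the missing step behind the error above: the sliced Dataset has to be constructed before getinfo can be used.

lgb.Dataset.construct(dsub)
labels <- lightgbm::getinfo(dsub, "label")
length(labels)  # expected to be 42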
    +
    +
    + +
    + + +
    + + + diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index 21d1a0a9c32d..23ce158d1d04 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -25,6 +25,7 @@ The columns of the \code{data.table} are: \item \code{split_gain}: Split gain of a node \item \code{threshold}: Spliting threshold value of a node \item \code{decision_type}: Decision type of a node + \item \code{default_left}: Determine how to handle NA value, TRUE -> Left, FALSE -> Right \item \code{internal_value}: Node value \item \code{internal_count}: The number of observation collected by a node \item \code{leaf_value}: Leaf value diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index b72384935e19..f6bc33e34901 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -75,7 +75,7 @@ If early stopping occurs, the model will have 'best_iter' field} \item{callbacks}{list of callback functions List of callback functions that are applied at each iteration.} -\item{...}{other parameters, see parameters.md for more informations} +\item{...}{other parameters, see Parameters.rst for more informations} \item{valids}{a list of \code{lgb.Dataset} objects, used for validation} @@ -135,7 +135,7 @@ If early stopping occurs, the model will have 'best_iter' field} \item{callbacks}{list of callback functions List of callback functions that are applied at each iteration.} -\item{...}{other parameters, see parameters.md for more informations} +\item{...}{other parameters, see Parameters.rst for more informations} } \value{ a trained model \code{lgb.CVBooster}. diff --git a/R-package/man/lgb.unloader.Rd b/R-package/man/lgb.unloader.Rd index 391462ece19e..039245b93542 100644 --- a/R-package/man/lgb.unloader.Rd +++ b/R-package/man/lgb.unloader.Rd @@ -37,11 +37,14 @@ model <- lgb.train(params, min_data = 1, learning_rate = 1, early_stopping_rounds = 10) -lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) + +# Disabled the following line as it crashes the documentation generator +# lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) rm(model, dtrain, dtest) # Not needed if wipe = TRUE gc() # Not needed if wipe = TRUE -library(lightgbm) +# Disabled the following line as it crashes the documentation generator +# library(lightgbm) # Do whatever you want again with LightGBM without object clashing } diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml new file mode 100644 index 000000000000..7e9115617733 --- /dev/null +++ b/R-package/pkgdown/_pkgdown.yml @@ -0,0 +1,120 @@ +template: + params: + bootswatch: united + +authors: + Guolin Ke: + href: https://github.com/guolinke + html: Guolin Ke + Damien Soukhavong: + href: https://github.com/Laurae2 + html: Damien Soukhavong + Yachen Yan: + href: https://github.com/yanyachen + html: Yachen Yan + +site: + root: '' + title: LightGBM, Light Gradient Boosting Machine + +reference: + - title: Dataset + desc: Datasets included with the R package + contents: + - '`agaricus.test`' + - '`agaricus.train`' + - '`bank`' + - title: Data Input / Output + desc: Data I/O required for LightGBM + contents: + - '`dim.lgb.Dataset`' + - '`dimnames.lgb.Dataset`' + - '`getinfo`' + - '`setinfo`' + - '`slice`' + - '`lgb.Dataset.construct`' + - '`lgb.Dataset.create.valid`' + - '`lgb.Dataset`' + - '`lgb.Dataset.save`' + - '`lgb.Dataset.set.categorical`' + - '`lgb.Dataset.set.reference`' + - title: Machine Learning + desc: Train models with LightGBM + contents: + - 
'`lgb.prepare`' + - '`lgb.prepare2`' + - '`lgb.prepare_rules`' + - '`lgb.prepare_rules2`' + - '`lgb.cv`' + - '`lgb.train`' + - title: Saving / Loading Models + desc: Save and Load LightGBM models + contents: + - '`lgb.dump`' + - '`lgb.load`' + - '`lgb.model.dt.tree`' + - '`lgb.save`' + - '`predict.lgb.Booster`' + - '`readRDS.lgb.Booster`' + - '`saveRDS.lgb.Booster`' + - title: Predictive Analysis + desc: Analyze your predictions + contents: + - '`lgb.get.eval.result`' + - '`lgb.importance`' + - '`lgb.interprete`' + - '`lgb.plot.importance`' + - '`lgb.plot.interpretation`' + - title: Miscellaneous + desc: Ungroupable functions to troubleshoot LightGBM + contents: + - '`lgb.unloader`' + +navbar: + title: LightGBM + type: default + left: + - icon: fa-home fa-lg + href: index.html + - text: Reference + href: reference/index.html + - text: Vignettes + menu: + - text: Basic Walkthrough + href: articles/basic_walkthrough.html + - text: Boosting from existing prediction + href: articles/boost_from_prediction.html + - text: Categorical Feature Preparation + href: articles/categorical_features_prepare.html + - text: Categorical Feature Preparation with Rule + href: articles/categorical_features_rules.html + - text: Cross Validation + href: articles/cross_validation.html + - text: Early Stop in training + href: articles/early_stopping.html + - text: Efficiency for Many Model Trainings + href: articles/efficient_many_training.html + - text: Leaf (in)Stability example + href: articles/leaf_stability.html + - text: Multiclass training/prediction + href: articles/multiclass.html + - text: Weight-Parameter adjustment relationship + href: articles/weight_param.html + right: + - icon: fa-github fa-lg + href: https://github.com/Microsoft/LightGBM + +articles: +- title: Vignettes + desc: ~ + contents: + - '`basic_walkthrough`' + - '`boost_from_prediction`' + - '`categorical_features_prepare`' + - '`categorical_features_rules`' + - '`cross_validation`' + - '`early_stopping`' + - '`efficient_many_training`' + - '`leaf_stability`' + - '`multiclass`' + - '`weight_param`' diff --git a/R-package/pkgdown/doc_gen.R b/R-package/pkgdown/doc_gen.R new file mode 100644 index 000000000000..f44f0307e514 --- /dev/null +++ b/R-package/pkgdown/doc_gen.R @@ -0,0 +1,19 @@ +# Load useful libraries for development +library(devtools) +library(roxygen2) # devtools::install_github("klutometis/roxygen") +library(pkgdown) # devtools::install_github("Laurae2/pkgdown") # devtools::install_github("hadley/pkgdown") + +# Set the working directory to where I am +# setwd("E:/GitHub/LightGBM/R-package") + +# Generate documentation +document() + +# Check for errors +devtools::check(document = FALSE) + +# Build static website +pkgdown::build_site(run_dont_run = TRUE) + +# Install package +install() diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd new file mode 100644 index 000000000000..909dc5ed6d13 --- /dev/null +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -0,0 +1,156 @@ +--- +title: "Basic Walkthrough" +output: html_document +--- + +```{r} +require(lightgbm) +require(methods) + +# We load in the agaricus dataset +# In this example, we are aiming to predict whether a mushroom is edible +data(agaricus.train, package = "lightgbm") +data(agaricus.test, package = "lightgbm") +train <- agaricus.train +test <- agaricus.test + +# The loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1} +class(train$label) +class(train$data) + +#--------------------Basic Training using 
lightgbm---------------- +# This is the basic usage of lightgbm you can put matrix in data field +# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input +# Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector) +print("Training lightgbm with sparseMatrix") +bst <- lightgbm(data = train$data, + label = train$label, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary") + +# Alternatively, you can put in dense matrix, i.e. basic R-matrix +print("Training lightgbm with Matrix") +bst <- lightgbm(data = as.matrix(train$data), + label = train$label, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary") + +# You can also put in lgb.Dataset object, which stores label, data and other meta datas needed for advanced features +print("Training lightgbm with lgb.Dataset") +dtrain <- lgb.Dataset(data = train$data, + label = train$label) +bst <- lightgbm(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary") + +# Verbose = 0,1,2 +print("Train lightgbm with verbose 0, no message") +bst <- lightgbm(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary", + verbose = 0) + +print("Train lightgbm with verbose 1, print evaluation metric") +bst <- lightgbm(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary", + verbose = 1) + +print("Train lightgbm with verbose 2, also print information about tree") +bst <- lightgbm(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + nthread = 1, + objective = "binary", + verbose = 2) + +# You can also specify data as file path to a LibSVM/TCV/CSV format input +# Since we do not have this file with us, the following line is just for illustration +# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary") + +#--------------------Basic prediction using lightgbm-------------- +# You can do prediction using the following line +# You can put in Matrix, sparseMatrix, or lgb.Dataset +pred <- predict(bst, test$data) +err <- mean(as.numeric(pred > 0.5) != test$label) +print(paste("test-error=", err)) + +#--------------------Save and load models------------------------- +# Save model to binary local file +lgb.save(bst, "lightgbm.model") + +# Load binary model to R +bst2 <- lgb.load("lightgbm.model") +pred2 <- predict(bst2, test$data) + +# pred2 should be identical to pred +print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) + +#--------------------Advanced features --------------------------- +# To use advanced features, we need to put data in lgb.Dataset +dtrain <- lgb.Dataset(data = train$data, label = train$label, free_raw_data = FALSE) +dtest <- lgb.Dataset(data = test$data, label = test$label, free_raw_data = FALSE) + +#--------------------Using validation set------------------------- +# valids is a list of lgb.Dataset, each of them is tagged with name +valids <- list(train = dtrain, test = dtest) + +# To train with valids, use lgb.train, which contains more advanced features +# valids allows us to monitor the evaluation result on all data in the list +print("Train lightgbm using lgb.train with valids") +bst <- lgb.train(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + valids = valids, + nthread = 1, + objective = "binary") + +# We can change evaluation metrics, or use 
multiple evaluation metrics +print("Train lightgbm using lgb.train with valids, watch logloss and error") +bst <- lgb.train(data = dtrain, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + valids = valids, + eval = c("binary_error", "binary_logloss"), + nthread = 1, + objective = "binary") + +# lgb.Dataset can also be saved using lgb.Dataset.save +lgb.Dataset.save(dtrain, "dtrain.buffer") + +# To load it in, simply call lgb.Dataset +dtrain2 <- lgb.Dataset("dtrain.buffer") +bst <- lgb.train(data = dtrain2, + num_leaves = 4, + learning_rate = 1, + nrounds = 2, + valids = valids, + nthread = 1, + objective = "binary") + +# information can be extracted from lgb.Dataset using getinfo +label = getinfo(dtest, "label") +pred <- predict(bst, test$data) +err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) +print(paste("test-error=", err)) + +``` + diff --git a/R-package/vignettes/boost_from_prediction.Rmd b/R-package/vignettes/boost_from_prediction.Rmd new file mode 100644 index 000000000000..bee556edb3f4 --- /dev/null +++ b/R-package/vignettes/boost_from_prediction.Rmd @@ -0,0 +1,45 @@ +--- +title: "Boosting from existing prediction" +output: html_document +--- + +```{r} +require(lightgbm) +require(methods) + +# Load in the agaricus dataset +data(agaricus.train, package = "lightgbm") +data(agaricus.test, package = "lightgbm") +dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) +dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) + +valids <- list(eval = dtest, train = dtrain) +#--------------------Advanced features --------------------------- +# advanced: start from a initial base prediction +print("Start running example to start from a initial prediction") + +# Train lightgbm for 1 round +param <- list(num_leaves = 4, + learning_rate = 1, + nthread = 1, + objective = "binary") +bst <- lgb.train(param, dtrain, 1, valids = valids) + +# Note: we need the margin value instead of transformed prediction in set_init_score +ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE) +ptest <- predict(bst, agaricus.test$data, rawscore = TRUE) + +# set the init_score property of dtrain and dtest +# base margin is the base prediction we will boost from +setinfo(dtrain, "init_score", ptrain) +setinfo(dtest, "init_score", ptest) + +print("This is result of boost from initial prediction") +bst <- lgb.train(params = param, + data = dtrain, + nrounds = 5, + nthread = 1, + valids = valids) + +``` + diff --git a/R-package/vignettes/categorical_features_prepare.Rmd b/R-package/vignettes/categorical_features_prepare.Rmd new file mode 100644 index 000000000000..9968a9eea304 --- /dev/null +++ b/R-package/vignettes/categorical_features_prepare.Rmd @@ -0,0 +1,91 @@ +--- +title: "Categorical Feature Preparation" +output: html_document +--- + +```{r} +# Here we are going to try training a model with categorical features + +# Load libraries +library(data.table) +library(lightgbm) + +# Load data and look at the structure +# +# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables: +# $ age : int 30 33 35 30 59 35 36 39 41 43 ... +# $ job : chr "unemployed" "services" "management" "management" ... +# $ marital : chr "married" "married" "single" "married" ... +# $ education: chr "primary" "secondary" "tertiary" "tertiary" ... +# $ default : chr "no" "no" "no" "no" ... +# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ... +# $ housing : chr "no" "yes" "yes" "yes" ... +# $ loan : chr "no" "yes" "no" "yes" ... 
+# $ contact  : chr "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int 19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr "oct" "may" "apr" "jun" ...
+# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr "no" "no" "no" "no" ...
+data(bank, package = "lightgbm")
+str(bank)
+
+# We must now transform the data to fit in LightGBM
+# For this task, we use lgb.prepare
+# The function transforms the data into a fittable format
+#
+# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
+# $ age      : int 30 33 35 30 59 35 36 39 41 43 ...
+# $ job      : chr "unemployed" "services" "management" "management" ...
+# $ marital  : chr "married" "married" "single" "married" ...
+# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
+# $ default  : chr "no" "no" "no" "no" ...
+# $ balance  : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
+# $ housing  : chr "no" "yes" "yes" "yes" ...
+# $ loan     : chr "no" "yes" "no" "yes" ...
+# $ contact  : chr "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int 19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr "oct" "may" "apr" "jun" ...
+# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr "no" "no" "no" "no" ...
+bank <- lgb.prepare(data = bank)
+str(bank)
+
+# Subtract 1 from the label because it must be between 0 and 1
+bank$y <- bank$y - 1
+
+# Data input to LightGBM must be a matrix, without the label
+my_data <- as.matrix(bank[, 1:16, with = FALSE])
+
+# Creating the LightGBM dataset with categorical features
+# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
+lgb_data <- lgb.Dataset(data = my_data,
+                        label = bank$y,
+                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
+
+# We can now train a model
+model <- lgb.train(list(objective = "binary",
+                        metric = "l2",
+                        nthread = 1,
+                        min_data = 1,
+                        learning_rate = 0.1,
+                        min_hessian = 1,
+                        max_depth = 2),
+                   lgb_data,
+                   100,
+                   valids = list(train = lgb_data))
+
+# Try to find split_feature: 2
+# If you find it, it means it used a categorical feature in the first tree
+lgb.dump(model, num_iteration = 1)
+
+
+```
+
diff --git a/R-package/vignettes/categorical_features_rules.Rmd b/R-package/vignettes/categorical_features_rules.Rmd
new file mode 100644
index 000000000000..739feca799ef
--- /dev/null
+++ b/R-package/vignettes/categorical_features_rules.Rmd
@@ -0,0 +1,101 @@
+---
+title: "Categorical Feature Preparation with Rule"
+output: html_document
+---
+
+```{r}
+# Here we are going to try training a model with categorical features
+
+# Load libraries
+library(data.table)
+library(lightgbm)
+
+# Load data and look at the structure
+#
+# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
+# $ age      : int 30 33 35 30 59 35 36 39 41 43 ...
+# $ job      : chr "unemployed" "services" "management" "management" ...
+# $ marital  : chr "married" "married" "single" "married" ...
+# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
+# $ default  : chr "no" "no" "no" "no" ...
+# $ balance  : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
+# $ housing  : chr "no" "yes" "yes" "yes" ...
+# $ loan     : chr "no" "yes" "no" "yes" ...
+# $ contact  : chr "cellular" "cellular" "cellular" "unknown" ...
+# $ day      : int 19 11 16 3 5 23 14 6 14 17 ...
+# $ month    : chr "oct" "may" "apr" "jun" ...
+# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
+# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
+# $ pdays    : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
+# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
+# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
+# $ y        : chr "no" "no" "no" "no" ...
+data(bank, package = "lightgbm")
+str(bank)
+
+# We are dividing the dataset into two: one train, one validation
+bank_train <- bank[1:4000, ]
+bank_test <- bank[4001:4521, ]
+
+# We must now transform the data to fit in LightGBM
+# For this task, we use lgb.prepare_rules
+# The function transforms the data into a fittable format
+#
+# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
+# $ age      : int 53 36 58 26 34 55 55 34 41 38 ...
+# $ job      : num 1 10 10 9 10 2 2 3 3 4 ...
+# $ marital  : num 1 2 1 3 3 2 2 2 1 1 ...
+# $ education: num 2 2 2 2 2 1 2 3 2 2 ...
+# $ default  : num 1 1 1 1 1 1 1 1 1 1 ...
+# $ balance  : int 26 191 -123 -147 179 1086 471 105 1588 70 ...
+# $ housing  : num 2 1 1 1 1 2 2 2 2 1 ...
+# $ loan     : num 1 1 1 1 1 1 1 1 2 1 ...
+# $ contact  : num 1 1 1 3 1 1 3 3 3 1 ...
+# $ day      : int 7 31 5 4 19 6 30 28 20 27 ...
+# $ month    : num 9 2 2 7 2 9 9 9 7 11 ...
+# $ duration : int 56 69 131 95 294 146 58 249 10 255 ...
+# $ campaign : int 1 1 2 2 3 1 2 2 8 3 ...
+# $ pdays    : int 359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
+# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
+# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
+# $ y        : num 1 1 1 1 1 1 1 1 1 2 ...
+bank_rules <- lgb.prepare_rules(data = bank_train)
+bank_train <- bank_rules$data
+bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
+str(bank_test)
+
+# Subtract 1 from the labels because they must be between 0 and 1
+bank_train$y <- bank_train$y - 1
+bank_test$y <- bank_test$y - 1
+
+# Data input to LightGBM must be a matrix, without the label
+my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
+my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
+
+# Creating the LightGBM dataset with categorical features
+# The categorical features can be passed to lgb.train so you do not have to copy and paste a lot
+dtrain <- lgb.Dataset(data = my_data_train,
+                      label = bank_train$y)
+dtest <- lgb.Dataset(data = my_data_test,
+                     label = bank_test$y)
+
+# We can now train a model
+model <- lgb.train(list(objective = "binary",
+                        metric = "l2",
+                        nthread = 1,
+                        min_data = 1,
+                        learning_rate = 0.1,
+                        min_hessian = 1,
+                        max_depth = 2,
+                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)),
+                   dtrain,
+                   100,
+                   valids = list(train = dtrain, valid = dtest))
+
+# Try to find split_feature: 11
+# If you find it, it means it used a categorical feature in the first tree
+lgb.dump(model, num_iteration = 1)
+
+
+```
+
diff --git a/R-package/vignettes/cross_validation.Rmd b/R-package/vignettes/cross_validation.Rmd
new file mode 100644
index 000000000000..a0a91e2a640e
--- /dev/null
+++ b/R-package/vignettes/cross_validation.Rmd
@@ -0,0 +1,66 @@
+---
+title: "Cross Validation"
+output: html_document
+---
+
+```{r}
+require(lightgbm)
+# load in the agaricus dataset
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
+dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
+
+nrounds <- 2
+param <- list(num_leaves = 4,
+              learning_rate = 1,
+              nthread = 1,
+              objective = "binary")
+
+print("Running cross validation")
+# Do cross validation, this will print result out as
+# [iteration] metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+lgb.cv(param,
+       dtrain,
+       nrounds,
+       nfold = 5,
+       eval = "binary_error")
+
+print("Running cross validation, disable standard deviation display")
+# do cross validation, this will print result out as
+# [iteration] metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+lgb.cv(param,
+       dtrain,
+       nrounds,
+       nfold = 5,
+       eval = "binary_error",
+       showsd = FALSE)
+
+# You can also do cross validation with customized loss function
+print("Running cross validation, with customized loss function")
+
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1 / (1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
+  return(list(name = "error", value = err, higher_better = FALSE))
+}
+
+# train with customized objective
+lgb.cv(params = param,
+       data = dtrain,
+       nrounds = nrounds,
+       obj = logregobj,
+       eval = evalerror,
+       nfold = 5)
+
+```
+
diff --git a/R-package/vignettes/early_stopping.Rmd b/R-package/vignettes/early_stopping.Rmd
new file mode 100644
index 000000000000..5a169958a1a0
--- /dev/null
+++ b/R-package/vignettes/early_stopping.Rmd
@@ -0,0 +1,59 @@
+---
+title: "Early Stop in training"
+output: html_document
+---
+
+```{r}
+require(lightgbm)
+require(methods)
+
+# Load in the agaricus dataset
+data(agaricus.train, package = "lightgbm")
+data(agaricus.test, package = "lightgbm")
+
+dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
+dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
+
+# Note: for customized objective function, we leave objective as default
+# Note: what we are getting is margin value in prediction
+# You must know what you are doing
+param <- list(num_leaves = 4,
+              learning_rate = 1,
+              nthread = 1)
+valids <- list(eval = dtest)
+num_round <- 20
+
+# User-defined objective function: given prediction, return gradient and second-order gradient
+# This is loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  preds <- 1 / (1 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1 - preds)
+  return(list(grad = grad, hess = hess))
+}
+
+# User-defined evaluation function: returns a pair metric_name, result, higher_better
+# NOTE: when you do customized loss function, the default prediction value is margin
+# This may make the built-in evaluation metric not function properly
+# For example, we are doing logistic loss, the prediction is the score before logistic transformation
+# The built-in evaluation error assumes input is after logistic transformation
+# Keep this in mind when you use the customization, as you may need to write a customized evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0.5))) / length(labels)
+  return(list(name = "error", value = err, higher_better = FALSE))
+}
+print("Start training with early stopping setting")
+
+bst <- lgb.train(param,
+                 dtrain,
+                 num_round,
+                 valids,
+                 objective = logregobj,
+                 eval = evalerror,
+                 early_stopping_round = 3,
+                 nthread = 1)
+
+```
+
diff --git a/R-package/vignettes/efficient_many_training.Rmd b/R-package/vignettes/efficient_many_training.Rmd
new file mode 100644
index 000000000000..7f1ffb9b3211
--- /dev/null
+++ b/R-package/vignettes/efficient_many_training.Rmd
@@ -0,0 +1,48 @@
+---
+title: "Efficiency for Many Model Trainings"
+output: html_document
+---
+
+```{r}
+# Efficient training means training without giving up too much RAM
+# In the case of many trainings (like 100+ models), RAM will be eaten very quickly
+# Therefore, it is essential to know a strategy to deal with such an issue
+
+# More results can be found here: https://github.com/Microsoft/LightGBM/issues/879#issuecomment-326656580
+# Quote: "@Laurae2 Thanks for nice easily reproducible example (unlike mine).
+# With reset=FALSE you get after 500 iterations (not 1000): OS reports 27GB usage, while R gc() reports 1.5GB.
+# Just doing reset=TRUE will already improve things: OS reports 4.6GB.
+# Doing reset=TRUE and calling gc() in the loop will have OS 1.3GB. Thanks for the latest tip."
+
+# 2018-01-21 example patch: use the "small" switch to make the example bigger (set small to FALSE).
+
+small <- TRUE
+
+# Load library
+library(lightgbm)
+
+# Generate synthetic data of size 1M x 100 (10K x 100 when small = TRUE)
+set.seed(11111)
+x_data <- matrix(rnorm(n = ifelse(small, 1000000, 100000000), mean = 0, sd = 100), nrow = ifelse(small, 10000, 1000000), ncol = 100)
+y_data <- rnorm(n = ifelse(small, 10000, 1000000), mean = 0, sd = 5)
+
+# Create lgb.Dataset for training
+data <- lgb.Dataset(x_data, label = y_data)
+data$construct()
+
+# Loop through a training of 1000 models (100 when small = TRUE), please check your RAM on your task manager
+# It MUST remain constant (or increase only very slightly)
+gbm <- list()
+
+for (i in 1:(ifelse(small, 100, 1000))) {
+  cat(format(Sys.time(), "%a %b %d %Y %X"), ": ", i, "\n", sep = "")
+  gbm[[i]] <- lgb.train(params = list(objective = "regression",
+                                      nthread = 1),
+                        data = data,
+                        1,
+                        reset_data = TRUE)
+  gc(verbose = FALSE)
+}
+
+```
+
diff --git a/R-package/vignettes/leaf_stability.Rmd b/R-package/vignettes/leaf_stability.Rmd
new file mode 100644
index 000000000000..e53ce56315a3
--- /dev/null
+++ b/R-package/vignettes/leaf_stability.Rmd
@@ -0,0 +1,139 @@
+---
+title: "Leaf (in)Stability example"
+output: html_document
+---
+
+```{r}
+# We are going to look at how iterating too much might generate observation instability.
+# Obviously, we are in a controlled environment, without issues (real rules).
+# Do not do this in a real scenario.
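# [Editorial aside, not part of this patch] The code below quantifies "(in)stability" by
# averaging each observation's leaf index over all trees (predict with predleaf = TRUE),
# binning those averages into quantiles with .bincode(), and comparing each bin against the
# per-observation logloss; an unstable, overfitted model shows up as rough, spiky bin densities.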
+
+# First, we load our libraries
+library(lightgbm)
+library(ggplot2)
+
+# Second, we load our data
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
+
+# Third, we set up parameters and we train a model
+params <- list(objective = "regression", metric = "l2")
+valids <- list(test = dtest)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 0.1,
+                   bagging_fraction = 0.1,
+                   bagging_freq = 1,
+                   bagging_seed = 1,
+                   nthread = 1)
+
+# We create a data.frame with the following structure:
+# X = average leaf of the observation throughout all trees
+# Y = prediction probability (clamped to [1e-15, 1-1e-15])
+# Z = logloss
+# binned = binned quantile of average leaf
+new_data <- data.frame(X = rowMeans(predict(model,
+                                            agaricus.test$data,
+                                            predleaf = TRUE)),
+                       Y = pmin(pmax(predict(model,
+                                             agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
+new_data$binned <- .bincode(x = new_data$X,
+                            breaks = quantile(x = new_data$X,
+                                              probs = (1:9)/10),
+                            right = TRUE,
+                            include.lowest = TRUE)
+new_data$binned[is.na(new_data$binned)] <- 0
+new_data$binned <- as.factor(new_data$binned)
+
+# We can check the binned content
+table(new_data$binned)
+
+# We can plot the binned content
+# On the second plot, we clearly notice that the lower the bin (the lower the leaf value), the higher the loss
+# On the third plot, it is smooth!
+ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
+ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
+ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+
+
+# Now, let's show with other parameters
+model2 <- lgb.train(params,
+                    dtrain,
+                    100,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    nthread = 1)
+
+# We create the data structure, but for model2
+new_data2 <- data.frame(X = rowMeans(predict(model2,
+                                             agaricus.test$data,
+                                             predleaf = TRUE)),
+                        Y = pmin(pmax(predict(model2,
+                                              agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
+new_data2$binned <- .bincode(x = new_data2$X,
+                             breaks = quantile(x = new_data2$X,
+                                               probs = (1:9)/10),
+                             right = TRUE,
+                             include.lowest = TRUE)
+new_data2$binned[is.na(new_data2$binned)] <- 0
+new_data2$binned <- as.factor(new_data2$binned)
+
+# We can check the binned content
+table(new_data2$binned)
+
+# We can plot the binned content
+# On the second plot, we clearly notice that the lower the bin (the lower the leaf value), the higher the loss
+# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real, thus it is not an issue
+# However, if the rules were not true, the loss would explode.
+ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
+ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
+ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+
+
+# Now, try with very severe overfitting
+model3 <- lgb.train(params,
+                    dtrain,
+                    1000,
+                    valids,
+                    min_data = 1,
+                    learning_rate = 1,
+                    nthread = 1)
+
+# We create the data structure, but for model3
+new_data3 <- data.frame(X = rowMeans(predict(model3,
+                                             agaricus.test$data,
+                                             predleaf = TRUE)),
+                        Y = pmin(pmax(predict(model3,
+                                              agaricus.test$data), 1e-15), 1 - 1e-15))
+new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
+new_data3$binned <- .bincode(x = new_data3$X,
+                             breaks = quantile(x = new_data3$X,
+                                               probs = (1:9)/10),
+                             right = TRUE,
+                             include.lowest = TRUE)
+new_data3$binned[is.na(new_data3$binned)] <- 0
+new_data3$binned <- as.factor(new_data3$binned)
+
+# We can check the binned content
+table(new_data3$binned)
+
+# We can plot the binned content
+# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue.
+# However, if the rules were not true, the loss would explode. See the sudden spikes?
+ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+
+# Compare with our second model, the difference is severe. This is smooth.
+ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+
+```
+
diff --git a/R-package/vignettes/multiclass.Rmd b/R-package/vignettes/multiclass.Rmd
new file mode 100644
index 000000000000..bf66cc8fd798
--- /dev/null
+++ b/R-package/vignettes/multiclass.Rmd
@@ -0,0 +1,77 @@
+---
+title: "Multiclass training/prediction"
+output: html_document
+---
+
+```{r}
+require(lightgbm)
+
+# We load the default iris dataset shipped with R
+data(iris)
+
+# We must convert factors to numeric
+# They must be starting from number 0 to use multiclass
+# For instance: 0, 1, 2, 3, 4, 5...
+iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+
+# We cut the data set into 80% train and 20% validation
+# The 10 last samples of each class are for validation
+
+train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
+test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+valids <- list(test = dtest)
+
+# Method 1 of training
+params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
+model <- lgb.train(params,
+                   dtrain,
+                   100,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10,
+                   nthread = 1)
+
+# We can predict on test data, outputs a 90-length vector
+# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
+my_preds <- predict(model, test[, 1:4])
+
+# Method 2 of training, identical
+model <- lgb.train(list(),
+                   dtrain,
+                   100,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10,
+                   objective = "multiclass",
+                   metric = "multi_error",
+                   num_class = 3,
+                   nthread = 1)
+
+# We can predict on test data, identical
+my_preds <- predict(model, test[, 1:4])
+
+# A (30x3) matrix with the predictions, use parameter reshape
+# class1 class2 class3
+# obs1   obs1   obs1
+# obs2   obs2   obs2
+# ....   ....   ....
+my_preds <- predict(model, test[, 1:4], reshape = TRUE)
+
+# We can also get the predicted scores before the Sigmoid/Softmax application
+my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
+
+# Raw score predictions as matrix instead of vector
+my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+
+# We can also get the leaf index
+my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
+
+# Predict leaf index as matrix instead of vector
+my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
+
+```
+
diff --git a/R-package/vignettes/weight_param.Rmd b/R-package/vignettes/weight_param.Rmd
new file mode 100644
index 000000000000..c248e56f4c51
--- /dev/null
+++ b/R-package/vignettes/weight_param.Rmd
@@ -0,0 +1,110 @@
+---
+title: "Weight-Parameter adjustment relationship"
+output: html_document
+---
+
+```{r}
+# This demo R code is to provide a demonstration of hyperparameter adjustment
+# when scaling weights for appropriate learning
+# As with any optimizer, bad parameters can impair performance
+
+# Load library
+library(lightgbm)
+
+# We will train a model with the following scenarios:
+# - Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
+# - Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
+# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
+
+# Set up small weights
+weights1 <- rep(1/100000, 6513)
+weights2 <- rep(1/100000, 1611)
+
+# Load data and create datasets
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label, weight = weights1)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label, weight = weights2)
+valids <- list(test = dtest)
+
+# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
+# It cannot learn because regularization is too large!
+# min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 10,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(weight_loss) # Shows how poor the learning was: a straight line!
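# [Editorial aside, not part of this patch] The failure above is pure arithmetic: with the
# "regression" objective each observation contributes its weight to the hessian sum, so the
# total hessian mass available to any leaf is the sum of the weights, far below min_sum_hessian.
sum(weights1) # 6513 * (1 / 100000) = 0.06513, while min_sum_hessian = 10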
+
+# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
+# Adjusted regularization just consists of multiplying results by 1e4 (x10000)
+# Notice how it learns, there is no issue as we adjusted regularization ourselves
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 1e-4,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(small_weight_loss) # It learns!
+
+# Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
+# To make it better, we are first cleaning the environment and reloading LightGBM
+lgb.unloader(wipe = TRUE)
+
+# And now, we are doing as usual
+library(lightgbm)
+data(agaricus.train, package = "lightgbm")
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label = train$label)
+data(agaricus.test, package = "lightgbm")
+test <- agaricus.test
+dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
+valids <- list(test = dtest)
+
+# Set up parameters and run model...
+params <- list(objective = "regression",
+               metric = "l2",
+               device = "cpu",
+               min_sum_hessian = 10,
+               num_leaves = 7,
+               max_depth = 3,
+               nthread = 1)
+model <- lgb.train(params,
+                   dtrain,
+                   50,
+                   valids,
+                   min_data = 1,
+                   learning_rate = 1,
+                   early_stopping_rounds = 10)
+large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
+plot(large_weight_loss) # It learns!
+
+
+# Do you want to compare the learning? They both converge.
+plot(small_weight_loss, large_weight_loss)
+curve(1*x, from = 0, to = 0.02, add = TRUE)
+
+```
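A possible follow-up to the weight_param vignette (an editorial sketch, not part of the patch above; it assumes `weights1` and `weights2` from the code block just shown are in scope): rather than re-tuning `min_sum_hessian` whenever the weight scale changes, the weights themselves can be rescaled so that they sum to the number of observations, which keeps the default regularization scale meaningful.

```r
# Illustrative only: rescale the tiny weights so the total hessian budget matches the row count.
weights1_rescaled <- weights1 / sum(weights1) * length(weights1)  # now sums to 6513
weights2_rescaled <- weights2 / sum(weights2) * length(weights2)  # now sums to 1611
```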