forked from microsoft/LightGBM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
basic_walkthrough.R
143 lines (123 loc) · 5.33 KB
/
basic_walkthrough.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
require(lightgbm)
require(methods)
# We load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
train <- agaricus.train
test <- agaricus.test
# The loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)
#--------------------Basic Training using lightgbm----------------
# This is the basic usage of lightgbm you can put matrix in data field
# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input
# Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
print("Training lightgbm with sparseMatrix")
bst <- lightgbm(data = train$data,
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
# Alternatively, you can put in dense matrix, i.e. basic R-matrix
print("Training lightgbm with Matrix")
bst <- lightgbm(data = as.matrix(train$data),
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
# You can also put in lgb.Dataset object, which stores label, data and other meta datas needed for advanced features
print("Training lightgbm with lgb.Dataset")
dtrain <- lgb.Dataset(data = train$data,
label = train$label)
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
# Verbose = 0,1,2
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary",
verbose = 0)
print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 1)
print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 2)
# You can also specify data as file path to a LibSVM/TCV/CSV format input
# Since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
#--------------------Basic prediction using lightgbm--------------
# You can do prediction using the following line
# You can put in Matrix, sparseMatrix, or lgb.Dataset
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#--------------------Save and load models-------------------------
# Save model to binary local file
lgb.save(bst, "lightgbm.model")
# Load binary model to R
bst2 <- lgb.load("lightgbm.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))
#--------------------Advanced features ---------------------------
# To use advanced features, we need to put data in lgb.Dataset
dtrain <- lgb.Dataset(data = train$data, label = train$label, free_raw_data = FALSE)
dtest <- lgb.Dataset(data = test$data, label = test$label, free_raw_data = FALSE)
#--------------------Using validation set-------------------------
# valids is a list of lgb.Dataset, each of them is tagged with name
valids <- list(train = dtrain, test = dtest)
# To train with valids, use lgb.train, which contains more advanced features
# valids allows us to monitor the evaluation result on all data in the list
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
# We can change evaluation metrics, or use multiple evaluation metrics
print("Train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
eval = c("binary_error", "binary_logloss"),
nthread = 2,
objective = "binary")
# lgb.Dataset can also be saved using lgb.Dataset.save
lgb.Dataset.save(dtrain, "dtrain.buffer")
# To load it in, simply call lgb.Dataset
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(data = dtrain2,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
# information can be extracted from lgb.Dataset using getinfo
label = getinfo(dtest, "label")
pred <- predict(bst, test$data)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))