Fix handling of the min_samples_leaf hyperparameter #35
@@ -24,7 +24,7 @@
     ('histogram', typeof(HISTOGRAM_DTYPE)[:]),  # array of size n_bins
 ])
 class SplitInfo:
-    def __init__(self, gain=0, feature_idx=0, bin_idx=0,
+    def __init__(self, gain=-1., feature_idx=0, bin_idx=0,
                  gradient_left=0., hessian_left=0.,
                  gradient_right=0., hessian_right=0.,
                  n_samples_left=0, n_samples_right=0):
@@ -44,6 +44,7 @@ def __init__(self, gain=0, feature_idx=0, bin_idx=0,
     ('binned_features', uint8[::1, :]),
     ('n_bins', uint32),
     ('min_samples_leaf', optional(uint32)),
+    ('min_gain_to_split', float32),
     ('all_gradients', float32[::1]),
     ('all_hessians', float32[::1]),
     ('ordered_gradients', float32[::1]),
@@ -61,7 +62,8 @@ def __init__(self, gain=0, feature_idx=0, bin_idx=0,
 class SplittingContext:
     def __init__(self, n_features, binned_features, n_bins,
                  all_gradients, all_hessians, l2_regularization,
-                 min_hessian_to_split=1e-3, min_samples_leaf=None):
+                 min_hessian_to_split=1e-3, min_samples_leaf=None,
+                 min_gain_to_split=0.):
         self.n_features = n_features
         self.binned_features = binned_features
         self.n_bins = n_bins
@@ -76,6 +78,7 @@ def __init__(self, n_features, binned_features, n_bins,
         self.l2_regularization = l2_regularization
         self.min_hessian_to_split = min_hessian_to_split
         self.min_samples_leaf = min_samples_leaf
+        self.min_gain_to_split = min_gain_to_split
         if self.constant_hessian:
             self.constant_hessian_value = self.all_hessians[0]  # 1 scalar
         else:
@@ -261,7 +264,7 @@ def find_node_split(context, sample_indices):

     # Pre-allocate the results datastructure to be able to use prange:
     # numba jitclass do not seem to properly support default values for kwargs.
-    split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
+    split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                    for i in range(context.n_features)]
     for feature_idx in prange(context.n_features):
         split_info = _find_histogram_split(context, feature_idx,
@@ -302,7 +305,7 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms,
         sibling_histograms[0]['sum_hessians'].sum())

     # Pre-allocate the results datastructure to be able to use prange
-    split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
+    split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                    for i in range(context.n_features)]
     for feature_idx in prange(context.n_features):
         split_info = _find_histogram_split_subtraction(

Review comment: This data structure could probably also be stored as an attribute on the context to avoid reallocating it over and over again.
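Following up on that suggestion, here is a minimal, self-contained sketch (plain Python, outside numba; the split_infos attribute name is an assumption, not part of this PR) of what pre-allocating the per-feature results on the context could look like:

class SplitInfo:
    def __init__(self, gain=-1., feature_idx=0, bin_idx=0):
        self.gain = gain
        self.feature_idx = feature_idx
        self.bin_idx = bin_idx

class SplittingContext:
    def __init__(self, n_features):
        self.n_features = n_features
        # Allocated once here instead of once per call to find_node_split().
        self.split_infos = [SplitInfo() for _ in range(n_features)]

def find_node_split(context):
    # Overwrite the pre-allocated entries in place, then pick the best one.
    for feature_idx in range(context.n_features):
        info = context.split_infos[feature_idx]
        info.gain = -1.  # reset before the per-feature histogram scan
        info.feature_idx = feature_idx
    return max(context.split_infos, key=lambda split_info: split_info.gain)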
@@ -400,7 +403,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
             continue
         if n_samples_right < context.min_samples_leaf:
             # won't get any better
-            continue
+            break

         if context.constant_hessian:
             hessian_left += (histogram[bin_idx]['count']
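For context, the reason break is correct here (a simplified, standalone illustration, not the actual pygbm helper): bins are scanned from left to right, so samples only ever move from the right child to the left child and n_samples_right is non-increasing in bin_idx. Once it drops below min_samples_leaf, no later bin can satisfy the constraint either:

def first_bin_with_too_small_right_child(bin_counts, min_samples_leaf):
    # bin_counts plays the role of histogram[bin_idx]['count'].
    n_samples_right = sum(bin_counts)
    for bin_idx, count in enumerate(bin_counts):
        n_samples_right -= count  # samples move to the left child
        if n_samples_right < min_samples_leaf:
            return bin_idx        # every later bin_idx is invalid too
    return None

# With counts [3, 2, 4, 1], the right child holds 7, 5, 1, 0 samples in turn:
assert first_bin_with_too_small_right_child([3, 2, 4, 1], min_samples_leaf=2) == 2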
@@ -412,7 +415,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
         hessian_right = context.sum_hessians - hessian_left
         if hessian_right < context.min_hessian_to_split:
             # won't get any better
-            continue
+            break

         gradient_left += histogram[bin_idx]['sum_gradients']
         gradient_right = context.sum_gradients - gradient_left

Review comment: Maybe add a comment to say that the loss functions are all convex and therefore the hessians are positive.
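The convexity argument the reviewer mentions can be checked directly. Here is a small standalone illustration (not pygbm code): with non-negative per-sample hessians, hessian_right can only shrink as the scan moves right, which is what makes the break safe:

import numpy as np

# Non-negative hessians, as produced by convex losses (least squares, logistic).
rng = np.random.RandomState(0)
hessians = np.abs(rng.randn(100)).astype(np.float32)
sum_hessians = hessians.sum()

hessian_left = 0.0
previous_right = np.inf
for h in hessians:  # stands in for the per-bin 'sum_hessians' histogram entries
    hessian_left += h
    hessian_right = sum_hessians - hessian_left
    assert hessian_right <= previous_right  # monotonically non-increasing
    previous_right = hessian_right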
@@ -421,7 +424,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
                            context.sum_gradients, context.sum_hessians,
                            context.l2_regularization)

-        if gain > best_split.gain:
+        if gain > best_split.gain and gain > context.min_gain_to_split:
             best_split.gain = gain
             best_split.feature_idx = feature_idx
             best_split.bin_idx = bin_idx

Review comment: We should probably have …
Reply: Yes, this is the cause of the bad trees.
Reply: Oops, I spoke too quickly, I had set …
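To make the interaction between the new gain=-1. default and min_gain_to_split explicit, here is a simplified, standalone sketch of the selection rule (illustrative only, not the numba implementation): with the best gain starting at -1., a candidate split must both beat the current best and strictly exceed min_gain_to_split to be kept, so a node whose candidates are all at or below the threshold produces no split at all.

def select_best_split(candidate_gains, min_gain_to_split=0.):
    best_gain = -1.  # matches the new SplitInfo default
    best_idx = None
    for idx, gain in enumerate(candidate_gains):
        if gain > best_gain and gain > min_gain_to_split:
            best_gain = gain
            best_idx = idx
    return best_idx, best_gain

# All candidates at or below the threshold: nothing is selected.
assert select_best_split([0., -0.5, 0.]) == (None, -1.)
# Otherwise the largest qualifying gain wins.
assert select_best_split([0.2, 1.5, 0.7]) == (1, 1.5)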
@@ -20,9 +20,24 @@ def test_boston_dataset():
     gradients = y_train.astype(np.float32)
     hessians = np.ones(1, dtype=np.float32)

+    min_samples_leaf = 5
+    max_leaf_nodes = 31
     grower = TreeGrower(X_train_binned, gradients, hessians,
-                        min_samples_leaf=7, max_leaf_nodes=31)
+                        min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes)
     grower.grow()

+    import pytest
+    lb = pytest.importorskip("lightgbm")
+    est_lightgbm = lb.LGBMRegressor(n_estimators=1,
+                                    min_data_in_bin=1,
+                                    learning_rate=1,
+                                    min_data_in_leaf=min_samples_leaf,
+                                    num_leaves=max_leaf_nodes)
+    est_lightgbm.fit(X_train_binned, y_train)
+
+    from pygbm.plotting import plot_tree
+    plot_tree(grower, est_lightgbm)
+
     predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

     assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.65

Review comment: Please do not put such plots in the regular tests. Use the …
Review comment: Also, if you want to compare lightgbm and pygbm on the Boston dataset, please add a new test in …
Reply: Sure, this is not meant to stay, I just left it so you can reproduce my plots if you wanted to.
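Along the lines of the reviewer's suggestion, a hedged sketch of what a separate lightgbm-vs-pygbm comparison test could look like (the test name, the BinMapper import path, and any fitting details beyond what this diff shows are assumptions, not part of the PR):

import numpy as np
import pytest
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score


def test_compare_lightgbm_boston():
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.binning import BinMapper   # assumed import path
    from pygbm.grower import TreeGrower   # assumed import path

    min_samples_leaf, max_leaf_nodes = 5, 31
    X, y = load_boston(return_X_y=True)
    mapper = BinMapper()
    X_binned = mapper.fit_transform(X)

    # Grow a single pygbm tree on the raw targets used as gradients.
    gradients = y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)
    grower = TreeGrower(X_binned, gradients, hessians,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes)
    grower.grow()
    predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

    # Fit a single lightgbm tree with matching constraints.
    est_lightgbm = lightgbm.LGBMRegressor(n_estimators=1,
                                          min_data_in_bin=1,
                                          learning_rate=1,
                                          min_data_in_leaf=min_samples_leaf,
                                          num_leaves=max_leaf_nodes)
    est_lightgbm.fit(X_binned, y)

    # Both single trees should reach a comparable fit on the training data.
    assert r2_score(y, predictor.predict_binned(X_binned)) > 0.65
    assert r2_score(y, est_lightgbm.predict(X_binned)) > 0.65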
Review comment: You should raise a ValueError if the user passes min_gain_to_split < 0.

Reply: There's already a check in the grower. BTW min_gain_to_split is only a parameter of the grower, not of GradientBoostingMachine. We should move it there, right?

Reply: Right, this is fine for now. No need to expose it to the public API at this time.
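For reference, a minimal sketch of the kind of validation being discussed (the function name and exact error messages are illustrative; the real check lives in the grower and is not shown in this diff):

def _validate_parameters(min_gain_to_split, min_samples_leaf):
    # Reject bad thresholds up front rather than silently growing bad trees.
    if min_gain_to_split < 0:
        raise ValueError(
            'min_gain_to_split={} must be non-negative'.format(min_gain_to_split))
    if min_samples_leaf is not None and min_samples_leaf < 1:
        raise ValueError(
            'min_samples_leaf={} must be at least 1'.format(min_samples_leaf))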