Fix handling of the min_samples_leaf hyperparameter #35

Merged (11 commits, Nov 3, 2018)
pygbm/grower.py (2 changes: 1 addition & 1 deletion)

@@ -73,7 +73,7 @@ def __init__(self, features_data, all_gradients, all_hessians,
         self.splitting_context = SplittingContext(
             features_data.shape[1], features_data, n_bins,
             all_gradients, all_hessians, l2_regularization,
-            min_hessian_to_split, min_samples_leaf)
+            min_hessian_to_split, min_samples_leaf, min_gain_to_split)
         self.max_leaf_nodes = max_leaf_nodes
         self.max_depth = max_depth
         self.min_samples_leaf = min_samples_leaf
pygbm/splitting.py (17 changes: 10 additions & 7 deletions)

@@ -24,7 +24,7 @@
     ('histogram', typeof(HISTOGRAM_DTYPE)[:]),  # array of size n_bins
 ])
 class SplitInfo:
-    def __init__(self, gain=0, feature_idx=0, bin_idx=0,
+    def __init__(self, gain=-1., feature_idx=0, bin_idx=0,
                  gradient_left=0., hessian_left=0.,
                  gradient_right=0., hessian_right=0.,
                  n_samples_left=0, n_samples_right=0):
@@ -44,6 +44,7 @@ def __init__(self, gain=0, feature_idx=0, bin_idx=0,
     ('binned_features', uint8[::1, :]),
     ('n_bins', uint32),
     ('min_samples_leaf', optional(uint32)),
+    ('min_gain_to_split', float32),
     ('all_gradients', float32[::1]),
     ('all_hessians', float32[::1]),
     ('ordered_gradients', float32[::1]),
@@ -61,7 +62,8 @@ def __init__(self, gain=0, feature_idx=0, bin_idx=0,
 class SplittingContext:
     def __init__(self, n_features, binned_features, n_bins,
                  all_gradients, all_hessians, l2_regularization,
-                 min_hessian_to_split=1e-3, min_samples_leaf=None):
+                 min_hessian_to_split=1e-3, min_samples_leaf=None,
+                 min_gain_to_split=0.):
         self.n_features = n_features
         self.binned_features = binned_features
         self.n_bins = n_bins
@@ -76,6 +78,7 @@ def __init__(self, n_features, binned_features, n_bins,
         self.l2_regularization = l2_regularization
         self.min_hessian_to_split = min_hessian_to_split
         self.min_samples_leaf = min_samples_leaf
+        self.min_gain_to_split = min_gain_to_split
Owner Author: You should raise a ValueError if the user passes min_gain_to_split < 0.
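A minimal sketch of the suggested check, assuming it lives at the top of SplittingContext.__init__ (the parameter name matches this diff; the exact placement is hypothetical):

# Reject invalid thresholds early rather than silently never splitting.
if min_gain_to_split < 0:
    raise ValueError('min_gain_to_split=%r must be positive or zero.'
                     % min_gain_to_split)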

Collaborator: There's already a check in the grower. By the way, min_gain_to_split is only a parameter of the grower, not of GradientBoostingMachine. We should move it there, right?

Owner Author: Right, this is fine for now. No need to expose it to the public API at this time.

         if self.constant_hessian:
             self.constant_hessian_value = self.all_hessians[0]  # 1 scalar
         else:
@@ -261,7 +264,7 @@ def find_node_split(context, sample_indices):

     # Pre-allocate the results datastructure to be able to use prange:
     # numba jitclass do not seem to properly support default values for kwargs.
-    split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
+    split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                    for i in range(context.n_features)]
     for feature_idx in prange(context.n_features):
         split_info = _find_histogram_split(context, feature_idx,
@@ -302,7 +305,7 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms,
         sibling_histograms[0]['sum_hessians'].sum())

     # Pre-allocate the results datastructure to be able to use prange
-    split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
+    split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                    for i in range(context.n_features)]
Owner Author: This data structure could probably also be stored as an attribute on the context, to avoid reallocating it over and over again.
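A rough sketch of that idea, assuming numba's jitclass can hold a list of SplitInfo instances as a member (an untested assumption; lists of jitclass instances as attributes have historically been fragile in numba):

# In SplittingContext.__init__, allocate the buffer once:
self.split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                    for _ in range(n_features)]
# find_node_split / find_node_split_subtraction would then reset and
# reuse context.split_infos instead of rebuilding the list per call.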

     for feature_idx in prange(context.n_features):
         split_info = _find_histogram_split_subtraction(
@@ -400,7 +403,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
             continue
         if n_samples_right < context.min_samples_leaf:
             # won't get any better
-            continue
+            break

         if context.constant_hessian:
             hessian_left += (histogram[bin_idx]['count']
@@ -412,7 +415,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
         hessian_right = context.sum_hessians - hessian_left
         if hessian_right < context.min_hessian_to_split:
             # won't get any better
Owner Author: Maybe add a comment to say that the loss functions are all convex and therefore the hessians are positive.
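A possible wording for that comment, spelling out the monotonicity argument it relies on:

# All losses used here are convex, so the per-sample hessians are
# non-negative. hessian_left can therefore only grow as bin_idx
# increases, and hessian_right = sum_hessians - hessian_left can only
# shrink: once it falls below min_hessian_to_split, no later bin can
# satisfy the constraint, hence `break` rather than `continue`.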

-            continue
+            break

         gradient_left += histogram[bin_idx]['sum_gradients']
         gradient_right = context.sum_gradients - gradient_left
@@ -421,7 +424,7 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
             context.sum_gradients, context.sum_hessians,
             context.l2_regularization)

-        if gain > best_split.gain:
+        if gain > best_split.gain and gain > context.min_gain_to_split:
Owner Author: We should probably have gain >= context.min_gain_to_split.

Owner Author: Yes, this is the cause of the bad trees.

Owner Author: Oops, I spoke too quickly: I had set min_samples_leaf=1 in test_predictor.py and forgot about it.
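For context, a sketch of how the sentinel and the threshold interact in the merged condition (illustrative only, not part of the change):

if gain > best_split.gain and gain > context.min_gain_to_split:
    # best_split.gain starts at the sentinel -1., so the first
    # candidate always passes the left-hand test; whether any split
    # is accepted at all is decided by the threshold on the right.
    # With the suggested `>=`, a split whose gain is exactly
    # min_gain_to_split (0. by default) would also be accepted.
    ...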

             best_split.gain = gain
             best_split.feature_idx = feature_idx
             best_split.bin_idx = bin_idx
tests/test_compare_lightgbm.py (4 changes: 2 additions & 2 deletions)

@@ -59,9 +59,9 @@ def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):

     pred_lgbm = est_lightgbm.predict(X_train)
     pred_pygbm = est_pygbm.predict(X_train)
-    np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=5)
+    np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)

     if max_leaf_nodes < 10 and n_samples > 1000:
         pred_lgbm = est_lightgbm.predict(X_test)
         pred_pygbm = est_pygbm.predict(X_test)
-        np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=5)
+        np.testing.assert_array_almost_equal(pred_lgbm, pred_pygbm, decimal=3)
tests/test_predictor.py (17 changes: 16 additions & 1 deletion)

@@ -20,9 +20,24 @@ def test_boston_dataset():
     gradients = y_train.astype(np.float32)
     hessians = np.ones(1, dtype=np.float32)

+    min_samples_leaf = 5
+    max_leaf_nodes = 31
     grower = TreeGrower(X_train_binned, gradients, hessians,
-                        min_samples_leaf=7, max_leaf_nodes=31)
+                        min_samples_leaf=min_samples_leaf,
+                        max_leaf_nodes=max_leaf_nodes)
     grower.grow()

+    import pytest
+    lb = pytest.importorskip("lightgbm")
+    est_lightgbm = lb.LGBMRegressor(n_estimators=1,
+                                    min_data_in_bin=1,
+                                    learning_rate=1,
+                                    min_data_in_leaf=min_samples_leaf,
+                                    num_leaves=max_leaf_nodes)
+    est_lightgbm.fit(X_train_binned, y_train)
+
+    from pygbm.plotting import plot_tree
+    plot_tree(grower, est_lightgbm)
Owner Author: Please do not put such plots in the regular tests. Use the examples/ folder for visual debugging instead.

Owner Author: Also, if you want to compare lightgbm and pygbm on the Boston dataset, please add a new test in test_compare_lightgbm.py instead.

Collaborator: Sure, this isn't meant to stay; I just left it in so you can reproduce my plots if you want to.
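A rough sketch of what such a test might look like in tests/test_compare_lightgbm.py. The estimator name GradientBoostingMachine comes from the discussion above, but its constructor parameters (max_iter, learning_rate, min_samples_leaf, max_leaf_nodes) are assumptions mirroring the grower options touched by this PR:

import numpy as np
import pytest
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split


def test_same_predictions_boston():
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm import GradientBoostingMachine  # assumed import path

    X, y = load_boston(return_X_y=True)
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    est_lightgbm = lightgbm.LGBMRegressor(
        n_estimators=1, learning_rate=1, min_data_in_bin=1,
        min_data_in_leaf=5, num_leaves=31)
    est_pygbm = GradientBoostingMachine(
        max_iter=1, learning_rate=1,
        min_samples_leaf=5, max_leaf_nodes=31)  # assumed parameters
    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)

    # Same tolerance as the existing easy-target test after this PR.
    np.testing.assert_array_almost_equal(est_lightgbm.predict(X_train),
                                         est_pygbm.predict(X_train),
                                         decimal=3)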


     predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

     assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.65
tests/test_splitting.py (15 changes: 10 additions & 5 deletions)

@@ -16,6 +16,7 @@ def test_histogram_split(n_bins):
     l2_regularization = 0
     min_hessian_to_split = 1e-3
     min_samples_leaf = None
+    min_gain_to_split = 0.
     binned_features = np.asfortranarray(
         rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=np.uint8)
     binned_feature = binned_features.T[feature_idx]
@@ -34,7 +35,8 @@ def test_histogram_split(n_bins):
                                binned_features, n_bins,
                                all_gradients, all_hessians,
                                l2_regularization,
-                               min_hessian_to_split, min_samples_leaf)
+                               min_hessian_to_split,
+                               min_samples_leaf, min_gain_to_split)

     split_info = _find_histogram_split(context, feature_idx,
                                        sample_indices)
@@ -62,6 +64,7 @@ def test_split_vs_split_subtraction(constant_hessian):
     l2_regularization = 0.
     min_hessian_to_split = 1e-3
     min_samples_leaf = None
+    min_gain_to_split = 0.

     binned_features = rng.randint(0, n_bins, size=(n_samples, n_features),
                                   dtype=np.uint8)
@@ -76,7 +79,7 @@ def test_split_vs_split_subtraction(constant_hessian):
     context = SplittingContext(n_features, binned_features, n_bins,
                                all_gradients, all_hessians,
                                l2_regularization, min_hessian_to_split,
-                               min_samples_leaf)
+                               min_samples_leaf, min_gain_to_split)

     mask = rng.randint(0, 2, n_samples).astype(np.bool)
     sample_indices_left = sample_indices[mask]
@@ -137,7 +140,8 @@ def test_gradient_and_hessian_sanity(constant_hessian):
     n_samples = 500
     l2_regularization = 0.
     min_hessian_to_split = 1e-3
-    min_samples_leaf = None
+    min_samples_leaf = 1
+    min_gain_to_split = 0.

     binned_features = rng.randint(0, n_bins, size=(n_samples, n_features),
                                   dtype=np.uint8)
@@ -152,7 +156,7 @@ def test_gradient_and_hessian_sanity(constant_hessian):
     context = SplittingContext(n_features, binned_features, n_bins,
                                all_gradients, all_hessians,
                                l2_regularization, min_hessian_to_split,
-                               min_samples_leaf)
+                               min_samples_leaf, min_gain_to_split)

     mask = rng.randint(0, 2, n_samples).astype(np.bool)
     sample_indices_left = sample_indices[mask]
@@ -225,6 +229,7 @@ def test_split_indices():
     l2_regularization = 0.
     min_hessian_to_split = 1e-3
     min_samples_leaf = None
+    min_gain_to_split = 0.

     # split will happen on feature 1 and on bin 3
     binned_features = [[0, 0],
@@ -245,7 +250,7 @@ def test_split_indices():
     context = SplittingContext(n_features, binned_features, n_bins,
                                all_gradients, all_hessians,
                                l2_regularization, min_hessian_to_split,
-                               min_samples_leaf)
+                               min_samples_leaf, min_gain_to_split)

     assert_array_almost_equal(sample_indices, context.partition)
     si_root, _ = find_node_split(context, sample_indices)