
Fix handling of the min_samples_leaf hyperparameter #35

Merged: 11 commits, Nov 3, 2018
29 changes: 17 additions & 12 deletions pygbm/grower.py
@@ -110,6 +110,12 @@ def _intilialize_root(self):
if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1):
self._finalize_leaf(self.root)
return
if (self.min_samples_leaf is not None
and self.root.n_samples < self.min_samples_leaf):
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
Review comment (Owner, Author):

It would be great to add a test for this case in test_grower.py.
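
A minimal sketch of what that test could look like, reusing the BinMapper + TreeGrower pattern from the test_min_samples_leaf test added below; the finalized_leaves attribute is an assumption about TreeGrower's internals, not something this diff confirms:

    # Hedged sketch: assumes TreeGrower collects the leaves finalized by
    # _finalize_leaf in a `finalized_leaves` list (hypothetical name).
    def test_min_samples_leaf_root():
        rng = np.random.RandomState(seed=0)
        min_samples_leaf = 10
        # Fewer samples than min_samples_leaf: the root cannot be split.
        X = rng.normal(size=(min_samples_leaf - 1, 3))
        y = X[:, 0] - X[:, 1]
        X = BinMapper(max_bins=255).fit_transform(X)
        all_gradients = y.astype(np.float32)
        all_hessians = np.ones(shape=1, dtype=np.float32)
        grower = TreeGrower(X, all_gradients, all_hessians, n_bins=255,
                            min_samples_leaf=min_samples_leaf)
        grower.grow()
        # The root should be finalized without computing any split stats.
        assert len(grower.finalized_leaves) == 1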


self._compute_spittability(self.root)

def _compute_spittability(self, node, only_hist=False):
@@ -121,7 +127,6 @@ def _compute_spittability(self, node, only_hist=False):
the node. If _compute_spittability is called again by the grower on
this same node, the histograms won't be computed again.
"""

# Compute split_info and histograms if not already done
if node.split_info is None and node.histograms is None:
# If the sibling has less samples, compute its hist first (with
@@ -151,11 +156,19 @@ def _compute_spittability(self, node, only_hist=False):

if only_hist:
# _compute_spittability was called by a sibling. We only needed to
# compute the histogram
# compute the histogram.
return

if node.split_info.gain < self.min_gain_to_split:
self._finalize_leaf(node)

elif (self.min_samples_leaf is not None and (
node.split_info.n_samples_left < self.min_samples_leaf
or node.split_info.n_samples_right < self.min_samples_leaf)):
# Reject the split as it would result in at least one leaf that
# would be too small.
self._finalize_leaf(node)

else:
heappush(self.splittable_nodes, node)

@@ -208,16 +221,8 @@ def split_next(self):
self._finalize_splittable_nodes()

else:
if (self.min_samples_leaf is not None
and len(sample_indices_left) < self.min_samples_leaf):
self._finalize_leaf(left_child_node)
else:
self._compute_spittability(left_child_node)
if (self.min_samples_leaf is not None
and len(sample_indices_right) < self.min_samples_leaf):
self._finalize_leaf(right_child_node)
else:
self._compute_spittability(right_child_node)
self._compute_spittability(left_child_node)
self._compute_spittability(right_child_node)
return left_child_node, right_child_node

def can_split_further(self):
47 changes: 29 additions & 18 deletions pygbm/splitting.py
@@ -18,19 +18,24 @@
('hessian_left', float32),
('gradient_right', float32),
('hessian_right', float32),
('n_samples_left', uint32),
('n_samples_right', uint32),
('histogram', typeof(HISTOGRAM_DTYPE)[:]), # array of size n_bins
])
class SplitInfo:
def __init__(self, gain=0, feature_idx=0, bin_idx=0,
gradient_left=0., hessian_left=0.,
gradient_right=0., hessian_right=0.):
gradient_right=0., hessian_right=0.,
n_samples_left=0, n_samples_right=0):
self.gain = gain
self.feature_idx = feature_idx
self.bin_idx = bin_idx
self.gradient_left = gradient_left
self.hessian_left = hessian_left
self.gradient_right = gradient_right
self.hessian_right = hessian_right
self.n_samples_left = n_samples_left
self.n_samples_right = n_samples_right


@jitclass([
@@ -236,17 +241,16 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms,
# be to compute an average but it's probably not worth it.
sum_gradients = (parent_histograms[0]['sum_gradients'].sum() -
sibling_histograms[0]['sum_gradients'].sum())

n_samples = sample_indices.shape[0]
if context.constant_hessian:
n_samples = sample_indices.shape[0]
sum_hessians = context.constant_hessian_value * float32(n_samples)
else:
sum_hessians = (parent_histograms[0]['sum_hessians'].sum() -
sibling_histograms[0]['sum_hessians'].sum())

return _parallel_find_split_subtraction(
context, parent_histograms, sibling_histograms,
sum_gradients, sum_hessians)
sum_gradients, sum_hessians, n_samples)


@njit
@@ -280,7 +284,7 @@ def _parallel_find_split(splitter, sample_indices, ordered_gradients,
"""
# Pre-allocate the results datastructure to be able to use prange:
# numba jitclass do not seem to properly support default values for kwargs.
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0.)
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(splitter.n_features)]
for feature_idx in prange(splitter.n_features):
split_info = _find_histogram_split(
@@ -295,7 +299,7 @@
@njit(parallel=True)
def _parallel_find_split_subtraction(context, parent_histograms,
sibling_histograms,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""For each feature, find the best bin to split by histogram substraction

This in turn calls _find_histogram_split_subtraction that does not need
@@ -307,12 +311,12 @@ def _parallel_find_split_subtraction(context, parent_histograms,
histograms by subtraction.
"""
# Pre-allocate the results datastructure to be able to use prange
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0.)
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(context.n_features)]
Review comment (Owner, Author):

This data structure could probably be also stored as an attribute on the context to avoid reallocating it over and over again.
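
A sketch of that idea in plain Python (names are hypothetical, and whether numba's jitclass can hold a list of SplitInfo instances as a spec-declared attribute would need to be checked):

    import numpy as np

    class ToySplittingContext:
        """Toy stand-in for SplittingContext: the per-feature results
        buffer is allocated once here instead of once per call."""
        def __init__(self, n_features):
            self.n_features = n_features
            # Reused across find-split calls; entries are overwritten.
            self.split_gains = np.zeros(n_features, dtype=np.float32)

    def find_best_feature(context, rng):
        # The hot loop writes into the preallocated buffer: no allocation.
        for feature_idx in range(context.n_features):
            context.split_gains[feature_idx] = rng.uniform()
        return int(np.argmax(context.split_gains))

    context = ToySplittingContext(n_features=8)
    best = find_best_feature(context, np.random.RandomState(0))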

for feature_idx in prange(context.n_features):
split_info = _find_histogram_split_subtraction(
context, feature_idx, parent_histograms, sibling_histograms,
sum_gradients, sum_hessians)
sum_gradients, sum_hessians, n_samples)
split_infos[feature_idx] = split_info

return _find_best_feature_to_split_helper(
@@ -324,8 +328,9 @@ def _find_histogram_split(context, feature_idx, sample_indices,
ordered_gradients, ordered_hessians,
sum_gradients, sum_hessians):
"""Compute the histogram for a given feature and return the best bin."""
n_samples = sample_indices.shape[0]
binned_feature = context.binned_features.T[feature_idx]
root_node = binned_feature.shape[0] == sample_indices.shape[0]
root_node = binned_feature.shape[0] == n_samples

if root_node:
if context.constant_hessian:
@@ -346,13 +351,14 @@
ordered_gradients, ordered_hessians)

return _find_best_bin_to_split_helper(
context, feature_idx, histogram, sum_gradients, sum_hessians)
context, feature_idx, histogram, sum_gradients, sum_hessians,
sample_indices.shape[0])


@njit(fastmath=True)
def _find_histogram_split_subtraction(context, feature_idx,
parent_histograms, sibling_histograms,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""Compute the histogram by substraction of parent and sibling

Uses the identity: hist(parent) = hist(left) + hist(right)
@@ -362,26 +368,29 @@ def _find_histogram_split_subtraction(context, feature_idx,
sibling_histograms[feature_idx])

return _find_best_bin_to_split_helper(
context, feature_idx, histogram, sum_gradients, sum_hessians)
context, feature_idx, histogram, sum_gradients, sum_hessians,
n_samples)
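
For intuition, the identity in the docstring above means the second child's histogram is never rebuilt by scanning samples; a standalone sketch with toy numpy arrays (pygbm actually stores structured HISTOGRAM_DTYPE arrays with 'sum_gradients', 'sum_hessians' and 'count' fields):

    import numpy as np

    # Toy per-bin gradient sums for a parent node and its left child.
    hist_parent = np.array([3.0, 1.0, 4.0, 1.5], dtype=np.float32)
    hist_left = np.array([1.0, 0.5, 2.0, 0.0], dtype=np.float32)

    # hist(parent) = hist(left) + hist(right): the right child's histogram
    # is a per-bin subtraction, O(n_bins) rather than O(n_samples).
    hist_right = hist_parent - hist_left
    assert np.allclose(hist_left + hist_right, hist_parent)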


@njit(locals={'gradient_left': float32, 'hessian_left': float32},
@njit(locals={'gradient_left': float32, 'hessian_left': float32,
'n_samples_left': uint32},
fastmath=True)
def _find_best_bin_to_split_helper(context, feature_idx, histogram,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""Find best bin to split on and return the corresponding SplitInfo"""
# Allocate the structure for the best split information. It can be
# returned as such (with a negative gain) if the min_hessian_to_split
# condition is not satisfied. Such invalid splits are later discarded by
# the TreeGrower.
best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0.)
best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
gradient_left, hessian_left, n_samples_left = 0., 0., 0

gradient_left, hessian_left = 0., 0.
for bin_idx in range(context.n_bins):
gradient_left += histogram[bin_idx]['sum_gradients']
n_samples_left += histogram[bin_idx]['count']
if context.constant_hessian:
hessian_left += (histogram[bin_idx]['count'] *
context.constant_hessian_value)
hessian_left += (histogram[bin_idx]['count']
* context.constant_hessian_value)
else:
hessian_left += histogram[bin_idx]['sum_hessians']
if hessian_left < context.min_hessian_to_split:
Expand All @@ -400,8 +409,10 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram,
best_split.bin_idx = bin_idx
best_split.gradient_left = gradient_left
best_split.hessian_left = hessian_left
best_split.n_samples_left = n_samples_left
best_split.gradient_right = gradient_right
best_split.hessian_right = hessian_right
best_split.n_samples_right = n_samples - n_samples_left

best_split.histogram = histogram
return best_split
9 changes: 5 additions & 4 deletions tests/test_compare_lightgbm.py
@@ -32,7 +32,7 @@ def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):

rng = np.random.RandomState(seed=seed)
n_samples = n_samples
min_sample_leaf = 1
min_samples_leaf = 1 # XXX: changing this breaks the test
max_iter = 1

# data = linear target, 5 features, 3 irrelevant.
@@ -44,13 +44,14 @@
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
min_data=1, min_data_in_bin=1,
min_data_in_bin=1,
learning_rate=1,
min_child_samples=min_sample_leaf,
min_data_in_leaf=min_samples_leaf,
num_leaves=max_leaf_nodes)
est_pygbm = GradientBoostingMachine(max_iter=max_iter,
learning_rate=1,
validation_split=None, scoring=None,
min_samples_leaf=min_sample_leaf,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm.fit(X_train, y_train)
est_pygbm.fit(X_train, y_train)
40 changes: 40 additions & 0 deletions tests/test_grower.py
@@ -5,6 +5,7 @@
from pytest import approx

from pygbm.grower import TreeGrower
from pygbm.binning import BinMapper


def _make_training_data(n_bins=256, constant_hessian=True):
@@ -180,3 +181,42 @@ def predict(features):
# Check that training set can be recovered exactly:
predictions = predictor.predict_binned(features_data)
assert_allclose(predictions, all_gradients)


@pytest.mark.parametrize(
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
[
(11, 10, 7, True, 0),
(13, 10, 42, False, 0),
(56, 10, 255, True, 0.1),
(101, 3, 7, True, 0),
(200, 42, 42, False, 0),
(300, 55, 255, True, 0.1),
]
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
constant_hessian, noise):
rng = np.random.RandomState(seed=0)
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
if noise:
y_scale = y.std()
y += rng.normal(scale=noise, size=n_samples) * y_scale
mapper = BinMapper(max_bins=n_bins)
X = mapper.fit_transform(X)

all_gradients = y.astype(np.float32)
if constant_hessian:
all_hessians = np.ones(shape=1, dtype=np.float32)
else:
all_hessians = np.ones_like(all_gradients)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
for node in predictor.nodes:
if node['is_leaf']:
assert node['count'] >= min_samples_leaf
7 changes: 4 additions & 3 deletions tests/test_predictor.py
@@ -20,11 +20,12 @@ def test_boston_dataset():
gradients = y_train.astype(np.float32)
hessians = np.ones(1, dtype=np.float32)

grower = TreeGrower(X_train_binned, gradients, hessians, max_leaf_nodes=31)
grower = TreeGrower(X_train_binned, gradients, hessians,
min_samples_leaf=5, max_leaf_nodes=31)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.9
assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

assert_allclose(predictor.predict(X_train),
@@ -33,5 +34,5 @@
assert_allclose(predictor.predict(X_test),
predictor.predict_binned(X_test_binned))

assert r2_score(y_train, predictor.predict(X_train)) > 0.9
assert r2_score(y_train, predictor.predict(X_train)) > 0.75
assert r2_score(y_test, predictor.predict(X_test)) > 0.65
9 changes: 9 additions & 0 deletions tests/test_splitting.py
@@ -43,6 +43,10 @@ def test_histogram_split(n_bins):
assert split_info.bin_idx == true_bin
assert split_info.gain >= 0
assert split_info.feature_idx == feature_idx
assert (split_info.n_samples_left + split_info.n_samples_right
== sample_indices.shape[0])
# Constant hessian: 1. per sample.
assert split_info.n_samples_left == split_info.hessian_left


@pytest.mark.parametrize('constant_hessian', [True, False])
@@ -256,3 +260,8 @@ def test_split_indices():
context.partition[:position_right])
assert_array_almost_equal(samples_right,
context.partition[position_right:])

# Check that the resulting split indices sizes are consistent with the
# count statistics anticipated when looking for the best split.
assert samples_left.shape[0] == si_root.n_samples_left
assert samples_right.shape[0] == si_root.n_samples_right