
Fix handling of the min_samples_leaf hyperparameter #35

Merged: 11 commits, Nov 3, 2018
29 changes: 17 additions & 12 deletions pygbm/grower.py
@@ -110,6 +110,12 @@ def _intilialize_root(self):
if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1):
self._finalize_leaf(self.root)
return
if (self.min_samples_leaf is not None
and self.root.n_samples < self.min_samples_leaf):
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
Review comment (Owner, Author):

It would be great to add a test for this case in test_grower.py.
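
A minimal sketch of what that test could look like, reusing the BinMapper + TreeGrower pattern from the test_min_samples_leaf test added below; the finalized_leaves attribute is an assumption about TreeGrower's internals, not something this diff confirms:

    # Hedged sketch: assumes TreeGrower collects the leaves finalized by
    # _finalize_leaf in a `finalized_leaves` list (hypothetical name).
    def test_min_samples_leaf_root():
        rng = np.random.RandomState(seed=0)
        min_samples_leaf = 10
        # Fewer samples than min_samples_leaf: the root cannot be split.
        X = rng.normal(size=(min_samples_leaf - 1, 3))
        y = X[:, 0] - X[:, 1]
        X = BinMapper(max_bins=255).fit_transform(X)
        all_gradients = y.astype(np.float32)
        all_hessians = np.ones(shape=1, dtype=np.float32)
        grower = TreeGrower(X, all_gradients, all_hessians, n_bins=255,
                            min_samples_leaf=min_samples_leaf)
        grower.grow()
        # The root should be finalized without computing any split stats.
        assert len(grower.finalized_leaves) == 1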


self._compute_spittability(self.root)

def _compute_spittability(self, node, only_hist=False):
@@ -121,7 +127,6 @@ def _compute_spittability(self, node, only_hist=False):
the node. If _compute_spittability is called again by the grower on
this same node, the histograms won't be computed again.
"""

# Compute split_info and histograms if not already done
if node.split_info is None and node.histograms is None:
# If the sibling has less samples, compute its hist first (with
@@ -151,11 +156,19 @@ def _compute_spittability(self, node, only_hist=False):

if only_hist:
# _compute_spittability was called by a sibling. We only needed to
# compute the histogram
# compute the histogram.
return

if node.split_info.gain < self.min_gain_to_split:
self._finalize_leaf(node)

elif (self.min_samples_leaf is not None and (
node.split_info.n_samples_left < self.min_samples_leaf
or node.split_info.n_samples_right < self.min_samples_leaf)):
# Reject the split as it would result in at least one leaf that
# would be too small.
self._finalize_leaf(node)

else:
heappush(self.splittable_nodes, node)

@@ -208,16 +221,8 @@ def split_next(self):
self._finalize_splittable_nodes()

else:
if (self.min_samples_leaf is not None
and len(sample_indices_left) < self.min_samples_leaf):
self._finalize_leaf(left_child_node)
else:
self._compute_spittability(left_child_node)
if (self.min_samples_leaf is not None
and len(sample_indices_right) < self.min_samples_leaf):
self._finalize_leaf(right_child_node)
else:
self._compute_spittability(right_child_node)
self._compute_spittability(left_child_node)
self._compute_spittability(right_child_node)
return left_child_node, right_child_node

def can_split_further(self):
47 changes: 29 additions & 18 deletions pygbm/splitting.py
@@ -18,19 +18,24 @@
('hessian_left', float32),
('gradient_right', float32),
('hessian_right', float32),
('n_samples_left', uint32),
('n_samples_right', uint32),
('histogram', typeof(HISTOGRAM_DTYPE)[:]), # array of size n_bins
])
class SplitInfo:
def __init__(self, gain=0, feature_idx=0, bin_idx=0,
gradient_left=0., hessian_left=0.,
gradient_right=0., hessian_right=0.):
gradient_right=0., hessian_right=0.,
n_samples_left=0, n_samples_right=0):
self.gain = gain
self.feature_idx = feature_idx
self.bin_idx = bin_idx
self.gradient_left = gradient_left
self.hessian_left = hessian_left
self.gradient_right = gradient_right
self.hessian_right = hessian_right
self.n_samples_left = n_samples_left
self.n_samples_right = n_samples_right


@jitclass([
@@ -236,17 +241,16 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms,
# be to compute an average but it's probably not worth it.
sum_gradients = (parent_histograms[0]['sum_gradients'].sum() -
sibling_histograms[0]['sum_gradients'].sum())

n_samples = sample_indices.shape[0]
if context.constant_hessian:
n_samples = sample_indices.shape[0]
sum_hessians = context.constant_hessian_value * float32(n_samples)
else:
sum_hessians = (parent_histograms[0]['sum_hessians'].sum() -
sibling_histograms[0]['sum_hessians'].sum())

return _parallel_find_split_subtraction(
context, parent_histograms, sibling_histograms,
sum_gradients, sum_hessians)
sum_gradients, sum_hessians, n_samples)


@njit
@@ -280,7 +284,7 @@ def _parallel_find_split(splitter, sample_indices, ordered_gradients,
"""
# Pre-allocate the results datastructure to be able to use prange:
# numba jitclass do not seem to properly support default values for kwargs.
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0.)
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(splitter.n_features)]
for feature_idx in prange(splitter.n_features):
split_info = _find_histogram_split(
@@ -295,7 +299,7 @@
@njit(parallel=True)
def _parallel_find_split_subtraction(context, parent_histograms,
sibling_histograms,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""For each feature, find the best bin to split by histogram substraction

This in turn calls _find_histogram_split_subtraction that does not need
@@ -307,12 +311,12 @@ def _parallel_find_split_subtraction(context, parent_histograms,
histograms by subtraction.
"""
# Pre-allocate the results datastructure to be able to use prange
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0.)
split_infos = [SplitInfo(0, 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(context.n_features)]
Review comment (Owner, Author):

This data structure could probably be also stored as an attribute on the context to avoid reallocating it over and over again.
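
A sketch of that idea in plain Python (names are hypothetical, and whether numba's jitclass can hold a list of SplitInfo instances as a spec-declared attribute would need to be checked):

    import numpy as np

    class ToySplittingContext:
        """Toy stand-in for SplittingContext: the per-feature results
        buffer is allocated once here instead of once per call."""
        def __init__(self, n_features):
            self.n_features = n_features
            # Reused across find-split calls; entries are overwritten.
            self.split_gains = np.zeros(n_features, dtype=np.float32)

    def find_best_feature(context, rng):
        # The hot loop writes into the preallocated buffer: no allocation.
        for feature_idx in range(context.n_features):
            context.split_gains[feature_idx] = rng.uniform()
        return int(np.argmax(context.split_gains))

    context = ToySplittingContext(n_features=8)
    best = find_best_feature(context, np.random.RandomState(0))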

for feature_idx in prange(context.n_features):
split_info = _find_histogram_split_subtraction(
context, feature_idx, parent_histograms, sibling_histograms,
sum_gradients, sum_hessians)
sum_gradients, sum_hessians, n_samples)
split_infos[feature_idx] = split_info

return _find_best_feature_to_split_helper(
@@ -324,8 +328,9 @@ def _find_histogram_split(context, feature_idx, sample_indices,
ordered_gradients, ordered_hessians,
sum_gradients, sum_hessians):
"""Compute the histogram for a given feature and return the best bin."""
n_samples = sample_indices.shape[0]
binned_feature = context.binned_features.T[feature_idx]
root_node = binned_feature.shape[0] == sample_indices.shape[0]
root_node = binned_feature.shape[0] == n_samples

if root_node:
if context.constant_hessian:
@@ -346,13 +351,14 @@
ordered_gradients, ordered_hessians)

return _find_best_bin_to_split_helper(
context, feature_idx, histogram, sum_gradients, sum_hessians)
context, feature_idx, histogram, sum_gradients, sum_hessians,
sample_indices.shape[0])


@njit(fastmath=True)
def _find_histogram_split_subtraction(context, feature_idx,
parent_histograms, sibling_histograms,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""Compute the histogram by substraction of parent and sibling

Uses the identity: hist(parent) = hist(left) + hist(right)
@@ -362,26 +368,29 @@ def _find_histogram_split_subtraction(context, feature_idx,
sibling_histograms[feature_idx])

return _find_best_bin_to_split_helper(
context, feature_idx, histogram, sum_gradients, sum_hessians)
context, feature_idx, histogram, sum_gradients, sum_hessians,
n_samples)
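
For intuition, the identity in the docstring above means the second child's histogram is never rebuilt by scanning samples; a standalone sketch with toy numpy arrays (pygbm actually stores structured HISTOGRAM_DTYPE arrays with 'sum_gradients', 'sum_hessians' and 'count' fields):

    import numpy as np

    # Toy per-bin gradient sums for a parent node and its left child.
    hist_parent = np.array([3.0, 1.0, 4.0, 1.5], dtype=np.float32)
    hist_left = np.array([1.0, 0.5, 2.0, 0.0], dtype=np.float32)

    # hist(parent) = hist(left) + hist(right): the right child's histogram
    # is a per-bin subtraction, O(n_bins) rather than O(n_samples).
    hist_right = hist_parent - hist_left
    assert np.allclose(hist_left + hist_right, hist_parent)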


@njit(locals={'gradient_left': float32, 'hessian_left': float32},
@njit(locals={'gradient_left': float32, 'hessian_left': float32,
'n_samples_left': uint32},
fastmath=True)
def _find_best_bin_to_split_helper(context, feature_idx, histogram,
sum_gradients, sum_hessians):
sum_gradients, sum_hessians, n_samples):
"""Find best bin to split on and return the corresponding SplitInfo"""
# Allocate the structure for the best split information. It can be
# returned as such (with a negative gain) if the min_hessian_to_split
# condition is not satisfied. Such invalid splits are later discarded by
# the TreeGrower.
best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0.)
best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
gradient_left, hessian_left, n_samples_left = 0., 0., 0

gradient_left, hessian_left = 0., 0.
for bin_idx in range(context.n_bins):
gradient_left += histogram[bin_idx]['sum_gradients']
n_samples_left += histogram[bin_idx]['count']
if context.constant_hessian:
hessian_left += (histogram[bin_idx]['count'] *
context.constant_hessian_value)
hessian_left += (histogram[bin_idx]['count']
* context.constant_hessian_value)
else:
hessian_left += histogram[bin_idx]['sum_hessians']
if hessian_left < context.min_hessian_to_split:
Expand All @@ -400,8 +409,10 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram,
best_split.bin_idx = bin_idx
best_split.gradient_left = gradient_left
best_split.hessian_left = hessian_left
best_split.n_samples_left = n_samples_left
best_split.gradient_right = gradient_right
best_split.hessian_right = hessian_right
best_split.n_samples_right = n_samples - n_samples_left

best_split.histogram = histogram
return best_split
9 changes: 5 additions & 4 deletions tests/test_compare_lightgbm.py
@@ -32,7 +32,7 @@ def test_same_predictions_easy_target(seed, n_samples, max_leaf_nodes):

rng = np.random.RandomState(seed=seed)
n_samples = n_samples
min_sample_leaf = 1
min_samples_leaf = 1 # XXX: changing this breaks the test
max_iter = 1

# data = linear target, 5 features, 3 irrelevant.
@@ -44,13 +44,14 @@
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
min_data=1, min_data_in_bin=1,
min_data_in_bin=1,
learning_rate=1,
min_child_samples=min_sample_leaf,
min_data_in_leaf=min_samples_leaf,
num_leaves=max_leaf_nodes)
est_pygbm = GradientBoostingMachine(max_iter=max_iter,
learning_rate=1,
validation_split=None, scoring=None,
min_samples_leaf=min_sample_leaf,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm.fit(X_train, y_train)
est_pygbm.fit(X_train, y_train)
40 changes: 40 additions & 0 deletions tests/test_grower.py
@@ -5,6 +5,7 @@
from pytest import approx

from pygbm.grower import TreeGrower
from pygbm.binning import BinMapper


def _make_training_data(n_bins=256, constant_hessian=True):
@@ -180,3 +181,42 @@ def predict(features):
# Check that training set can be recovered exactly:
predictions = predictor.predict_binned(features_data)
assert_allclose(predictions, all_gradients)


@pytest.mark.parametrize(
'n_samples, min_samples_leaf, n_bins, constant_hessian, noise',
[
(11, 10, 7, True, 0),
(13, 10, 42, False, 0),
(56, 10, 255, True, 0.1),
(101, 3, 7, True, 0),
(200, 42, 42, False, 0),
(300, 55, 255, True, 0.1),
]
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins,
constant_hessian, noise):
rng = np.random.RandomState(seed=0)
# data = linear target, 3 features, 1 irrelevant.
X = rng.normal(size=(n_samples, 3))
y = X[:, 0] - X[:, 1]
if noise:
y_scale = y.std()
y += rng.normal(scale=noise, size=n_samples) * y_scale
mapper = BinMapper(max_bins=n_bins)
X = mapper.fit_transform(X)

all_gradients = y.astype(np.float32)
if constant_hessian:
all_hessians = np.ones(shape=1, dtype=np.float32)
else:
all_hessians = np.ones_like(all_gradients)
grower = TreeGrower(X, all_gradients, all_hessians,
n_bins=n_bins, shrinkage=1.,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=n_samples)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)
for node in predictor.nodes:
if node['is_leaf']:
assert node['count'] >= min_samples_leaf
7 changes: 4 additions & 3 deletions tests/test_predictor.py
@@ -20,11 +20,12 @@ def test_boston_dataset():
gradients = y_train.astype(np.float32)
hessians = np.ones(1, dtype=np.float32)

grower = TreeGrower(X_train_binned, gradients, hessians, max_leaf_nodes=31)
grower = TreeGrower(X_train_binned, gradients, hessians,
min_samples_leaf=5, max_leaf_nodes=31)
grower.grow()
predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_)

assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.9
assert r2_score(y_train, predictor.predict_binned(X_train_binned)) > 0.75
assert r2_score(y_test, predictor.predict_binned(X_test_binned)) > 0.65

assert_allclose(predictor.predict(X_train),
@@ -33,5 +34,5 @@
assert_allclose(predictor.predict(X_test),
predictor.predict_binned(X_test_binned))

assert r2_score(y_train, predictor.predict(X_train)) > 0.9
assert r2_score(y_train, predictor.predict(X_train)) > 0.75
assert r2_score(y_test, predictor.predict(X_test)) > 0.65
9 changes: 9 additions & 0 deletions tests/test_splitting.py
@@ -43,6 +43,10 @@ def test_histogram_split(n_bins):
assert split_info.bin_idx == true_bin
assert split_info.gain >= 0
assert split_info.feature_idx == feature_idx
assert (split_info.n_samples_left + split_info.n_samples_right
== sample_indices.shape[0])
# Constant hessian: 1. per sample.
assert split_info.n_samples_left == split_info.hessian_left


@pytest.mark.parametrize('constant_hessian', [True, False])
@@ -256,3 +260,8 @@ def test_split_indices():
context.partition[:position_right])
assert_array_almost_equal(samples_right,
context.partition[position_right:])

# Check that the resulting split indices sizes are consistent with the
# count statistics anticipated when looking for the best split.
assert samples_left.shape[0] == si_root.n_samples_left
assert samples_right.shape[0] == si_root.n_samples_right