[python-package] Introduce refit_tree_manual to Booster class. #6617

Open · wants to merge 13 commits into master
5 changes: 5 additions & 0 deletions include/LightGBM/boosting.h
@@ -76,6 +76,11 @@ class LIGHTGBM_EXPORT Boosting {
*/
virtual void RefitTree(const int* tree_leaf_prediction, const size_t nrow, const size_t ncol) = 0;

/*!
* \brief Change the leaf values of a tree and update the scores
*/
virtual void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) = 0;

/*!
* \brief Training logic
* \param gradients nullptr for using default objective, otherwise use self-defined boosting
15 changes: 15 additions & 0 deletions include/LightGBM/c_api.h
@@ -778,6 +778,21 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterRefit(BoosterHandle handle,
int32_t nrow,
int32_t ncol);

/*!
* \brief Refit a single tree by specifying a new output value for each of its leaves
* \note
* The length of the array referenced by ``vals`` must be equal to the number of leaves.
* \param handle Handle of the Booster model
* \param tree_idx Index of the tree to refit
* \param vals The new output value for each leaf of the tree
* \param vals_size Number of leaf values provided; must equal the number of leaves in the tree
* \return 0 when successful, -1 when failure occurs
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterRefitTreeManual(BoosterHandle handle,
int32_t tree_idx,
const double *vals,
const int vals_size);

/*!
* \brief Update the model by specifying gradient and Hessian directly
* (this can be used to support customized loss functions).
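For illustration, the new entry point can be exercised directly through ctypes, mirroring what the python-package wrapper further down does internally; a rough, hypothetical sketch (assuming a build that exports this symbol):

import ctypes
import numpy as np
import lightgbm as lgb
from lightgbm.basic import _LIB, _safe_call

bst = lgb.train(
    {"objective": "regression", "num_leaves": 4, "verbose": -1},
    lgb.Dataset(np.random.rand(200, 3), np.random.rand(200)),
    num_boost_round=1,
)

# one value per leaf of tree 0; the C side checks this count against num_leaves
n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]
new_vals = np.zeros(n_leaves, dtype=np.float64)

_safe_call(_LIB.LGBM_BoosterRefitTreeManual(
    bst._handle,                                                # Booster handle
    ctypes.c_int(0),                                            # index of the tree to refit
    new_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),   # new leaf values
    ctypes.c_int(n_leaves),                                      # number of values provided
))  # the raw call returns 0 on success and -1 on failure; _safe_call raises on -1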
31 changes: 31 additions & 0 deletions python-package/lightgbm/basic.py
@@ -4920,6 +4920,37 @@ def refit(
new_booster._network = self._network
return new_booster

def refit_tree_manual(self, tree_id: int, values: np.ndarray) -> "Booster":
Collaborator:
As a user it would not yet be clear to me how this function is different from set_leaf_output, i.e. why is this not just called set_leaf_outputs?

Contributor Author (@neNasko1, Sep 2, 2024):
Do you propose changing the name of the function, or just writing better docs? I am a bit unsure whether calling this function set_leaf_outputs would be strange, since it does more than just update the leaf values.

"""Set all the outputs of a tree and recalculate the dataset scores.

.. versionadded:: 4.6.0

Parameters
----------
tree_id : int
The index of the tree.
values : numpy 1-D array
Values to set as the outputs of the tree's leaves.
The number of elements should be equal to the number of leaves in the tree.

Returns
-------
self : Booster
Booster with the leaf outputs set.
"""
values = _list_to_1d_numpy(values, dtype=np.float64, name="leaf_values")

_safe_call(
_LIB.LGBM_BoosterRefitTreeManual(
self._handle,
ctypes.c_int(tree_id),
values.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
ctypes.c_int(len(values)),
)
)
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
return self

def get_leaf_output(self, tree_id: int, leaf_id: int) -> float:
"""Get the output of a leaf.

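As noted in the review thread, this method differs from set_leaf_output in that it replaces every leaf of one tree at once and also recalculates the cached dataset scores. A minimal usage sketch (assuming a build that includes this PR; the synthetic data and the 0.5 scaling factor are purely illustrative):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=42)
bst = lgb.train(
    {"objective": "regression", "num_leaves": 7, "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=10,
)

# number of leaves of the first tree, read from the dumped model
n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]

# halve the first tree's outputs and update the cached dataset scores
current = np.array([bst.get_leaf_output(0, i) for i in range(n_leaves)])
bst.refit_tree_manual(0, current * 0.5)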
19 changes: 19 additions & 0 deletions src/boosting/gbdt.cpp
@@ -295,6 +295,25 @@ void GBDT::RefitTree(const int* tree_leaf_prediction, const size_t nrow, const s
}
}

void GBDT::RefitTreeManual(int tree_idx, const double *vals, const int vals_size) {
CHECK_GE(tree_idx, 0);
CHECK_LT(static_cast<size_t>(tree_idx), models_.size());
CHECK_EQ(vals_size, models_[tree_idx]->num_leaves());
// temporarily store the delta (new value - old value) in each leaf
for (int leaf_id = 0; leaf_id < models_[tree_idx]->num_leaves(); ++leaf_id) {
models_[tree_idx]->SetLeafOutput(leaf_id, vals[leaf_id] - models_[tree_idx]->LeafOutput(leaf_id));
}
// add the delta to the cached train and validation scores
train_score_updater_->AddScore(models_[tree_idx].get(), tree_idx % num_tree_per_iteration_);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(models_[tree_idx].get(), tree_idx % num_tree_per_iteration_);
}
// store the final leaf values in the model
for (int leaf_id = 0; leaf_id < models_[tree_idx]->num_leaves(); ++leaf_id) {
models_[tree_idx]->SetLeafOutput(leaf_id, vals[leaf_id]);
}
}

/* If the custom "average" is implemented it will be used in place of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
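The implementation stores the new leaf values in the model and shifts the cached train/validation scores by the per-leaf delta rather than recomputing them from scratch. For a raw-score objective such as plain regression, each row's prediction therefore moves by exactly the change in output of the leaf it falls into; a rough self-check of that property (illustrative sketch, assuming a build with this PR):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=0)
bst = lgb.train(
    {"objective": "regression", "num_leaves": 5, "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=3,
)

n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]
old_vals = np.array([bst.get_leaf_output(0, i) for i in range(n_leaves)])
new_vals = old_vals * 1.1  # arbitrary per-leaf change for tree 0

leaf_of_row = bst.predict(X, pred_leaf=True)[:, 0].astype(int)  # leaf each row reaches in tree 0
before = bst.predict(X)
bst.refit_tree_manual(0, new_vals)
after = bst.predict(X)

# each prediction moves by the delta of the leaf that row lands in
np.testing.assert_allclose(after - before, (new_vals - old_vals)[leaf_of_row], atol=1e-10)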
2 changes: 2 additions & 0 deletions src/boosting/gbdt.h
@@ -145,6 +145,8 @@ class GBDT : public GBDTBase {

void RefitTree(const int* tree_leaf_prediction, const size_t nrow, const size_t ncol) override;

void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) override;

/*!
* \brief Training logic
* \param gradients nullptr for using default objective, otherwise use self-defined boosting
16 changes: 16 additions & 0 deletions src/c_api.cpp
@@ -412,6 +412,11 @@ class Booster {
boosting_->RefitTree(leaf_preds, nrow, ncol);
}

void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) {
UNIQUE_LOCK(mutex_)
boosting_->RefitTreeManual(tree_idx, vals, vals_size);
}

bool TrainOneIter(const score_t* gradients, const score_t* hessians) {
UNIQUE_LOCK(mutex_)
return boosting_->TrainOneIter(gradients, hessians);
@@ -2058,6 +2063,17 @@ int LGBM_BoosterRefit(BoosterHandle handle, const int32_t* leaf_preds, int32_t n
API_END();
}

int LGBM_BoosterRefitTreeManual(BoosterHandle handle,
int tree_idx,
const double *vals,
const int vals_size) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->RefitTreeManual(tree_idx, vals, vals_size);
API_END();
}


int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
43 changes: 43 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -2337,6 +2337,49 @@ def test_refit_dataset_params(rng):
np.testing.assert_allclose(stored_weights, refit_weight)


def test_refit_tree_manual():
def retrieve_leaves_from_tree(tree):
if "leaf_index" in tree:
return {tree["leaf_index"]: tree["leaf_value"]}

left_child = retrieve_leaves_from_tree(tree["left_child"])
right_child = retrieve_leaves_from_tree(tree["right_child"])

return {**left_child, **right_child}

def retrieve_leaves_from_booster(booster, iteration):
tree = booster.dump_model(0, iteration)["tree_info"][0]["tree_structure"]
return retrieve_leaves_from_tree(tree)

def debias_callback(env):
booster = env.model
curr_values = retrieve_leaves_from_booster(booster, env.iteration)
eval_pred = booster.predict(df)
delta = np.log(np.mean(y) / np.mean(eval_pred))
refitted_values = [curr_values[ix] + delta for ix in range(len(curr_values))]
booster.refit_tree_manual(env.iteration, refitted_values)

X, y = make_synthetic_regression()
y = np.abs(y)
df = pd_DataFrame(X, columns=["x1", "x2", "x3", "x4"])
ds = lgb.Dataset(df, y)

params = {
"verbose": -1,
"n_estimators": 5,
"num_leaves": 5,
"objective": "gamma",
}

# Check that the model is biased when no callback is provided
bst = lgb.train(params, ds)
np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, bst.predict(df).mean(), y.mean())

# Check if debiasing worked
bst = lgb.train(params, ds, callbacks=[debias_callback])
np.testing.assert_allclose(bst.predict(df).mean(), y.mean())
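A note on why the callback removes the bias: with the gamma objective the prediction is the exponential of the raw score, so adding delta = log(mean(y) / mean(pred)) to every leaf of the latest tree multiplies every prediction by mean(y) / mean(pred), making the new mean prediction equal mean(y). A tiny standalone check of that arithmetic (numbers are hypothetical):

import numpy as np

pred = np.array([1.2, 0.7, 3.4])  # hypothetical gamma-model predictions (= exp(raw score))
y = np.array([1.0, 1.1, 2.9])     # hypothetical targets
delta = np.log(y.mean() / pred.mean())

# shifting every raw score by delta scales every prediction by exp(delta)
np.testing.assert_allclose((pred * np.exp(delta)).mean(), y.mean())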


@pytest.mark.parametrize("boosting_type", ["rf", "dart"])
def test_mape_for_specific_boosting_types(boosting_type):
X, y = make_synthetic_regression()