[python-package] Introduce refit_tree_manual to Booster class. #6617

Open · wants to merge 13 commits into master
5 changes: 5 additions & 0 deletions include/LightGBM/boosting.h
@@ -76,6 +76,11 @@ class LIGHTGBM_EXPORT Boosting {
*/
virtual void RefitTree(const int* tree_leaf_prediction, const size_t nrow, const size_t ncol) = 0;

/*!
* \brief Change the leaf values of a tree and update the scores
*/
virtual void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) = 0;

/*!
* \brief Training logic
* \param gradients nullptr for using default objective, otherwise use self-defined boosting
15 changes: 15 additions & 0 deletions include/LightGBM/c_api.h
@@ -778,6 +778,21 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterRefit(BoosterHandle handle,
int32_t nrow,
int32_t ncol);

/*!
* \brief Refit a single tree by specifying a new output value for each of its leaves
* \note
* The length of the array referenced by ``vals`` must be equal to the number of leaves.
* \param handle Handle of the Booster model
* \param tree_idx Index of the tree to refit
* \param vals The new output value for each leaf of the tree
* \param vals_size Number of leaf values provided; must equal the number of leaves in the tree
* \return 0 when successful, -1 when failure occurs
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterRefitTreeManual(BoosterHandle handle,
int32_t tree_idx,
const double *vals,
const int vals_size);

/*!
* \brief Update the model by specifying gradient and Hessian directly
* (this can be used to support customized loss functions).
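For illustration, the new entry point can be exercised directly through ctypes, mirroring what the python-package wrapper further down does internally; a rough, hypothetical sketch (assuming a build that exports this symbol):

import ctypes
import numpy as np
import lightgbm as lgb
from lightgbm.basic import _LIB, _safe_call

bst = lgb.train(
    {"objective": "regression", "num_leaves": 4, "verbose": -1},
    lgb.Dataset(np.random.rand(200, 3), np.random.rand(200)),
    num_boost_round=1,
)

# one value per leaf of tree 0; the C side checks this count against num_leaves
n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]
new_vals = np.zeros(n_leaves, dtype=np.float64)

_safe_call(_LIB.LGBM_BoosterRefitTreeManual(
    bst._handle,                                                # Booster handle
    ctypes.c_int(0),                                            # index of the tree to refit
    new_vals.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),   # new leaf values
    ctypes.c_int(n_leaves),                                      # number of values provided
))  # the raw call returns 0 on success and -1 on failure; _safe_call raises on -1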
31 changes: 31 additions & 0 deletions python-package/lightgbm/basic.py
@@ -4920,6 +4920,37 @@ def refit(
new_booster._network = self._network
return new_booster

def refit_tree_manual(self, tree_id: int, values: np.ndarray) -> "Booster":
Collaborator:
As a user it would not yet be clear to me how this function is different from set_leaf_output, i.e. why is this not just called set_leaf_outputs?

Contributor Author (@neNasko1, Sep 2, 2024):
Do you propose changing the name of the function, or just writing better docs? I am a bit unsure whether calling this function set_leaf_outputs would be strange, since it does more than just update the leaf values.

"""Set all the outputs of a tree and recalculate the dataset scores.

.. versionadded:: 4.6.0

Parameters
----------
tree_id : int
The index of the tree.
values : numpy 1-D array
Values to set as the outputs of the tree's leaves.
The number of elements should be equal to the number of leaves in the tree.

Returns
-------
self : Booster
Booster with the leaf outputs set.
"""
values = _list_to_1d_numpy(values, dtype=np.float64, name="leaf_values")

_safe_call(
_LIB.LGBM_BoosterRefitTreeManual(
self._handle,
ctypes.c_int(tree_id),
values.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
ctypes.c_int(len(values)),
)
)
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
return self

def get_leaf_output(self, tree_id: int, leaf_id: int) -> float:
"""Get the output of a leaf.

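As noted in the review thread, this method differs from set_leaf_output in that it replaces every leaf of one tree at once and also recalculates the cached dataset scores. A minimal usage sketch (assuming a build that includes this PR; the synthetic data and the 0.5 scaling factor are purely illustrative):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=42)
bst = lgb.train(
    {"objective": "regression", "num_leaves": 7, "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=10,
)

# number of leaves of the first tree, read from the dumped model
n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]

# halve the first tree's outputs and update the cached dataset scores
current = np.array([bst.get_leaf_output(0, i) for i in range(n_leaves)])
bst.refit_tree_manual(0, current * 0.5)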
19 changes: 19 additions & 0 deletions src/boosting/gbdt.cpp
@@ -295,6 +295,25 @@ void GBDT::RefitTree(const int* tree_leaf_prediction, const size_t nrow, const s
}
}

void GBDT::RefitTreeManual(int tree_idx, const double *vals, const int vals_size) {
CHECK_GE(tree_idx, 0);
CHECK_LT(static_cast<size_t>(tree_idx), models_.size());
CHECK_EQ(vals_size, models_[tree_idx]->num_leaves());
// temporarily store the delta (new value - old value) in each leaf
for (int leaf_id = 0; leaf_id < models_[tree_idx]->num_leaves(); ++leaf_id) {
models_[tree_idx]->SetLeafOutput(leaf_id, vals[leaf_id] - models_[tree_idx]->LeafOutput(leaf_id));
}
// add the delta to the cached train and validation scores
train_score_updater_->AddScore(models_[tree_idx].get(), tree_idx % num_tree_per_iteration_);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(models_[tree_idx].get(), tree_idx % num_tree_per_iteration_);
}
// store the final leaf values in the model
for (int leaf_id = 0; leaf_id < models_[tree_idx]->num_leaves(); ++leaf_id) {
models_[tree_idx]->SetLeafOutput(leaf_id, vals[leaf_id]);
}
}

/* If the custom "average" is implemented it will be used in place of the label average (if enabled)
*
* An improvement to this is to have options to explicitly choose
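The implementation stores the new leaf values in the model and shifts the cached train/validation scores by the per-leaf delta rather than recomputing them from scratch. For a raw-score objective such as plain regression, each row's prediction therefore moves by exactly the change in output of the leaf it falls into; a rough self-check of that property (illustrative sketch, assuming a build with this PR):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=0)
bst = lgb.train(
    {"objective": "regression", "num_leaves": 5, "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=3,
)

n_leaves = bst.dump_model()["tree_info"][0]["num_leaves"]
old_vals = np.array([bst.get_leaf_output(0, i) for i in range(n_leaves)])
new_vals = old_vals * 1.1  # arbitrary per-leaf change for tree 0

leaf_of_row = bst.predict(X, pred_leaf=True)[:, 0].astype(int)  # leaf each row reaches in tree 0
before = bst.predict(X)
bst.refit_tree_manual(0, new_vals)
after = bst.predict(X)

# each prediction moves by the delta of the leaf that row lands in
np.testing.assert_allclose(after - before, (new_vals - old_vals)[leaf_of_row], atol=1e-10)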
2 changes: 2 additions & 0 deletions src/boosting/gbdt.h
@@ -145,6 +145,8 @@ class GBDT : public GBDTBase {

void RefitTree(const int* tree_leaf_prediction, const size_t nrow, const size_t ncol) override;

void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) override;

/*!
* \brief Training logic
* \param gradients nullptr for using default objective, otherwise use self-defined boosting
16 changes: 16 additions & 0 deletions src/c_api.cpp
@@ -412,6 +412,11 @@ class Booster {
boosting_->RefitTree(leaf_preds, nrow, ncol);
}

void RefitTreeManual(int tree_idx, const double *vals, const int vals_size) {
UNIQUE_LOCK(mutex_)
boosting_->RefitTreeManual(tree_idx, vals, vals_size);
}

bool TrainOneIter(const score_t* gradients, const score_t* hessians) {
UNIQUE_LOCK(mutex_)
return boosting_->TrainOneIter(gradients, hessians);
@@ -2058,6 +2063,17 @@ int LGBM_BoosterRefit(BoosterHandle handle, const int32_t* leaf_preds, int32_t n
API_END();
}

int LGBM_BoosterRefitTreeManual(BoosterHandle handle,
int tree_idx,
const double *vals,
const int vals_size) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->RefitTreeManual(tree_idx, vals, vals_size);
API_END();
}


int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
43 changes: 43 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -2337,6 +2337,49 @@ def test_refit_dataset_params(rng):
np.testing.assert_allclose(stored_weights, refit_weight)


def test_refit_tree_manual():
def retrieve_leaves_from_tree(tree):
if "leaf_index" in tree:
return {tree["leaf_index"]: tree["leaf_value"]}

left_child = retrieve_leaves_from_tree(tree["left_child"])
right_child = retrieve_leaves_from_tree(tree["right_child"])

return {**left_child, **right_child}

def retrieve_leaves_from_booster(booster, iteration):
tree = booster.dump_model(0, iteration)["tree_info"][0]["tree_structure"]
return retrieve_leaves_from_tree(tree)

def debias_callback(env):
booster = env.model
curr_values = retrieve_leaves_from_booster(booster, env.iteration)
eval_pred = booster.predict(df)
delta = np.log(np.mean(y) / np.mean(eval_pred))
refitted_values = [curr_values[ix] + delta for ix in range(len(curr_values))]
booster.refit_tree_manual(env.iteration, refitted_values)

X, y = make_synthetic_regression()
y = np.abs(y)
df = pd_DataFrame(X, columns=["x1", "x2", "x3", "x4"])
ds = lgb.Dataset(df, y)

params = {
"verbose": -1,
"n_estimators": 5,
"num_leaves": 5,
"objective": "gamma",
}

# Check that the model is biased when no callback is provided
bst = lgb.train(params, ds)
np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, bst.predict(df).mean(), y.mean())

# Check if debiasing worked
bst = lgb.train(params, ds, callbacks=[debias_callback])
np.testing.assert_allclose(bst.predict(df).mean(), y.mean())
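A note on why the callback removes the bias: with the gamma objective the prediction is the exponential of the raw score, so adding delta = log(mean(y) / mean(pred)) to every leaf of the latest tree multiplies every prediction by mean(y) / mean(pred), making the new mean prediction equal mean(y). A tiny standalone check of that arithmetic (numbers are hypothetical):

import numpy as np

pred = np.array([1.2, 0.7, 3.4])  # hypothetical gamma-model predictions (= exp(raw score))
y = np.array([1.0, 1.1, 2.9])     # hypothetical targets
delta = np.log(y.mean() / pred.mean())

# shifting every raw score by delta scales every prediction by exp(delta)
np.testing.assert_allclose((pred * np.exp(delta)).mean(), y.mean())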


@pytest.mark.parametrize("boosting_type", ["rf", "dart"])
def test_mape_for_specific_boosting_types(boosting_type):
X, y = make_synthetic_regression()