Fix number of features mismatch in continual training (fix #5156) #5157
base: master
@@ -632,31 +632,54 @@ std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type
   }
 
   std::vector<double> feature_importances(max_feature_idx_ + 1, 0.0);
+  bool warn_about_feature_number = false;
+  int max_feature_index_found = -1;
   if (importance_type == 0) {
     for (int iter = 0; iter < num_used_model; ++iter) {
       for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
         if (models_[iter]->split_gain(split_idx) > 0) {
+          const int real_feature_index = models_[iter]->split_feature(split_idx);
 #ifdef DEBUG
-          CHECK_GE(models_[iter]->split_feature(split_idx), 0);
+          CHECK_GE(real_feature_index, 0);
 #endif
-          feature_importances[models_[iter]->split_feature(split_idx)] += 1.0;
+          if (static_cast<size_t>(real_feature_index) >= feature_importances.size()) {
+            warn_about_feature_number = true;
+            if (real_feature_index > max_feature_index_found) {
+              max_feature_index_found = real_feature_index;
+            }
+          } else {
+            feature_importances[real_feature_index] += 1.0;
+          }
         }
       }
     }
   } else if (importance_type == 1) {
     for (int iter = 0; iter < num_used_model; ++iter) {
       for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
         if (models_[iter]->split_gain(split_idx) > 0) {
+          const int real_feature_index = models_[iter]->split_feature(split_idx);
 #ifdef DEBUG
-          CHECK_GE(models_[iter]->split_feature(split_idx), 0);
+          CHECK_GE(real_feature_index, 0);
 #endif
-          feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx);
+          if (static_cast<size_t>(real_feature_index) >= feature_importances.size()) {
+            warn_about_feature_number = true;
+            if (real_feature_index > max_feature_index_found) {
+              max_feature_index_found = real_feature_index;
+            }
+          } else {
+            feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx);
+          }
         }
       }
     }
   } else {
     Log::Fatal("Unknown importance type: only support split=0 and gain=1");
   }
+  if (warn_about_feature_number) {
+    Log::Warning("Only %d features found in dataset for continual training, but at least %d features found in initial model.",
+                 static_cast<int>(feature_importances.size()), max_feature_index_found + 1);
+    Log::Warning("Please check the number of features used in continual training.");
+  }
   return feature_importances;
 }
 

Review thread on the Log::Warning call above:

Reviewer: Is it possible to add some information here that can help users understand what they should do about this warning? Or is this warning just being raised for the purpose of the unit test?

Author: I've added Log::Warning("Please check the number of features used in continual training."); in d719d78. Do you think it is appropriate?

Reviewer: Thanks! But I don't think "please check" will be any more informative for users, and I think that providing two separate WARN-level log messages for the same problem might be confusing. Would you consider this?

    Log::Warning(
        "Only %d features found in dataset, but at least %d features found in initial model. "
        "If this is intentional and the features in dataset match those used to train the initial model, no action is required. "
        "If not, training continuation may fail or produce bad results. "
        "To suppress this warning, provide a dataset with at least %d features.",
        static_cast<int>(feature_importances.size()),
        max_feature_index_found + 1,
        max_feature_index_found + 1);
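To make the new code path concrete, here is a hedged end-to-end sketch of the training-continuation scenario the warning targets. It is not taken from the PR: the data shapes, parameter values, and the exact log text are illustrative.

```python
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)

# Initial model trained on 10 features.
X_init = rng.random((200, 10))
y_init = X_init.sum(axis=1)
params = {
    "objective": "regression",
    "num_leaves": 31,
    "verbose": -1,
    # Without this, the init-score prediction on the smaller continuation
    # dataset fails on the feature-count check (see the test and review
    # thread further below).
    "predict_disable_shape_check": True,
}
init_booster = lgb.train(params, lgb.Dataset(X_init, label=y_init), num_boost_round=10)

# Continued training on a dataset with only 5 features.
X_cont = rng.random((100, 5))
y_cont = X_cont.sum(axis=1)
booster = lgb.train(dict(params, verbose=1),  # keep warnings visible
                    lgb.Dataset(X_cont, label=y_cont),
                    num_boost_round=10, init_model=init_booster)
# With this patch, the log should contain something along the lines of:
#   [LightGBM] [Warning] Only 5 features found in dataset for continual
#   training, but at least N features found in initial model.
```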
@@ -970,6 +970,29 @@ def test_continue_train_multiclass():
     assert evals_result['valid_0']['multi_logloss'][-1] == pytest.approx(ret)
 
 
+def test_continue_train_different_feature_size(capsys):
+    np.random.seed(0)
+    train_X = np.hstack([np.ones(800).reshape(-1, 8), np.arange(200, 0, -1).reshape(-1, 2)])
+    train_y = np.sum(train_X[:, -2:], axis=1)
+    train_data = lgb.Dataset(train_X, label=train_y)
+    params = {
+        "objective": "regression",
+        "num_trees": 10,
+        "num_leaves": 31,
+        "verbose": -1,
+        'predict_disable_shape_check': True,
+    }
+    model = lgb.train(train_set=train_data, params=params)
+
+    train_X_cont = np.random.rand(100, 5)
+    train_y_cont = np.sum(train_X_cont, axis=1)
+    train_data_cont = lgb.Dataset(train_X_cont, label=train_y_cont)
+    params.update({"verbose": 2})
+    lgb.train(train_set=train_data_cont, params=params, init_model=model)
+    captured = capsys.readouterr()
+    assert captured.out.find("features found in dataset for continual training, but at least") != -1
+
+
 def test_cv():
     X_train, y_train = make_synthetic_regression()
     params = {'verbose': -1}

Review thread on the predict_disable_shape_check parameter above:

Reviewer: Why is it necessary to pass predict_disable_shape_check?

Author: Ah, good observation! But see LightGBM/python-package/lightgbm/engine.py lines 153 to 154 and 160 to 161, and LightGBM/python-package/lightgbm/basic.py lines 1519 to 1522, 1393, and 1399 to 1402 (at 0018206).

Reviewer: Ah, I see! So then is the bug addressed by this PR only present when you've also set predict_disable_shape_check?

Author: Yes, without setting predict_disable_shape_check an error is raised instead. I think the second sentence of that error ("You can set ...") ...
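For completeness, a hedged sketch of the behaviour the thread describes when predict_disable_shape_check is not set. Shapes and parameters are illustrative; the exception comes from LightGBM's existing feature-count check during init-score prediction, not from this PR.

```python
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X_init = rng.random((200, 10))
base = lgb.train({"objective": "regression", "verbose": -1},
                 lgb.Dataset(X_init, label=X_init.sum(axis=1)),
                 num_boost_round=10)

X_cont = rng.random((100, 5))  # fewer features than the initial model
try:
    lgb.train({"objective": "regression", "verbose": -1},
              lgb.Dataset(X_cont, label=X_cont.sum(axis=1)),
              num_boost_round=10, init_model=base)
except Exception as err:
    # The init-score prediction on the continuation dataset rejects the
    # feature-count mismatch before any boosting happens.
    print(err)
```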
Provide a warning instead of enlarging the feature importance vector: since GBDT::FeatureImportance is used by the Python API and the Python code does not know the final size of the feature importance vector, dynamically enlarging the vector would cause out-of-bounds access.
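For context on that out-of-bounds concern, here is a minimal sketch, not the actual python-package code, of the caller pattern this refers to. It assumes the C-API entry point LGBM_BoosterFeatureImportance(handle, num_iteration, importance_type, out_results) and that the wrapper sizes the output buffer from the feature count it already knows; if the C++ side produced more entries than that, copying them back would run past the buffer.

```python
import ctypes

import numpy as np


def feature_importance_sketch(lib, booster_handle, num_feature,
                              importance_type=0, num_iteration=0):
    # Caller-sized buffer: its length is fixed before the C++ side runs,
    # which is why GBDT::FeatureImportance must not return more than
    # num_feature values.
    result = np.zeros(num_feature, dtype=np.float64)
    lib.LGBM_BoosterFeatureImportance(
        booster_handle,
        ctypes.c_int(num_iteration),      # assumed: <= 0 means "all iterations"
        ctypes.c_int(importance_type),    # 0 = split counts, 1 = total gain
        result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
    return result
```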