-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Predefined bin thresholds #2325
Changes from 45 commits
fe5c8e2
439bcfd
34e72c8
5b21573
2be599a
6a098f0
0cd4abc
8f73636
6c2d048
feb861f
873fa64
050f57b
4cd89e4
698d9db
9d22071
3178609
934b305
dc45bd1
018182c
7a4df51
6095148
8b57a56
de83a69
01f18fd
360eacf
c478775
e3f1835
2280c56
76fa4cc
93d92eb
fec30a5
503e7b4
eecb80c
a02b3a3
cb12379
abe95d7
7aed689
35ce38b
28c0462
23dbb29
9ed04a3
7cdc732
51e93a9
4e3355a
821b2ab
8a52444
c591e7b
9c767ae
cf0afd4
25387ec
cc249f0
0e26e9f
feeb163
50ff73b
b5752ec
58d86aa
3e81b94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
[ | ||
{ | ||
"feature": 0, | ||
"bin_upper_bound": [ 0.3, 0.35, 0.4 ] | ||
}, | ||
{ | ||
"feature": 1, | ||
"bin_upper_bound": [ -0.1, -0.15, -0.2 ] | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,7 @@ namespace LightGBM { | |
return true; | ||
} | ||
|
||
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts, | ||
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts, | ||
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) { | ||
std::vector<double> bin_upper_bound; | ||
CHECK(max_bin > 0); | ||
|
@@ -149,8 +149,106 @@ namespace LightGBM { | |
return bin_upper_bound; | ||
} | ||
|
||
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, | ||
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { | ||
std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts, | ||
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could we add an independent function, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @guolinke Yes, I can make that change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is that possible to add predefined bin based on the results of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping @btrotta There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @guolinke I think that would result in sub-optimal choice of bins. Currently my code takes into account the forced bins when choosing the remaining ones, so that we can find evenly sized bins (i.e. it will not choose thresholds too close to the forced thresholds). But if we use |
||
std::vector<double> bin_upper_bound; | ||
|
||
// get list of distinct values | ||
int left_cnt_data = 0; | ||
int cnt_zero = 0; | ||
int right_cnt_data = 0; | ||
for (int i = 0; i < num_distinct_values; ++i) { | ||
if (distinct_values[i] <= -kZeroThreshold) { | ||
left_cnt_data += counts[i]; | ||
} else if (distinct_values[i] > kZeroThreshold) { | ||
right_cnt_data += counts[i]; | ||
} else { | ||
cnt_zero += counts[i]; | ||
} | ||
} | ||
|
||
// get number of positive and negative distinct values | ||
int left_cnt = -1; | ||
for (int i = 0; i < num_distinct_values; ++i) { | ||
if (distinct_values[i] > -kZeroThreshold) { | ||
left_cnt = i; | ||
break; | ||
} | ||
} | ||
if (left_cnt < 0) { | ||
left_cnt = num_distinct_values; | ||
} | ||
int right_start = -1; | ||
for (int i = left_cnt; i < num_distinct_values; ++i) { | ||
if (distinct_values[i] > kZeroThreshold) { | ||
right_start = i; | ||
break; | ||
} | ||
} | ||
|
||
// include zero bounds and infinity bound | ||
if (max_bin == 2) { | ||
if (left_cnt == 0) { | ||
bin_upper_bound.push_back(kZeroThreshold); | ||
} else { | ||
bin_upper_bound.push_back(-kZeroThreshold); | ||
} | ||
} else if (max_bin >= 3) { | ||
if (left_cnt > 0) { | ||
bin_upper_bound.push_back(-kZeroThreshold); | ||
} | ||
if (right_start >= 0) { | ||
bin_upper_bound.push_back(kZeroThreshold); | ||
} | ||
} | ||
bin_upper_bound.push_back(std::numeric_limits<double>::infinity()); | ||
|
||
// add forced bounds, excluding zeros since we have already added zero bounds | ||
int i = 0; | ||
while (i < forced_upper_bounds.size()) { | ||
if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) { | ||
forced_upper_bounds.erase(forced_upper_bounds.begin() + i); | ||
} else { | ||
++i; | ||
} | ||
} | ||
int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size()); | ||
int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size())); | ||
if (num_to_insert > 0) { | ||
bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert); | ||
} | ||
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); | ||
|
||
// find remaining bounds | ||
std::vector<double> bounds_to_add; | ||
int value_ind = 0; | ||
for (int i = 0; i < bin_upper_bound.size(); ++i) { | ||
int cnt_in_bin = 0; | ||
int distinct_cnt_in_bin = 0; | ||
int bin_start = value_ind; | ||
while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) { | ||
cnt_in_bin += counts[value_ind]; | ||
++distinct_cnt_in_bin; | ||
++value_ind; | ||
} | ||
int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size()); | ||
int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt))); | ||
num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1; | ||
if (i == bin_upper_bound.size() - 1) { | ||
num_sub_bins = bins_remaining + 1; | ||
} | ||
std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin, | ||
num_sub_bins, cnt_in_bin, min_data_in_bin); | ||
bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity | ||
} | ||
bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end()); | ||
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end()); | ||
CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin)); | ||
return bin_upper_bound; | ||
} | ||
|
||
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, | ||
int max_bin, size_t total_sample_cnt, int min_data_in_bin) { | ||
std::vector<double> bin_upper_bound; | ||
int left_cnt_data = 0; | ||
int cnt_zero = 0; | ||
|
@@ -207,8 +305,19 @@ namespace LightGBM { | |
return bin_upper_bound; | ||
} | ||
|
||
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values, | ||
int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use |
||
if (forced_upper_bounds.empty()) { | ||
return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); | ||
} else { | ||
return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin, | ||
forced_upper_bounds); | ||
} | ||
} | ||
|
||
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, | ||
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) { | ||
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing, | ||
std::vector<double> forced_upper_bounds) { | ||
int na_cnt = 0; | ||
int tmp_num_sample_values = 0; | ||
for (int i = 0; i < num_sample_values; ++i) { | ||
|
@@ -276,14 +385,17 @@ namespace LightGBM { | |
int num_distinct_values = static_cast<int>(distinct_values.size()); | ||
if (bin_type_ == BinType::NumericalBin) { | ||
if (missing_type_ == MissingType::Zero) { | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, | ||
min_data_in_bin, forced_upper_bounds); | ||
if (bin_upper_bound_.size() == 2) { | ||
missing_type_ = MissingType::None; | ||
} | ||
} else if (missing_type_ == MissingType::None) { | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, | ||
min_data_in_bin, forced_upper_bounds); | ||
} else { | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); | ||
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, | ||
min_data_in_bin, forced_upper_bounds); | ||
bin_upper_bound_.push_back(NaN); | ||
} | ||
num_bin_ = static_cast<int>(bin_upper_bound_.size()); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use `const T&` for container parameters everywhere.