Skip to content

Commit

Permalink
Add functionality to force bin thresholds.
Browse files Browse the repository at this point in the history
  • Loading branch information
btrotta committed Aug 13, 2019
1 parent 439bcfd commit 34e72c8
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 34 deletions.
8 changes: 8 additions & 0 deletions docs/Parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,14 @@ Learning Control Parameters

- see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example

- ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string

- path to a ``.json`` file that specifies bin upper bounds for some or all features

  - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bound`` (array of thresholds for binning)

- see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example

- ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``

- decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
Expand Down
3 changes: 2 additions & 1 deletion include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,10 @@ class BinMapper {
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
* \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
bool use_missing, bool zero_as_missing, std::vector<double> forced_upper_bounds);

/*!
* \brief Use specific number of bin to calculate the size of this class
Expand Down
5 changes: 5 additions & 0 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@ struct Config {
// desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
std::string forcedsplits_filename = "";

// desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
  // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bound`` (array of thresholds for binning)
// desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/tests/data/forced_bins.json>`__ as an example
std::string forcedbins_filename = "";

// check = >=0.0
// check = <=1.0
// desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
Expand Down
3 changes: 3 additions & 0 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,8 @@ class Dataset {

void addFeaturesFrom(Dataset* other);

static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features);

private:
std::string data_filename_;
/*! \brief Store used features */
Expand Down Expand Up @@ -630,6 +632,7 @@ class Dataset {
bool is_finish_load_;
int max_bin_;
std::vector<int32_t> max_bin_by_feature_;
std::vector<std::vector<double>> forced_bin_bounds_;
int bin_construct_sample_cnt_;
int min_data_in_bin_;
bool use_missing_;
Expand Down
86 changes: 64 additions & 22 deletions src/io/bin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,10 @@ namespace LightGBM {
}

std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, std::vector<double> forced_upper_bounds) {
std::vector<double> bin_upper_bound;

// get list of distinct values
int left_cnt_data = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
Expand All @@ -165,25 +167,17 @@ namespace LightGBM {
}
}

// get number of positive and negative distinct values
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
}
}

if (left_cnt < 0) {
left_cnt = num_distinct_values;
}

if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
left_max_bin = std::max(1, left_max_bin);
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroThreshold;
}

int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
Expand All @@ -192,21 +186,66 @@ namespace LightGBM {
}
}

if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
CHECK(right_max_bin > 0);
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
// include zero bounds if possible
if (max_bin == 2) {
if (left_cnt == 0) {
bin_upper_bound.push_back(kZeroThreshold);
} else {
bin_upper_bound.push_back(-kZeroThreshold);
}
} else if (max_bin >= 3) {
bin_upper_bound.push_back(-kZeroThreshold);
bin_upper_bound.push_back(kZeroThreshold);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}

// add forced bounds, excluding zeros since we have already added zero bounds
int i = 0;
while (i < forced_upper_bounds.size()) {
if (std::fabs(forced_upper_bounds[i]) <= kZeroThreshold) {
forced_upper_bounds.erase(forced_upper_bounds.begin() + i);
} else {
++i;
}
}
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
int num_to_insert = std::min(max_to_insert, static_cast<int>(forced_upper_bounds.size()));
if (num_to_insert > 0) {
bin_upper_bound.insert(bin_upper_bound.end(), forced_upper_bounds.begin(), forced_upper_bounds.begin() + num_to_insert);
}
std::sort(bin_upper_bound.begin(), bin_upper_bound.end());

// find remaining bounds
std::vector<double> bounds_to_add;
int value_ind = 0;
for (int i = 0; i < bin_upper_bound.size(); ++i) {
int cnt_in_bin = 0;
int distinct_cnt_in_bin = 0;
int bin_start = value_ind;
while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
cnt_in_bin += counts[value_ind];
++distinct_cnt_in_bin;
++value_ind;
}
int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * bins_remaining / total_sample_cnt)));
num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
if (i == bin_upper_bound.size() - 1) {
num_sub_bins = bins_remaining + 1;
}
std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
num_sub_bins, cnt_in_bin, min_data_in_bin);
bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity
}
bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
std::sort(bin_upper_bound.begin(), bin_upper_bound.end());
CHECK(bin_upper_bound.size() <= max_bin);
return bin_upper_bound;
}

void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing,
std::vector<double> forced_upper_bounds) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
Expand Down Expand Up @@ -274,14 +313,17 @@ namespace LightGBM {
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
min_data_in_bin, forced_upper_bounds);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
Expand Down
4 changes: 4 additions & 0 deletions src/io/config_auto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ std::unordered_set<std::string> Config::parameter_set({
"monotone_constraints",
"feature_contri",
"forcedsplits_filename",
"forcedbins_filename",
"refit_decay_rate",
"cegb_tradeoff",
"cegb_penalty_split",
Expand Down Expand Up @@ -396,6 +397,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str

GetString(params, "forcedsplits_filename", &forcedsplits_filename);

GetString(params, "forcedbins_filename", &forcedbins_filename);

GetDouble(params, "refit_decay_rate", &refit_decay_rate);
CHECK(refit_decay_rate >=0.0);
CHECK(refit_decay_rate <=1.0);
Expand Down Expand Up @@ -608,6 +611,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
Expand Down
64 changes: 63 additions & 1 deletion src/io/dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,17 @@
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/json11.hpp>

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <fstream>
#include <limits>
#include <sstream>
#include <unordered_map>

using namespace json11;


namespace LightGBM {

Expand Down Expand Up @@ -324,6 +329,7 @@ void Dataset::Construct(
max_bin_by_feature_.resize(num_total_features_);
max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
}
forced_bin_bounds_ = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
max_bin_ = io_config.max_bin;
min_data_in_bin_ = io_config.min_data_in_bin;
bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
Expand Down Expand Up @@ -356,6 +362,12 @@ void Dataset::ResetConfig(const char* parameters) {
if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
}
if (param.count("forcedbins_filename")) {
std::vector<std::vector<double>> config_bounds = Dataset::GetForcedBins(io_config.forcedbins_filename, num_total_features_);
if (config_bounds != forced_bin_bounds_) {
Log::Warning("Cannot change forced bins after constructed Dataset handle.");
}
}

if (!io_config.monotone_constraints.empty()) {
CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
Expand Down Expand Up @@ -657,6 +669,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
}
// size of forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
}
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->Write(&num_data_, sizeof(num_data_));
Expand Down Expand Up @@ -705,6 +721,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
const char* c_str = feature_names_[i].c_str();
writer->Write(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->Write(&num_bounds, sizeof(int));

for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
}
}

// get size of meta data
size_t size_of_metadata = metadata_.SizesInByte();
Expand Down Expand Up @@ -754,6 +779,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
for (auto n : feature_names_) {
fprintf(file, "%s, ", n.c_str());
}
fprintf(file, "\nforced_bins: ");
for (int i = 0; i < num_total_features_; ++i) {
fprintf(file, "\nfeature %d: ", i);
for (int j = 0; j < forced_bin_bounds_[i].size(); ++j) {
fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
}
}
std::vector<std::unique_ptr<BinIterator>> iterators;
iterators.reserve(num_features_);
for (int j = 0; j < num_features_; ++j) {
Expand Down Expand Up @@ -1005,6 +1037,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
PushVector(feature_names_, other->feature_names_);
PushVector(feature2subfeature_, other->feature2subfeature_);
PushVector(group_feature_cnt_, other->group_feature_cnt_);
PushVector(forced_bin_bounds_, other->forced_bin_bounds_);
feature_groups_.reserve(other->feature_groups_.size());
for (auto& fg : other->feature_groups_) {
feature_groups_.emplace_back(new FeatureGroup(*fg));
Expand All @@ -1027,10 +1060,39 @@ void Dataset::addFeaturesFrom(Dataset* other) {

PushClearIfEmpty(monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
PushClearIfEmpty(feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);

PushClearIfEmpty(max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
num_features_ += other->num_features_;
num_total_features_ += other->num_total_features_;
num_groups_ += other->num_groups_;
}


std::vector<std::vector<double>> Dataset::GetForcedBins(std::string forced_bins_path, int num_total_features) {
std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
if (forced_bins_path != "") {
std::ifstream forced_bins_stream(forced_bins_path.c_str());
std::stringstream buffer;
buffer << forced_bins_stream.rdbuf();
std::string err;
Json forced_bins_json = Json::parse(buffer.str(), err);
CHECK(forced_bins_json.is_array());
std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
for (int i = 0; i < forced_bins_arr.size(); ++i) {
int feature_num = forced_bins_arr[i]["feature"].int_value();
CHECK(feature_num < num_total_features);
std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
for (int j = 0; j < bounds_arr.size(); ++j) {
forced_bins[feature_num].push_back(bounds_arr[j].number_value());
}
}
// remove duplicates
for (int i = 0; i < num_total_features; ++i) {
auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
forced_bins[i].erase(new_end, forced_bins[i].end());
}
}
return forced_bins;
}


} // namespace LightGBM
Loading

0 comments on commit 34e72c8

Please sign in to comment.