From 6c2d048c79075be9f124d46553715dbeae06471d Mon Sep 17 00:00:00 2001 From: btrotta Date: Wed, 14 Aug 2019 20:10:21 +1000 Subject: [PATCH] Fix style issues. --- docs/Parameters.rst | 4 +- .../regression}/forced_bins.json | 2 +- examples/regression/train.conf | 3 ++ include/LightGBM/config.h | 4 +- src/io/bin.cpp | 2 +- src/io/dataset.cpp | 40 ++++++++++--------- src/io/dataset_loader.cpp | 1 + tests/python_package_test/test_engine.py | 3 +- 8 files changed, 34 insertions(+), 25 deletions(-) rename {tests/data => examples/regression}/forced_bins.json (98%) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 10105bfbed5a..c4f45f0010c4 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -408,9 +408,9 @@ Learning Control Parameters - path to a ``.json`` file that specifies bin upper bounds for some or all features - - ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) + - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` diff --git a/tests/data/forced_bins.json b/examples/regression/forced_bins.json similarity index 98% rename from tests/data/forced_bins.json rename to examples/regression/forced_bins.json index aa74c36ffb78..1ee0a49d727c 100644 --- a/tests/data/forced_bins.json +++ b/examples/regression/forced_bins.json @@ -7,4 +7,4 @@ "feature": 1, "bin_upper_bound": [ -0.1, -0.15, -0.2 ] } -] \ No newline at end of file +] diff --git a/examples/regression/train.conf b/examples/regression/train.conf index 11396c23ecc2..4c73169dc8f9 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -29,6 +29,9 @@ is_training_metric = true # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. max_bin = 255 +# forced bin thresholds +# forcedbins_filename = forced_bins.json + # training data # if exsting weight file, should name to "regression.train.weight" # alias: train_data, train diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d2a953ddb416..56903a9b96ae 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -403,8 +403,8 @@ struct Config { std::string forcedsplits_filename = ""; // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features - // desc = ``.json`` file should contain an array of objects, each containing the name ``feature`` (integer feature number) and ``bin_upper_bounds`` (array of thresolds for binning) - // desc = see `this file `__ as an example + // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bounds`` (array of thresholds for binning) + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // check = >=0.0 diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 62713d1bddd3..2556a59b4715 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -320,7 +320,7 @@ namespace LightGBM { } } else if (missing_type_ == MissingType::None) { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, - min_data_in_bin, forced_upper_bounds); + min_data_in_bin, forced_upper_bounds); } else { bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin, forced_upper_bounds); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c931e945cd24..269c06c4c37d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -5,10 +5,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -1071,24 +1071,28 @@ std::vector> Dataset::GetForcedBins(std::string forced_bins_ std::vector> forced_bins(num_total_features, std::vector()); if (forced_bins_path != "") { std::ifstream forced_bins_stream(forced_bins_path.c_str()); - std::stringstream buffer; - buffer << forced_bins_stream.rdbuf(); - std::string err; - Json forced_bins_json = Json::parse(buffer.str(), err); - CHECK(forced_bins_json.is_array()); - std::vector forced_bins_arr = forced_bins_json.array_items(); - for (int i = 0; i < forced_bins_arr.size(); ++i) { - int feature_num = forced_bins_arr[i]["feature"].int_value(); - CHECK(feature_num < num_total_features); - std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); - for (int j = 0; j < bounds_arr.size(); ++j) { - forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + if (forced_bins_stream.fail()) { + Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str()); + } else { + std::stringstream buffer; + buffer << forced_bins_stream.rdbuf(); + std::string err; + Json forced_bins_json = Json::parse(buffer.str(), err); + CHECK(forced_bins_json.is_array()); + std::vector forced_bins_arr = forced_bins_json.array_items(); + for (int i = 0; i < forced_bins_arr.size(); ++i) { + int feature_num = forced_bins_arr[i]["feature"].int_value(); + CHECK(feature_num < num_total_features); + std::vector bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items(); + for (int j = 0; j < bounds_arr.size(); ++j) { + forced_bins[feature_num].push_back(bounds_arr[j].number_value()); + } + } + // remove duplicates + for (int i = 0; i < num_total_features; ++i) { + auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); + forced_bins[i].erase(new_end, forced_bins[i].end()); } - } - // remove duplicates - for (int i = 0; i < num_total_features; ++i) { - auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end()); - forced_bins[i].erase(new_end, forced_bins[i].end()); } } return forced_bins; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index f36d5b1df27d..eb83d74bfe3d 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2016 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + #include #include #include diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4eb1e2cb8e38..2420ee9ec853 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1596,7 +1596,8 @@ def test_forced_bins(self): x[:, 0] = np.arange(0, 1, 0.01) x[:, 1] = -np.arange(0, 1, 0.01) y = np.arange(0, 1, 0.01) - forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/forced_bins.json') + forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../../examples/regression/forced_bins.json') params = {'objective': 'regression_l1', 'max_bin': 6, 'forcedbins_filename': forcedbins_filename,