Support UTF-8 characters in feature name again (#2976)
* Support UTF-8 characters in feature name again

This commit reverts 0d59859.
Also see:
- #2226
- #2478
- #2229

I reproduced the issue, and as @kidotaka's thorough survey in #2226 shows,
I conclude that the cause is not UTF-8 but an empty string (character).
Therefore, I revert "throw error when meet non ascii (#2229)", whose commit hash
is 0d59859, and add support for UTF-8 feature names again.
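
For reference, a minimal Python sketch (illustration only, not part of this commit) of why the reverted check rejected UTF-8 names: CheckASCII refuses any byte above 127, and UTF-8 encodes non-ASCII characters as multi-byte sequences whose bytes all fall in that range.

    # Illustration only: how the reverted CheckASCII saw a UTF-8 feature name.
    name = u"F_零"
    raw = name.encode("utf-8")
    print(list(raw))                   # [70, 95, 233, 155, 182]
    print(all(b <= 127 for b in raw))  # False -> the old check raised a fatal error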

* add tests

* fix check-docs tests

* update

* fix tests

* update .travis.yml

* fix tests

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* add a test for R-package

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* fix test for R-package

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* update

* update

* update

* remove unneeded comments
henry0312 authored Apr 10, 2020
1 parent c633c6c commit 44a9120
Showing 7 changed files with 44 additions and 22 deletions.
24 changes: 24 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -571,3 +571,27 @@ test_that("lgb.train() works with early stopping for regression", {
        , early_stopping_rounds + 1L
    )
})

test_that("lgb.train() supports non-ASCII feature names", {
    testthat::skip("UTF-8 feature names are not fully supported in the R package")
    dtrain <- lgb.Dataset(
        data = matrix(rnorm(400L), ncol = 4L)
        , label = rnorm(100L)
    )
    feature_names <- c("F_零", "F_一", "F_二", "F_三")
    bst <- lgb.train(
        data = dtrain
        , nrounds = 5L
        , obj = "regression"
        , params = list(
            metric = "rmse"
        )
        , colnames = feature_names
    )
    expect_true(lgb.is.Booster(bst))
    dumped_model <- jsonlite::fromJSON(bst$dump_model())
    expect_identical(
        dumped_model[["feature_names"]]
        , feature_names
    )
})
5 changes: 0 additions & 5 deletions include/LightGBM/dataset.h
@@ -8,7 +8,6 @@
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
@@ -633,10 +632,6 @@ class Dataset {
    // replace ' ' in feature_names with '_'
    bool spaceInFeatureName = false;
    for (auto& feature_name : feature_names_) {
      // check ascii
      if (!Common::CheckASCII(feature_name)) {
        Log::Fatal("Do not support non-ASCII characters in feature name.");
      }
      // check json
      if (!Common::CheckAllowedJSON(feature_name)) {
        Log::Fatal("Do not support special JSON characters in feature name.");
9 changes: 0 additions & 9 deletions include/LightGBM/utils/common.h
@@ -921,15 +921,6 @@ static T SafeLog(T x) {
  }
}

inline bool CheckASCII(const std::string& s) {
  for (auto c : s) {
    if (static_cast<unsigned char>(c) > 127) {
      return false;
    }
  }
  return true;
}

inline bool CheckAllowedJSON(const std::string& s) {
  unsigned char char_code;
  for (auto c : s) {
8 changes: 4 additions & 4 deletions python-package/lightgbm/basic.py
@@ -2536,7 +2536,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0):
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = string_buffer.value.decode()
        ret = string_buffer.value.decode('utf-8')
        ret += _dump_pandas_categorical(self.pandas_categorical)
        return ret

@@ -2582,7 +2582,7 @@ def dump_model(self, num_iteration=None, start_iteration=0):
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = json.loads(string_buffer.value.decode())
        ret = json.loads(string_buffer.value.decode('utf-8'))
        ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical,
                                                          default=json_default_with_numpy))
        return ret
@@ -2754,7 +2754,7 @@ def feature_name(self):
                "Allocated feature name buffer size ({}) was inferior to the needed size ({})."
                .format(reserved_string_buffer_size, required_string_buffer_size.value)
            )
        return [string_buffers[i].value.decode() for i in range_(num_feature)]
        return [string_buffers[i].value.decode('utf-8') for i in range_(num_feature)]

    def feature_importance(self, importance_type='split', iteration=None):
        """Get feature importances.
@@ -2954,7 +2954,7 @@ def __get_eval_info(self):
                        .format(reserved_string_buffer_size, required_string_buffer_size.value)
                    )
                self.__name_inner_eval = \
                    [string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
                    [string_buffers[i].value.decode('utf-8') for i in range_(self.__num_inner_eval)]
                self.__higher_better_inner_eval = \
                    [name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]

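The decode('utf-8') changes above make the codec explicit; this matters wherever the default codec is not UTF-8 (for example Python 2's str.decode(), which falls back to ASCII and fails on multi-byte names). A minimal sketch, illustration only and not part of the diff:

    # Illustration only: decoding the raw bytes returned by the C API buffer.
    buf = u"F_零".encode("utf-8")   # b'F_\xe9\x9b\xb6'
    print(buf.decode("utf-8"))      # 'F_零' regardless of interpreter defaults
    # buf.decode() without a codec only works where the default is UTF-8;
    # with an ASCII default it raises UnicodeDecodeError on bytes above 127.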
3 changes: 0 additions & 3 deletions src/io/config.cpp
@@ -20,9 +20,6 @@ void Config::KV2Map(std::unordered_map<std::string, std::string>* params, const
  if (tmp_strs.size() == 2) {
    value = Common::RemoveQuotationSymbol(Common::Trim(tmp_strs[1]));
  }
  if (!Common::CheckASCII(key) || !Common::CheckASCII(value)) {
    Log::Fatal("Do not support non-ASCII characters in config.");
  }
  if (key.size() > 0) {
    auto value_search = params->find(key);
    if (value_search == params->end()) {  // not set
2 changes: 1 addition & 1 deletion tests/c_api_test/test_.py
@@ -58,7 +58,7 @@ def c_array(ctype, values):


def c_str(string):
    return ctypes.c_char_p(string.encode('ascii'))
    return ctypes.c_char_p(string.encode('utf-8'))


def load_from_file(filename, reference):
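The c_str() change in this test helper follows the same logic from the other direction: encoding a non-ASCII string with the ASCII codec fails outright, so the buffer handed to the C API has to be built with UTF-8. Roughly, as an illustration only:

    import ctypes
    data = u"F_零".encode("utf-8")   # b'F_\xe9\x9b\xb6' -- succeeds
    # u"F_零".encode("ascii")        # would raise UnicodeEncodeError
    ptr = ctypes.c_char_p(data)      # usable as a C char* argument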
15 changes: 15 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -747,6 +747,21 @@ def test_feature_name(self):
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
        self.assertListEqual(feature_names, gbm.feature_name())

    def test_feature_name_with_non_ascii(self):
        X_train = np.random.normal(size=(100, 4))
        y_train = np.random.random(100)
        # This has non-ascii strings.
        feature_names = [u'F_零', u'F_一', u'F_二', u'F_三']
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)

        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
        self.assertListEqual(feature_names, gbm.feature_name())
        gbm.save_model('lgb.model')

        gbm2 = lgb.Booster(model_file='lgb.model')
        self.assertListEqual(feature_names, gbm2.feature_name())

    def test_save_load_copy_pickle(self):
        def train_and_predict(init_model=None, return_model=False):
            X, y = load_boston(True)
