Support UTF-8 characters in feature name again (#2976)
* Support UTF-8 characters in feature name again

This commit reverts 0d59859.
Also see:
- #2226
- #2478
- #2229

I reproduced the issue, and as @kidotaka's thorough survey in #2226 shows,
I conclude that the cause is not UTF-8 but an empty string (character).
Therefore, I revert "throw error when meet non ascii (#2229)", whose commit hash
is 0d59859, and add support for UTF-8 feature names again.
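
For reference, a minimal Python sketch (illustration only, not part of this commit) of why the reverted check rejected UTF-8 names: CheckASCII refuses any byte above 127, and UTF-8 encodes non-ASCII characters as multi-byte sequences whose bytes all fall in that range.

    # Illustration only: how the reverted CheckASCII saw a UTF-8 feature name.
    name = u"F_零"
    raw = name.encode("utf-8")
    print(list(raw))                   # [70, 95, 233, 155, 182]
    print(all(b <= 127 for b in raw))  # False -> the old check raised a fatal error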

* add tests

* fix check-docs tests

* update

* fix tests

* update .travis.yml

* fix tests

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* add a test for R-package

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* fix test for R-package

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* update test_r_package.sh

* update

* update

* update

* remove unneeded comments
henry0312 authored Apr 10, 2020
1 parent c633c6c commit 44a9120
Showing 7 changed files with 44 additions and 22 deletions.
24 changes: 24 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -571,3 +571,27 @@ test_that("lgb.train() works with early stopping for regression", {
        , early_stopping_rounds + 1L
    )
})

test_that("lgb.train() supports non-ASCII feature names", {
    testthat::skip("UTF-8 feature names are not fully supported in the R package")
    dtrain <- lgb.Dataset(
        data = matrix(rnorm(400L), ncol = 4L)
        , label = rnorm(100L)
    )
    feature_names <- c("F_零", "F_一", "F_二", "F_三")
    bst <- lgb.train(
        data = dtrain
        , nrounds = 5L
        , obj = "regression"
        , params = list(
            metric = "rmse"
        )
        , colnames = feature_names
    )
    expect_true(lgb.is.Booster(bst))
    dumped_model <- jsonlite::fromJSON(bst$dump_model())
    expect_identical(
        dumped_model[["feature_names"]]
        , feature_names
    )
})
5 changes: 0 additions & 5 deletions include/LightGBM/dataset.h
@@ -8,7 +8,6 @@
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
@@ -633,10 +632,6 @@ class Dataset {
    // replace ' ' in feature_names with '_'
    bool spaceInFeatureName = false;
    for (auto& feature_name : feature_names_) {
      // check ascii
      if (!Common::CheckASCII(feature_name)) {
        Log::Fatal("Do not support non-ASCII characters in feature name.");
      }
      // check json
      if (!Common::CheckAllowedJSON(feature_name)) {
        Log::Fatal("Do not support special JSON characters in feature name.");
9 changes: 0 additions & 9 deletions include/LightGBM/utils/common.h
@@ -921,15 +921,6 @@ static T SafeLog(T x) {
  }
}

inline bool CheckASCII(const std::string& s) {
  for (auto c : s) {
    if (static_cast<unsigned char>(c) > 127) {
      return false;
    }
  }
  return true;
}

inline bool CheckAllowedJSON(const std::string& s) {
  unsigned char char_code;
  for (auto c : s) {
8 changes: 4 additions & 4 deletions python-package/lightgbm/basic.py
@@ -2536,7 +2536,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0):
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = string_buffer.value.decode()
        ret = string_buffer.value.decode('utf-8')
        ret += _dump_pandas_categorical(self.pandas_categorical)
        return ret

@@ -2582,7 +2582,7 @@ def dump_model(self, num_iteration=None, start_iteration=0):
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = json.loads(string_buffer.value.decode())
        ret = json.loads(string_buffer.value.decode('utf-8'))
        ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical,
                                                          default=json_default_with_numpy))
        return ret
@@ -2754,7 +2754,7 @@ def feature_name(self):
                "Allocated feature name buffer size ({}) was inferior to the needed size ({})."
                .format(reserved_string_buffer_size, required_string_buffer_size.value)
            )
        return [string_buffers[i].value.decode() for i in range_(num_feature)]
        return [string_buffers[i].value.decode('utf-8') for i in range_(num_feature)]

    def feature_importance(self, importance_type='split', iteration=None):
        """Get feature importances.
@@ -2954,7 +2954,7 @@ def __get_eval_info(self):
                        .format(reserved_string_buffer_size, required_string_buffer_size.value)
                    )
                self.__name_inner_eval = \
                    [string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
                    [string_buffers[i].value.decode('utf-8') for i in range_(self.__num_inner_eval)]
                self.__higher_better_inner_eval = \
                    [name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]

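The decode('utf-8') changes above make the codec explicit; this matters wherever the default codec is not UTF-8 (for example Python 2's str.decode(), which falls back to ASCII and fails on multi-byte names). A minimal sketch, illustration only and not part of the diff:

    # Illustration only: decoding the raw bytes returned by the C API buffer.
    buf = u"F_零".encode("utf-8")   # b'F_\xe9\x9b\xb6'
    print(buf.decode("utf-8"))      # 'F_零' regardless of interpreter defaults
    # buf.decode() without a codec only works where the default is UTF-8;
    # with an ASCII default it raises UnicodeDecodeError on bytes above 127.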
3 changes: 0 additions & 3 deletions src/io/config.cpp
@@ -20,9 +20,6 @@ void Config::KV2Map(std::unordered_map<std::string, std::string>* params, const
  if (tmp_strs.size() == 2) {
    value = Common::RemoveQuotationSymbol(Common::Trim(tmp_strs[1]));
  }
  if (!Common::CheckASCII(key) || !Common::CheckASCII(value)) {
    Log::Fatal("Do not support non-ASCII characters in config.");
  }
  if (key.size() > 0) {
    auto value_search = params->find(key);
    if (value_search == params->end()) {  // not set
2 changes: 1 addition & 1 deletion tests/c_api_test/test_.py
@@ -58,7 +58,7 @@ def c_array(ctype, values):


def c_str(string):
    return ctypes.c_char_p(string.encode('ascii'))
    return ctypes.c_char_p(string.encode('utf-8'))


def load_from_file(filename, reference):
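The c_str() change in this test helper follows the same logic from the other direction: encoding a non-ASCII string with the ASCII codec fails outright, so the buffer handed to the C API has to be built with UTF-8. Roughly, as an illustration only:

    import ctypes
    data = u"F_零".encode("utf-8")   # b'F_\xe9\x9b\xb6' -- succeeds
    # u"F_零".encode("ascii")        # would raise UnicodeEncodeError
    ptr = ctypes.c_char_p(data)      # usable as a C char* argument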
15 changes: 15 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -747,6 +747,21 @@ def test_feature_name(self):
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
        self.assertListEqual(feature_names, gbm.feature_name())

    def test_feature_name_with_non_ascii(self):
        X_train = np.random.normal(size=(100, 4))
        y_train = np.random.random(100)
        # This has non-ascii strings.
        feature_names = [u'F_零', u'F_一', u'F_二', u'F_三']
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)

        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
        self.assertListEqual(feature_names, gbm.feature_name())
        gbm.save_model('lgb.model')

        gbm2 = lgb.Booster(model_file='lgb.model')
        self.assertListEqual(feature_names, gbm2.feature_name())

    def test_save_load_copy_pickle(self):
        def train_and_predict(init_model=None, return_model=False):
            X, y = load_boston(True)
