microsoft · shiyu1994 · May 7, 2021 · Mar 18, 2021 · Mar 18, 2021 · Mar 18, 2021
@@ -820,6 +820,12 @@ Dataset Parameters
 
    -  **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent function
 
+-  ``precise_float_parser`` :raw-html:`<a id="precise_float_parser" title="Permalink to this parameter" href="#precise_float_parser">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
+
+   -  use precise floating point number parsing for text parser (e.g. CSV, TSV, LibSVM input)
+
+   -  **Note**: setting this to ``true`` may lead to much slower text parsing
+
 Predict Parameters
 ~~~~~~~~~~~~~~~~~~
 

@@ -714,6 +714,10 @@ struct Config {
   // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent function
   bool save_binary = false;
 
+  // desc = use precise floating point number parsing for text parser (e.g. CSV, TSV, LibSVM input)
+  // desc = **Note**: setting this to ``true`` may lead to much slower text parsing
+  bool precise_float_parser = false;
+
   #pragma endregion
 
   #pragma region Predict Parameters

@@ -252,6 +252,8 @@ class Metadata {
 /*! \brief Interface for Parser */
 class Parser {
  public:
+  typedef const char* (*AtofFunc)(const char* p, double* out);
+
   /*! \brief virtual destructor */
   virtual ~Parser() {}
 
@@ -271,9 +273,10 @@ class Parser {
   * \param filename One Filename of data
   * \param num_features Pass num_features of this data file if you know, <=0 means don't know
   * \param label_idx index of label column
+  * \param precise_float_parser whether to use precise floating point number parsing
   * \return Object of parser
   */
-  static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx);
+  static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx, bool precise_float_parser);
 };
 
 /*! \brief The main class of data set,

@@ -18,6 +18,7 @@
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <functional>
 #include <iomanip>
@@ -330,6 +331,27 @@ inline static const char* Atof(const char* p, double* out) {
   return p;
 }
 
+// Parse a double with fast_double_parser, falling back to strtod on failure; returns the parser's end pointer.
+inline static const char* AtofPrecise(const char* p, double* out) {
+  const char* end = fast_double_parser::parse_number(p, out);
+
+  if (end != nullptr) {
+    return end;
+  }
+
+  // Rare path: not in RFC 7159 format (possibly "inf", "nan", etc.). Fall back to the standard library:
+  char* end2;
+  errno = 0;  // Must be reset before calling strtod so the ERANGE check below only sees this call's error.
+  *out = std::strtod(p, &end2);  // strtod is locale aware.
+  if (end2 == p) {
+    Log::Fatal("no conversion to double for: %s", p);
+  }
+  if (errno == ERANGE) {
+    Log::Fatal("convert to double got underflow or overflow: %s", p);
+  }
+  return end2;
+}
+
 inline static bool AtoiAndCheck(const char* p, int* out) {
   const char* after = Atoi(p, out);
   if (*after != '\0') {
@@ -1079,22 +1101,8 @@ struct __StringToTHelper<T, true> {
   T operator()(const std::string& str) const {
     double tmp;
 
-    // Fast (common) path: For numeric inputs in RFC 7159 format:
-    const bool fast_parse_succeeded = fast_double_parser::parse_number(str.c_str(), &tmp);
-
-    // Rare path: Not in RFC 7159 format. Possible "inf", "nan", etc.
-    if (!fast_parse_succeeded) {
-      std::string strlower(str);
-      std::transform(strlower.begin(), strlower.end(), strlower.begin(), [](int c) -> char { return static_cast<char>(::tolower(c)); });
-      if (strlower == std::string("inf"))
-        tmp = std::numeric_limits<double>::infinity();
-      else if (strlower == std::string("-inf"))
-        tmp = -std::numeric_limits<double>::infinity();
-      else if (strlower == std::string("nan"))
-        tmp = std::numeric_limits<double>::quiet_NaN();
-      else if (strlower == std::string("-nan"))
-        tmp = -std::numeric_limits<double>::quiet_NaN();
-      else
+    const char* end = Common::AtofPrecise(str.c_str(), &tmp);
+    if (end == str.c_str()) {
         Log::Fatal("Failed to parse double: %s", str.c_str());
     }
 

@@ -221,7 +221,8 @@ void Application::Predict() {
   if (config_.task == TaskType::KRefitTree) {
     // create predictor
     Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1);
-    predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check);
+    predictor.Predict(config_.data.c_str(), config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check,
+                      config_.precise_float_parser);
     TextReader<int> result_reader(config_.output_result.c_str(), false);
     result_reader.ReadAllLines();
     std::vector<std::vector<int>> pred_leaf(result_reader.Lines().size());
@@ -251,7 +252,8 @@ void Application::Predict() {
                         config_.pred_early_stop, config_.pred_early_stop_freq,
                         config_.pred_early_stop_margin);
     predictor.Predict(config_.data.c_str(),
-                      config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check);
+                      config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check,
+                      config_.precise_float_parser);
     Log::Info("Finished prediction");
   }
 }

@@ -160,13 +160,14 @@ class Predictor {
   * \param data_filename Filename of data
   * \param result_filename Filename of output result
   */
-  void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) {
+  void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) {
     auto writer = VirtualFileWriter::Make(result_filename);
     if (!writer->Init()) {
-      Log::Fatal("Prediction results file %s cannot be found", result_filename);
+      Log::Fatal("Prediction results file %s cannot be created", result_filename);
     }
     auto label_idx = header ? -1 : boosting_->LabelIdx();
-    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx));
+    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx,
+                                                               precise_float_parser));
 
     if (parser == nullptr) {
       Log::Fatal("Could not recognize the data format of data file %s", data_filename);

@@ -709,7 +709,8 @@ class Booster {
     Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib,
                         config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin);
     bool bool_data_has_header = data_has_header > 0 ? true : false;
-    predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check);
+    predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check,
+                      config.precise_float_parser);
   }
 
   void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) const {

@@ -261,6 +261,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "categorical_feature",
   "forcedbins_filename",
   "save_binary",
+  "precise_float_parser",
   "start_iteration_predict",
   "num_iteration_predict",
   "predict_raw_score",
@@ -527,6 +528,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetBool(params, "save_binary", &save_binary);
 
+  GetBool(params, "precise_float_parser", &precise_float_parser);
+
   GetInt(params, "start_iteration_predict", &start_iteration_predict);
 
   GetInt(params, "num_iteration_predict", &num_iteration_predict);
@@ -709,6 +712,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[ignore_column: " << ignore_column << "]\n";
   str_buf << "[categorical_feature: " << categorical_feature << "]\n";
   str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
+  str_buf << "[precise_float_parser: " << precise_float_parser << "]\n";
   str_buf << "[objective_seed: " << objective_seed << "]\n";
   str_buf << "[num_class: " << num_class << "]\n";
   str_buf << "[is_unbalance: " << is_unbalance << "]\n";

@@ -196,7 +196,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
   auto bin_filename = CheckCanLoadFromBin(filename);
   bool is_load_from_binary = false;
   if (bin_filename.size() == 0) {
-    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
+    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_,
+                                                               config_.precise_float_parser));
     if (parser == nullptr) {
       Log::Fatal("Could not recognize data format of %s", filename);
     }
@@ -267,7 +268,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
   }
   auto bin_filename = CheckCanLoadFromBin(filename);
   if (bin_filename.size() == 0) {
-    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
+    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_,
+                                                               config_.precise_float_parser));
     if (parser == nullptr) {
       Log::Fatal("Could not recognize data format of %s", filename);
     }

@@ -6,9 +6,6 @@
 
 #include <string>
 #include <algorithm>
-#include <fstream>
-#include <functional>
-#include <iostream>
 #include <memory>
 
 namespace LightGBM {
@@ -232,7 +229,7 @@ DataType GetDataType(const char* filename, bool header,
   return type;
 }
 
-Parser* Parser::CreateParser(const char* filename, bool header, int num_features, int label_idx) {
+Parser* Parser::CreateParser(const char* filename, bool header, int num_features, int label_idx, bool precise_float_parser) {
   const int n_read_line = 32;
   auto lines = ReadKLineFromFile(filename, header, n_read_line);
   int num_col = 0;
@@ -242,15 +239,16 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features
   }
   std::unique_ptr<Parser> ret;
   int output_label_index = -1;
+  AtofFunc atof = precise_float_parser ? Common::AtofPrecise : Common::Atof;
   if (type == DataType::LIBSVM) {
     output_label_index = GetLabelIdxForLibsvm(lines[0], num_features, label_idx);
-    ret.reset(new LibSVMParser(output_label_index, num_col));
+    ret.reset(new LibSVMParser(output_label_index, num_col, atof));
   } else if (type == DataType::TSV) {
     output_label_index = GetLabelIdxForTSV(lines[0], num_features, label_idx);
-    ret.reset(new TSVParser(output_label_index, num_col));
+    ret.reset(new TSVParser(output_label_index, num_col, atof));
   } else if (type == DataType::CSV) {
     output_label_index = GetLabelIdxForCSV(lines[0], num_features, label_idx);
-    ret.reset(new CSVParser(output_label_index, num_col));
+    ret.reset(new CSVParser(output_label_index, num_col, atof));
   }
 
   if (output_label_index < 0 && label_idx >= 0) {

@@ -17,8 +17,8 @@ namespace LightGBM {
 
 class CSVParser: public Parser {
  public:
-  explicit CSVParser(int label_idx, int total_columns)
-    :label_idx_(label_idx), total_columns_(total_columns) {
+  explicit CSVParser(int label_idx, int total_columns, AtofFunc atof)
+    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
   }
   inline void ParseOneLine(const char* str,
     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
@@ -27,7 +27,7 @@ class CSVParser: public Parser {
     int offset = 0;
     *out_label = 0.0f;
     while (*str != '\0') {
-      str = Common::Atof(str, &val);
+      str = atof_(str, &val);
       if (idx == label_idx_) {
         *out_label = val;
         offset = -1;
@@ -50,20 +50,21 @@ class CSVParser: public Parser {
  private:
   int label_idx_ = 0;
   int total_columns_ = -1;
+  AtofFunc atof_;
 };
 
 class TSVParser: public Parser {
  public:
-  explicit TSVParser(int label_idx, int total_columns)
-    :label_idx_(label_idx), total_columns_(total_columns) {
+  explicit TSVParser(int label_idx, int total_columns, AtofFunc atof)
+    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
   }
   inline void ParseOneLine(const char* str,
     std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
     int idx = 0;
     double val = 0.0f;
     int offset = 0;
     while (*str != '\0') {
-      str = Common::Atof(str, &val);
+      str = atof_(str, &val);
       if (idx == label_idx_) {
         *out_label = val;
         offset = -1;
@@ -86,12 +87,13 @@ class TSVParser: public Parser {
  private:
   int label_idx_ = 0;
   int total_columns_ = -1;
+  AtofFunc atof_;
 };
 
 class LibSVMParser: public Parser {
  public:
-  explicit LibSVMParser(int label_idx, int total_columns)
-    :label_idx_(label_idx), total_columns_(total_columns) {
+  explicit LibSVMParser(int label_idx, int total_columns, AtofFunc atof)
+    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
     if (label_idx > 0) {
       Log::Fatal("Label should be the first column in a LibSVM file");
     }
@@ -101,7 +103,7 @@ class LibSVMParser: public Parser {
     int idx = 0;
     double val = 0.0f;
     if (label_idx_ == 0) {
-      str = Common::Atof(str, &val);
+      str = atof_(str, &val);
       *out_label = val;
       str = Common::SkipSpaceAndTab(str);
     }
@@ -126,6 +128,7 @@ class LibSVMParser: public Parser {
  private:
   int label_idx_ = 0;
   int total_columns_ = -1;
+  AtofFunc atof_;
 };
 
 }  // namespace LightGBM