From b0137debe6e9cc92b65ec71b0fe8a56ea213c143 Mon Sep 17 00:00:00 2001
From: chjinche <49483542+chjinche@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:27:23 +0800
Subject: [PATCH] Add customized parser support (#4782)
* add customized parser support
* fix typo of parser_config_file description
* make delimiter as parameter of JoinedLines
---
README.md | 2 +
docs/Parameters.rst | 8 ++++
include/LightGBM/boosting.h | 2 +
include/LightGBM/config.h | 5 +++
include/LightGBM/dataset.h | 59 +++++++++++++++++++++++++
include/LightGBM/utils/common.h | 25 +++++++++++
include/LightGBM/utils/text_reader.h | 11 +++++
src/application/predictor.hpp | 6 ++-
src/boosting/gbdt.cpp | 3 ++
src/boosting/gbdt.h | 4 ++
src/boosting/gbdt_model_text.cpp | 31 ++++++++++++-
src/io/config_auto.cpp | 4 ++
src/io/dataset_loader.cpp | 36 ++++++++++++---
src/io/parser.cpp | 56 +++++++++++++++++++++++
tests/python_package_test/test_basic.py | 12 +++++
15 files changed, 253 insertions(+), 11 deletions(-)
diff --git a/README.md b/README.md
index 921bfb76f308..da11b743a924 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,8 @@ MLflow (experiment tracking, model monitoring framework): https://github.com/mlf
`{mlr3extralearners}` (R `{mlr3}`-compliant interface): https://github.com/mlr-org/mlr3extralearners
+lightgbm-transform (feature transformation binding): https://github.com/microsoft/lightgbm-transform
+
Support
-------
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 75bef7add9bc..7ace90d9b34d 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -850,6 +850,14 @@ Dataset Parameters
- **Note**: setting this to ``true`` may lead to much slower text parsing
+- ``parser_config_file`` :raw-html:`🔗︎`, default = ``""``, type = string
+
+ - path to a ``.json`` file that specifies customized parser initialized configuration
+
+ - see `lightgbm-transform <https://github.com/microsoft/lightgbm-transform>`__ for usage examples
+
+ - **Note**: ``lightgbm-transform`` is not maintained by LightGBM's maintainers. Bug reports or feature requests should go to `issues page <https://github.com/microsoft/lightgbm-transform/issues>`__
+
Predict Parameters
~~~~~~~~~~~~~~~~~~
diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h
index ddbcdbc18e44..7530495c0e17 100644
--- a/include/LightGBM/boosting.h
+++ b/include/LightGBM/boosting.h
@@ -314,6 +314,8 @@ class LIGHTGBM_EXPORT Boosting {
static Boosting* CreateBoosting(const std::string& type, const char* filename);
virtual bool IsLinear() const { return false; }
+
+ virtual std::string ParserConfigStr() const = 0;
};
class GBDTBase : public Boosting {
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index da43a5ec9782..50371f3a2d91 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -721,6 +721,11 @@ struct Config {
// desc = **Note**: setting this to ``true`` may lead to much slower text parsing
bool precise_float_parser = false;
+ // desc = path to a ``.json`` file that specifies customized parser initialized configuration
+ // desc = see `lightgbm-transform <https://github.com/microsoft/lightgbm-transform>`__ for usage examples
+ // desc = **Note**: ``lightgbm-transform`` is not maintained by LightGBM's maintainers. Bug reports or feature requests should go to `issues page <https://github.com/microsoft/lightgbm-transform/issues>`__
+ std::string parser_config_file = "";
+
#pragma endregion
#pragma region Predict Parameters
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index abef980f5fbe..cf19429322ee 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -15,6 +15,7 @@
#include
#include
+#include