From e76c0a31c7e2ea5897ad3acbffd31f8678d8c324 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 18 Feb 2021 17:48:32 -0600 Subject: [PATCH] more changes --- CMakeLists.txt | 2 +- README.md | 2 +- docs/Experiments.rst | 2 +- docs/Features.rst | 10 ++++----- docs/Installation-Guide.rst | 2 +- docs/Parallel-Learning-Guide.rst | 21 ++++++++----------- docs/Parameters.rst | 10 ++++----- examples/binary_classification/train.conf | 6 +++--- .../binary_classification/train_linear.conf | 6 +++--- examples/lambdarank/train.conf | 6 +++--- examples/parallel_learning/README.md | 10 ++++----- examples/parallel_learning/train.conf | 6 +++--- examples/regression/train.conf | 6 +++--- examples/xendcg/train.conf | 6 +++--- include/LightGBM/config.h | 10 ++++----- include/LightGBM/dataset.h | 2 +- python-package/lightgbm/basic.py | 2 +- src/application/application.cpp | 2 +- src/io/config.cpp | 2 +- src/io/dataset_loader.cpp | 2 +- src/io/metadata.cpp | 4 ++-- 21 files changed, 58 insertions(+), 61 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3273ff135d81..29a786d3a506 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -OPTION(USE_MPI "Enable MPI-based parallel learning" OFF) +OPTION(USE_MPI "Enable MPI-based distributed learning" OFF) OPTION(USE_OPENMP "Enable OpenMP" ON) OPTION(USE_GPU "Enable GPU-accelerated training" OFF) OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF) diff --git a/README.md b/README.md index c8554cb1ef97..a0dd5c55a899 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Next you may want to read: - [**Examples**](https://github.com/microsoft/LightGBM/tree/master/examples) showing command line usage of common tasks. - [**Features**](https://github.com/microsoft/LightGBM/blob/master/docs/Features.rst) and algorithms supported by LightGBM. - [**Parameters**](https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst) is an exhaustive list of customization you can make. -- [**Parallel Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation. +- [**Distributed Learning Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation. - [**Laurae++ interactive documentation**](https://sites.google.com/view/lauraepp/parameters) is a detailed guide for hyperparameters. - [**Optuna Hyperparameter Tuner**](https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258) provides automated tuning for LightGBM hyperparameters ([code examples](https://github.com/optuna/optuna/blob/master/examples/)). diff --git a/docs/Experiments.rst b/docs/Experiments.rst index 9aa2d5d85c4b..175dc065b5b1 100644 --- a/docs/Experiments.rst +++ b/docs/Experiments.rst @@ -239,7 +239,7 @@ Results | 16 | 42 s | 11GB | +----------+---------------+---------------------------+ -The results show that LightGBM achieves a linear speedup with parallel learning. +The results show that LightGBM achieves a linear speedup with distributed learning. GPU Experiments --------------- diff --git a/docs/Features.rst b/docs/Features.rst index 85c919466977..08b7bb2f20df 100644 --- a/docs/Features.rst +++ b/docs/Features.rst @@ -28,7 +28,7 @@ LightGBM uses histogram-based algorithms\ `[4, 5, 6] <#references>`__, which buc - No need to store additional information for pre-sorting feature values -- **Reduce communication cost for parallel learning** +- **Reduce communication cost for distributed learning** Sparse Optimization ------------------- @@ -68,14 +68,14 @@ More specifically, LightGBM sorts the histogram (for a categorical feature) acco Optimization in Network Communication ------------------------------------- -It only needs to use some collective communication algorithms, like "All reduce", "All gather" and "Reduce scatter", in parallel learning of LightGBM. +It only needs to use some collective communication algorithms, like "All reduce", "All gather" and "Reduce scatter", in distributed learning of LightGBM. LightGBM implements state-of-art algorithms\ `[9] <#references>`__. These collective communication algorithms can provide much better performance than point-to-point communication. -Optimization in Parallel Learning ---------------------------------- +Optimization in Distributed Learning +------------------------------------ -LightGBM provides the following parallel learning algorithms. +LightGBM provides the following distributed learning algorithms. Feature Parallel ~~~~~~~~~~~~~~~~ diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 33610f4ddafb..bceb5150f2c6 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -348,7 +348,7 @@ Build MPI Version The default build version of LightGBM is based on socket. LightGBM also supports MPI. `MPI`_ is a high performance communication approach with `RDMA`_ support. -If you need to run a parallel learning application with high performance communication, you can build the LightGBM with MPI support. +If you need to run a distributed learning application with high performance communication, you can build the LightGBM with MPI support. Windows ^^^^^^^ diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 6c027a6349b7..6e3fcb60ef6c 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -13,7 +13,7 @@ This section describes how distributed learning in LightGBM works. To learn how Choose Appropriate Parallel Algorithm ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -LightGBM provides 3 parallel learning algorithms. +LightGBM provides 3 distributed learning algorithms. +--------------------+---------------------------+ | Parallel Algorithm | How to Use | @@ -35,7 +35,7 @@ These algorithms are suited for different scenarios, which is listed in the foll | **#feature is large** | Feature Parallel | Voting Parallel | +-------------------------+-------------------+-----------------+ -More details about these parallel algorithms can be found in `optimization in parallel learning <./Features.rst#optimization-in-parallel-learning>`__. +More details about these parallel algorithms can be found in `optimization in distributed learning <./Features.rst#optimization-in-distributed-learning>`__. Integrations ------------ @@ -68,20 +68,17 @@ Kubeflow integrations for LightGBM are not maintained by LightGBM's maintainers. LightGBM CLI ^^^^^^^^^^^^ -Build Parallel Version -'''''''''''''''''''''' +Preparation +''''''''''' -Default build version support parallel learning based on the socket. +By default, distributed learning with LightGBM uses socket-based communication. If you need to build parallel version with MPI support, please refer to `Installation Guide <./Installation-Guide.rst#build-mpi-version>`__. -Preparation -''''''''''' - Socket Version ************** -It needs to collect IP of all machines that want to run parallel learning in and allocate one TCP port (assume 12345 here) for all machines, +It needs to collect IP of all machines that want to run distributed learning in and allocate one TCP port (assume 12345 here) for all machines, and change firewall rules to allow income of this port (12345). Then write these IP and ports in one file (assume ``mlist.txt``), like following: .. code:: @@ -92,7 +89,7 @@ and change firewall rules to allow income of this port (12345). Then write these MPI Version *********** -It needs to collect IP (or hostname) of all machines that want to run parallel learning in. +It needs to collect IP (or hostname) of all machines that want to run distributed learning in. Then write these IP in one file (assume ``mlist.txt``) like following: .. code:: @@ -102,8 +99,8 @@ Then write these IP in one file (assume ``mlist.txt``) like following: **Note**: For Windows users, need to start "smpd" to start MPI service. More details can be found `here`_. -Run Parallel Learning -''''''''''''''''''''' +Run Distributed Learning +'''''''''''''''''''''''' Socket Version ************** diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 38973770e5fe..678b7e95d66b 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -181,7 +181,7 @@ Core Parameters - ``voting``, voting parallel tree learner, aliases: ``voting_parallel`` - - refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details + - refer to `Distributed Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details - ``num_threads`` :raw-html:`🔗︎`, default = ``0``, type = int, aliases: ``num_thread``, ``nthread``, ``nthreads``, ``n_jobs`` @@ -195,7 +195,7 @@ Core Parameters - be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal** - - for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication + - for distributed learning, do not use all CPU cores because this will cause poor performance for the network communication - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors @@ -714,7 +714,7 @@ Dataset Parameters - ``pre_partition`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``is_pre_partition`` - - used for parallel learning (excluding the ``feature_parallel`` mode) + - used for distributed learning (excluding the ``feature_parallel`` mode) - ``true`` if training data are pre-partitioned, and different machines use different partitions @@ -1133,7 +1133,7 @@ Network Parameters - ``num_machines`` :raw-html:`🔗︎`, default = ``1``, type = int, aliases: ``num_machine``, constraints: ``num_machines > 0`` - - the number of machines for parallel learning application + - the number of machines for distributed learning application - this parameter is needed to be set in both **socket** and **mpi** versions @@ -1149,7 +1149,7 @@ Network Parameters - ``machine_list_filename`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``machine_list_file``, ``machine_list``, ``mlist`` - - path of file that lists machines for this parallel learning application + - path of file that lists machines for this distributed learning application - each line contains one IP and one port for one machine. The format is ``ip port`` (space as a separator) diff --git a/examples/binary_classification/train.conf b/examples/binary_classification/train.conf index e4ca69b1dcd3..f9788aae592a 100644 --- a/examples/binary_classification/train.conf +++ b/examples/binary_classification/train.conf @@ -98,13 +98,13 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 1 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt # force splits diff --git a/examples/binary_classification/train_linear.conf b/examples/binary_classification/train_linear.conf index 616d5fc39e35..e47cc58cd124 100644 --- a/examples/binary_classification/train_linear.conf +++ b/examples/binary_classification/train_linear.conf @@ -100,13 +100,13 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 1 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt # force splits diff --git a/examples/lambdarank/train.conf b/examples/lambdarank/train.conf index d3ead83f8d5c..16192f222f7f 100644 --- a/examples/lambdarank/train.conf +++ b/examples/lambdarank/train.conf @@ -103,11 +103,11 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 1 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt diff --git a/examples/parallel_learning/README.md b/examples/parallel_learning/README.md index 11c1d2607e28..e4252452c335 100644 --- a/examples/parallel_learning/README.md +++ b/examples/parallel_learning/README.md @@ -1,7 +1,7 @@ -Parallel Learning Example -========================= +Distributed Learning Example +============================ -Here is an example for LightGBM to perform parallel learning for 2 machines. +Here is an example for LightGBM to perform distributed learning for 2 machines. 1. Edit [mlist.txt](./mlist.txt): write the ip of these 2 machines that you want to run application on. @@ -16,6 +16,6 @@ Here is an example for LightGBM to perform parallel learning for 2 machines. ```"./lightgbm" config=train.conf``` -This parallel learning example is based on socket. LightGBM also supports parallel learning based on mpi. +This distributed learning example is based on socket. LightGBM also supports distributed learning based on mpi. -For more details about the usage of parallel learning, please refer to [this](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst). +For more details about the usage of distributed learning, please refer to [this](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst). diff --git a/examples/parallel_learning/train.conf b/examples/parallel_learning/train.conf index 6076a80887ca..dbc58b8234c8 100644 --- a/examples/parallel_learning/train.conf +++ b/examples/parallel_learning/train.conf @@ -98,11 +98,11 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 2 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt diff --git a/examples/regression/train.conf b/examples/regression/train.conf index b62e99d7dc27..7fac419a5ba4 100644 --- a/examples/regression/train.conf +++ b/examples/regression/train.conf @@ -101,11 +101,11 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 1 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt diff --git a/examples/xendcg/train.conf b/examples/xendcg/train.conf index 4715841ca64f..c98870ed8461 100644 --- a/examples/xendcg/train.conf +++ b/examples/xendcg/train.conf @@ -104,11 +104,11 @@ output_model = LightGBM_model.txt # output_result= prediction.txt -# number of machines in parallel training, alias: num_machine +# number of machines in distributed training, alias: num_machine num_machines = 1 -# local listening port in parallel training, alias: local_port +# local listening port in distributed training, alias: local_port local_listen_port = 12400 -# machines list file for parallel training, alias: mlist +# machines list file for distributed training, alias: mlist machine_list_file = mlist.txt diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4d0686912091..b3b46b05656b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -200,7 +200,7 @@ struct Config { // desc = ``feature``, feature parallel tree learner, aliases: ``feature_parallel`` // desc = ``data``, data parallel tree learner, aliases: ``data_parallel`` // desc = ``voting``, voting parallel tree learner, aliases: ``voting_parallel`` - // desc = refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details + // desc = refer to `Distributed Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details std::string tree_learner = "serial"; // alias = num_thread, nthread, nthreads, n_jobs @@ -209,7 +209,7 @@ struct Config { // desc = for the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPUs use `hyper-threading `__ to generate 2 threads per CPU core) // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows) // desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal** - // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication + // desc = for distributed learning, do not use all CPU cores because this will cause poor performance for the network communication // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors int num_threads = 0; @@ -634,7 +634,7 @@ struct Config { bool feature_pre_filter = true; // alias = is_pre_partition - // desc = used for parallel learning (excluding the ``feature_parallel`` mode) + // desc = used for distributed learning (excluding the ``feature_parallel`` mode) // desc = ``true`` if training data are pre-partitioned, and different machines use different partitions bool pre_partition = false; @@ -961,7 +961,7 @@ struct Config { // check = >0 // alias = num_machine - // desc = the number of machines for parallel learning application + // desc = the number of machines for distributed learning application // desc = this parameter is needed to be set in both **socket** and **mpi** versions int num_machines = 1; @@ -976,7 +976,7 @@ struct Config { int time_out = 120; // alias = machine_list_file, machine_list, mlist - // desc = path of file that lists machines for this parallel learning application + // desc = path of file that lists machines for this distributed learning application // desc = each line contains one IP and one port for one machine. The format is ``ip port`` (space as a separator) // desc = **Note**: can be used only in CLI version std::string machine_list_filename = ""; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 90f48e70c744..61989e221bcc 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -80,7 +80,7 @@ class Metadata { /*! * \brief Partition meta data according to local used indices if need - * \param num_all_data Number of total training data, including other machines' data on parallel learning + * \param num_all_data Number of total training data, including other machines' data on distributed learning * \param used_data_indices Indices of local used training data */ void CheckOrPartition(data_size_t num_all_data, diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b036c4f81b2a..39a4d8e9da57 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2333,7 +2333,7 @@ def set_network(self, machines, local_listen_port=12400, listen_time_out : int, optional (default=120) Socket time-out in minutes. num_machines : int, optional (default=1) - The number of machines for parallel learning application. + The number of machines for distributed learning application. Returns ------- diff --git a/src/application/application.cpp b/src/application/application.cpp index 62583db72b6c..e82cfcada98f 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -105,7 +105,7 @@ void Application::LoadData() { config_.num_class, config_.data.c_str()); // load Training data if (config_.is_data_based_parallel) { - // load data for parallel training + // load data for distributed training train_data_.reset(dataset_loader.LoadFromFile(config_.data.c_str(), Network::rank(), Network::num_machines())); } else { diff --git a/src/io/config.cpp b/src/io/config.cpp index dc04cb972d23..fbb9e339933f 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -374,7 +374,7 @@ void Config::CheckParamConflict() { } if (is_parallel && (monotone_constraints_method == std::string("intermediate") || monotone_constraints_method == std::string("advanced"))) { // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. - Log::Warning("Cannot use \"intermediate\" or \"advanced\" monotone constraints in parallel learning, auto set to \"basic\" method."); + Log::Warning("Cannot use \"intermediate\" or \"advanced\" monotone constraints in distributed learning, auto set to \"basic\" method."); monotone_constraints_method = "basic"; } if (feature_fraction_bynode != 1.0 && (monotone_constraints_method == std::string("intermediate") || monotone_constraints_method == std::string("advanced"))) { diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 05c65ee16744..545ffcaad849 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -183,7 +183,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac // don't support query id in data file when training in parallel if (num_machines > 1 && !config_.pre_partition) { if (group_idx_ > 0) { - Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training.\n" + Log::Fatal("Using a query id without pre-partitioning the data file is not supported for distributed training.\n" "Please use an additional query file or pre-partition the data"); } } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 8ab4da8d74f2..63a1690906a2 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -22,7 +22,7 @@ Metadata::Metadata() { void Metadata::Init(const char* data_filename) { data_filename_ = data_filename; - // for lambdarank, it needs query data for partition data in parallel learning + // for lambdarank, it needs query data for partition data in distributed learning LoadQueryBoundaries(); LoadWeights(); LoadQueryWeights(); @@ -187,7 +187,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector(used_data_indices.size()); // check weights