Skip to content

Commit

Permalink
stable multi-threading sum reduction (#3385)
Browse files Browse the repository at this point in the history
* Update serial_tree_learner.cpp

* Update src/treelearner/serial_tree_learner.cpp

* stable multi-threading reduction

* Update src/treelearner/serial_tree_learner.cpp

* more fixes

* Apply suggestions from code review

* Apply suggestions from code review

* Update src/boosting/gbdt.cpp
  • Loading branch information
guolinke authored Sep 30, 2020
1 parent f8f6c51 commit 692c9a5
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 19 deletions.
41 changes: 41 additions & 0 deletions include/LightGBM/utils/threading.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class Threading {
BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
block_size);
}

template <typename INDEX_T>
static inline void BlockInfo(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block, int* out_nblock,
Expand All @@ -38,6 +39,7 @@ class Threading {
*block_size = cnt;
}
}

template <typename INDEX_T>
static inline void BlockInfoForceSize(int num_threads, INDEX_T cnt,
INDEX_T min_cnt_per_block,
Expand All @@ -55,6 +57,14 @@ class Threading {
}
}

template <typename INDEX_T>
static inline void BlockInfoForceSize(INDEX_T cnt, INDEX_T min_cnt_per_block,
                                      int* out_nblock, INDEX_T* block_size) {
  // Convenience overload: picks up the currently configured OpenMP thread
  // count and forwards to the explicit-thread-count version.
  BlockInfoForceSize<INDEX_T>(OMP_NUM_THREADS(), cnt, min_cnt_per_block,
                              out_nblock, block_size);
}

template <typename INDEX_T>
static inline int For(
INDEX_T start, INDEX_T end, INDEX_T min_block_size,
Expand All @@ -74,6 +84,37 @@ class Threading {
OMP_THROW_EX();
return n_block;
}

template <typename INDEX_T, typename VAL1_T, typename VAL2_T>
static inline int SumReduction(
    INDEX_T start, INDEX_T end, INDEX_T min_block_size,
    const std::function<void(int, INDEX_T, INDEX_T, VAL1_T* res1,
                             VAL2_T* res2)>& inner_fun,
    VAL1_T* res1, VAL2_T* res2) {
  // Two-value parallel sum over [start, end).
  // The range is split into fixed-size blocks (independent of the thread
  // count) and the per-block partial sums are combined sequentially, so the
  // floating-point accumulation order — and hence the result — is stable.
  int num_blocks = 1;
  INDEX_T block_len = end - start;
  BlockInfoForceSize<INDEX_T>(end - start, min_block_size, &num_blocks,
                              &block_len);
  // One partial-sum slot per block: blocks never write shared state.
  std::vector<VAL1_T> partial1(num_blocks, static_cast<VAL1_T>(0));
  std::vector<VAL2_T> partial2(num_blocks, static_cast<VAL2_T>(0));
  OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
  for (int block = 0; block < num_blocks; ++block) {
    OMP_LOOP_EX_BEGIN();
    const INDEX_T lo = start + block_len * block;
    const INDEX_T hi = std::min(end, lo + block_len);
    inner_fun(block, lo, hi, &partial1[block], &partial2[block]);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
  // Sequential final reduction in block order keeps the output
  // deterministic regardless of how many threads ran above.
  *res1 = 0;
  *res2 = 0;
  for (int block = 0; block < num_blocks; ++block) {
    *res1 += partial1[block];
    *res2 += partial2[block];
  }
  return num_blocks;
}
};

template <typename INDEX_T, bool TWO_BUFFER>
Expand Down
41 changes: 22 additions & 19 deletions src/treelearner/leaf_splits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/utils/threading.h>

#include <limits>
#include <vector>
Expand Down Expand Up @@ -67,15 +68,16 @@ class LeafSplits {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
Threading::SumReduction<data_size_t, double, double>(
0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
*s1 = *s2 = 0;
for (data_size_t i = start; i < end; ++i) {
*s1 += gradients[i];
*s2 += hessians[i];
}
},
&sum_gradients_, &sum_hessians_);
}

/*!
Expand All @@ -88,16 +90,17 @@ class LeafSplits {
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) {
leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0f;
double tmp_sum_hessians = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx];
tmp_sum_hessians += hessians[idx];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
Threading::SumReduction<data_size_t, double, double>(
0, num_data_in_leaf_, 2048,
[=](int, data_size_t start, data_size_t end, double* s1, double* s2) {
*s1 = *s2 = 0;
for (data_size_t i = start; i < end; ++i) {
data_size_t idx = data_indices_[i];
*s1 += gradients[idx];
*s2 += hessians[idx];
}
},
&sum_gradients_, &sum_hessians_);
}


Expand Down

0 comments on commit 692c9a5

Please sign in to comment.