From 31733892c97a0eb790e09fa390a80f1d97a7a87b Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:27:56 +0100
Subject: [PATCH 01/56] Add currently used SYCL implementation (hipSYCL or
 DPC++) to output.

---
 src/plssvm/backends/SYCL/csvm.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 4a8777dec..9f70416fc 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -63,7 +63,12 @@ csvm<T>::csvm(const parameter<T> &params) :
     }
 
     if (print_info_) {
-        fmt::print("Using SYCL as backend.\n");
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+        fmt::print("Using SYCL (hipSYCL) as backend.\n");
+#endif
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+        fmt::print("Using SYCL (DPC++) as backend.\n");
+#endif
     }
 
     // get all available devices wrt the requested target platform
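A minimal, self-contained sketch of the compile-time dispatch pattern used in patch 01, collapsed into a single helper. The PLSSVM_SYCL_BACKEND_COMPILER* macros are normally set by the build system (patch 03's CMake uses 1 for hipSYCL); the fallback values below are placeholders for illustration only, not taken from the project.

#include <string_view>

#ifndef PLSSVM_SYCL_BACKEND_COMPILER
    // placeholder definitions so the sketch compiles standalone; real values come from CMake
    #define PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL 1
    #define PLSSVM_SYCL_BACKEND_COMPILER_DPCPP 2
    #define PLSSVM_SYCL_BACKEND_COMPILER PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
#endif

// resolves to a string literal at compile time; unmatched values fall back to "unknown"
constexpr std::string_view sycl_implementation_name() {
#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
    return "hipSYCL";
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
    return "DPC++";
#else
    return "unknown";
#endif
}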
From 33fb1d261425d3c6f3d646c3b8753465865923d5 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:39:30 +0100
Subject: [PATCH 02/56] Add timing information to predict functionality.

---
 src/plssvm/backends/gpu_csvm.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/gpu_csvm.cpp b/src/plssvm/backends/gpu_csvm.cpp
index 8a50aedf4..79cf2c012 100644
--- a/src/plssvm/backends/gpu_csvm.cpp
+++ b/src/plssvm/backends/gpu_csvm.cpp
@@ -39,6 +39,9 @@ gpu_csvm<T, device_ptr_t, queue_t>::gpu_csvm(const parameter<T> &params) :
 
 template <typename T, typename device_ptr_t, typename queue_t>
 auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<real_type>> &points) -> std::vector<real_type> {
+    // time prediction
+    auto start_time = std::chrono::steady_clock::now();
+
     using namespace plssvm::operators;
 
     PLSSVM_ASSERT(data_ptr_ != nullptr, "No data is provided!");  // exception in constructor
@@ -103,6 +106,11 @@ auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<
         }
     }
 
+    auto end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("Predicted in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+    }
+
     return out;
 }
 
@@ -380,4 +388,4 @@ template class gpu_csvm<float, ::plssvm::sycl::detail::device_ptr<float>, ::sycl
 template class gpu_csvm<double, ::plssvm::sycl::detail::device_ptr<double>, ::sycl::queue>;
 #endif
 
-}  // namespace plssvm::detail
\ No newline at end of file
+}  // namespace plssvm::detail
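The timing added in patch 02 follows a simple, reusable pattern; here is a hedged, standalone sketch of it. 'do_prediction' and 'print_info' are placeholders, not plssvm symbols.

#include <chrono>
#include <cstdio>

void do_prediction() { /* stand-in for the actual prediction work */ }

void timed_predict(const bool print_info) {
    // steady_clock is monotonic and unaffected by system clock adjustments,
    // which makes it the right clock for measuring elapsed time
    const auto start_time = std::chrono::steady_clock::now();
    do_prediction();
    const auto end_time = std::chrono::steady_clock::now();
    if (print_info) {
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
        std::printf("Predicted in %lld ms.\n", static_cast<long long>(elapsed.count()));
    }
}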
From 3a5f7ab3a374b25dd951dba51f8b6dd5c80cd024 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:46:34 +0100
Subject: [PATCH 03/56] Add -sycl-std=2020 flags to hipSYCL and DPC++.

---
 src/plssvm/backends/SYCL/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt
index 8d484c644..b65956d58 100644
--- a/src/plssvm/backends/SYCL/CMakeLists.txt
+++ b/src/plssvm/backends/SYCL/CMakeLists.txt
@@ -58,10 +58,10 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL")
     # set backend compiler to hipSYCL (= 1)
     target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1)
     # silence unknown options warnings
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -Wno-unknown-warning-option)
 elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++")
     # enable DPC++ SYCL support
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl)
     target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)
 
     # nvidia targets
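One way to check, per translation unit, that the -sycl-std=2020 flag from patch 03 actually took effect is to inspect the SYCL_LANGUAGE_VERSION macro that SYCL implementations define. The 202000 threshold below is an assumption (SYCL 2020 revisions use values of the form 2020xx, but the exact value is implementation-specific), so treat this as a hedged sketch rather than a portable guarantee.

#include <sycl/sycl.hpp>

#if !defined(SYCL_LANGUAGE_VERSION) || SYCL_LANGUAGE_VERSION < 202000
    #error "expected a SYCL 2020 (or newer) implementation"
#endif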
From fa8ee450c3c9a7cc81e640e41a902b41b9135788 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Fri, 4 Feb 2022 17:41:07 +0100
Subject: [PATCH 04/56] Change predict kernel to use hierarchical SYCL notation
 if hipSYCL is used (faster on the CPU)

---
 .../plssvm/backends/SYCL/predict_kernel.hpp | 28 +++++-----
 src/plssvm/backends/SYCL/csvm.cpp           | 52 +++++++++++++++++--
 2 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp
index 98b666676..d8568facf 100644
--- a/include/plssvm/backends/SYCL/predict_kernel.hpp
+++ b/include/plssvm/backends/SYCL/predict_kernel.hpp
@@ -72,12 +72,15 @@ class device_kernel_w_linear {
  * @brief Predicts the labels for data points using the polynomial kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
+ * @tparam U the type of the `sycl::item`
  */
-template <typename T>
+template <typename T, typename U>
 class device_kernel_predict_poly {
   public:
     /// The type of the data.
     using real_type = T;
+    /// The `sycl::item` type.
+    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function.
@@ -99,12 +102,11 @@ class device_kernel_predict_poly {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-        const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+    void operator()(sycl_item_type idx) const {
+        const kernel_index_type data_point_index = idx.get_global_id(0);
+        const kernel_index_type predict_point_index = idx.get_global_id(1);
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
@@ -140,12 +142,15 @@ class device_kernel_predict_poly {
  * @brief Predicts the labels for data points using the radial basis functions kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
+ * @tparam U the type of the `sycl::item`
  */
-template <typename T>
+template <typename T, typename U>
 class device_kernel_predict_radial {
   public:
     /// The type of the data.
     using real_type = T;
+    /// The `sycl::item` type
+    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function.
@@ -165,12 +170,11 @@ class device_kernel_predict_radial {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-        const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+    void operator()(sycl_item_type idx) const {
+        const kernel_index_type data_point_index = idx.get_global_id(0);
+        const kernel_index_type predict_point_index = idx.get_global_id(1);
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 4a8777dec..c5f7282fb 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -180,16 +180,62 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 }
 
 template <typename T>
-void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) {
+void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
     const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
     switch (kernel_) {
         case kernel_type::linear:
            break;
         case kernel_type::polynomial:
-            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_));
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler& cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const int degree = degree_;
+                    const real_type gamma = gamma_;
+                    const real_type coef0 = coef0_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
+                    });
+                });
+            }
+#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
+#endif
            break;
         case kernel_type::rbf:
-            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_));
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler& cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const real_type gamma = gamma_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
+                    });
+                });
+            }
+#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
+#endif
            break;
     }
 }
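The hipSYCL branch above uses SYCL's hierarchical parallelism. A minimal, self-contained sketch of that submission pattern, with placeholder sizes and a trivial per-element operation (none of the names below are plssvm symbols; 'data' is assumed to be a USM device-accessible allocation of 4*8 by 4*8 floats):

#include <cstddef>
#include <sycl/sycl.hpp>

void hierarchical_double(::sycl::queue &queue, float *data) {
    const ::sycl::range<2> num_groups{ 4, 4 };  // first argument: number of work-groups ...
    const ::sycl::range<2> group_size{ 8, 8 };  // ... second argument: work-items per group
    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(num_groups, group_size, [=](::sycl::group<2> group) {
            // this scope executes once per work-group
            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
                // this scope executes once per work-item; consecutive
                // parallel_for_work_item calls are separated by implicit group barriers
                const std::size_t row = idx.get_global_id(0);
                const std::size_t col = idx.get_global_id(1);
                data[row * (4 * 8) + col] *= 2.0f;  // 4 * 8 == columns in the global range
            });
        });
    });
    queue.wait();
}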
From 727e7002027d30edc9f2918527b0bd1cd1ae1135 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Sun, 6 Feb 2022 18:49:07 +0100
Subject: [PATCH 05/56] Add missing prediction timing output to OpenMP backend
 and add number of predict points to GPU backends.

---
 src/plssvm/backends/OpenMP/csvm.cpp | 8 ++++++++
 src/plssvm/backends/gpu_csvm.cpp    | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp
index e9d1ccaa0..f0fcb961d 100644
--- a/src/plssvm/backends/OpenMP/csvm.cpp
+++ b/src/plssvm/backends/OpenMP/csvm.cpp
@@ -184,6 +184,9 @@ void csvm<T>::update_w() {
 
 template <typename T>
 auto csvm<T>::predict(const std::vector<std::vector<real_type>> &points) -> std::vector<real_type> {
+    // time prediction
+    auto start_time = std::chrono::steady_clock::now();
+
     using namespace plssvm::operators;
 
     PLSSVM_ASSERT(data_ptr_ != nullptr, "No data is provided!");  // exception in constructor
@@ -228,6 +231,11 @@ auto csvm<T>::predict(const std::vector<std::vector<real_type>> &points) -> std:
         }
     }
 
+    auto end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("Predicted {} data points in {}.\n", points.size(), std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+    }
+
     return out;
 }
 
diff --git a/src/plssvm/backends/gpu_csvm.cpp b/src/plssvm/backends/gpu_csvm.cpp
index 79cf2c012..b07a12e89 100644
--- a/src/plssvm/backends/gpu_csvm.cpp
+++ b/src/plssvm/backends/gpu_csvm.cpp
@@ -108,7 +108,7 @@ auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<
 
     auto end_time = std::chrono::steady_clock::now();
     if (print_info_) {
-        fmt::print("Predicted in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+        fmt::print("Predicted {} data points in {}.\n", points.size(), std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
     }
 
     return out;
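A small sketch of the logging call used above: {fmt} can format a std::chrono::duration directly (printing it with its unit suffix, e.g. "42ms"), but only when fmt/chrono.h is included. The point count (1024) is a placeholder value.

#include <chrono>

#include "fmt/chrono.h"  // required for formatting std::chrono types; without it, the fmt::print below does not compile
#include "fmt/core.h"

int main() {
    const auto start_time = std::chrono::steady_clock::now();
    const auto end_time = std::chrono::steady_clock::now();
    fmt::print("Predicted {} data points in {}.\n", 1024, std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
}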
From 195d39120940e90a8e483a4188974b991bcbab0c Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Sun, 6 Feb 2022 19:12:27 +0100
Subject: [PATCH 06/56] Add [[maybe_unused]] attribute.

---
 src/plssvm/backends/SYCL/csvm.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index c5f7282fb..97dab6674 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -181,7 +181,8 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+    [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+
     switch (kernel_) {
         case kernel_type::linear:
             break;
From 8bba28abf470af03c61eff2d8740142557149034 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 10:04:19 +0100
Subject: [PATCH 07/56] First try to reformulate SYCL SVM kernel using
 hierarchical kernels.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 190 +++++++++++++++++++-
 src/plssvm/backends/SYCL/csvm.cpp           |   9 +-
 2 files changed, 191 insertions(+), 8 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index a0b1a669b..9ea3546aa 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include "plssvm/detail/execution_range.hpp"
 #include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
 #include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
 
@@ -176,15 +177,192 @@ class device_kernel_poly {
      * @param[in] gamma the gamma parameter used in the polynomial kernel function
      * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
      */
-    device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
+    device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
 
     /**
      * @brief Function call operator overload performing the actual calculation.
      * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
      * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
+    void operator()() const {
+
+        queue_.submit([&](::sycl::handler& cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const int degree = degree_;
+            const real_type gamma = gamma_;
+            const real_type coef0 = coef0_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                //const std::size_t gi = group.get_group_id(0);
+                //const std::size_t gj = group.get_group_id(1);
+
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                        if (i >= j) {
+                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
+                                }
+                            }
+                        }
+                    });
+
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                        if (i >= j) {
+                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                                }
+                            }
+                        }
+                    });
+
+                }
+
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE;
+
+                    if (i >= j) {
+                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
+                                if (i + x > j + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
+                                    ret_jx += temp * d[i + y];
+                                } else if (i + x == j + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[i + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
+                        }
+                    }
+                });
+                /*
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
+                    real_type data_j[INTERNAL_BLOCK_SIZE];
+
+                    if (i >= j) {
+                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                        // cache data
+                        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                            ::sycl::group_barrier(group);
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
+                                }
+                            }
+//                            ::sycl::group_barrier(group);
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    matr[k][l] += data_i * data_j[k];
+                                }
+                            }
+                        }
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
+                                if (i + x > j + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
+                                    ret_jx += temp * d[i + y];
+                                } else if (i + x == j + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[i + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
+                        }
+                    }
+                });
+                */
+            });
+        });
+        /*
         kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
         kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
         real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
         real_type data_j[INTERNAL_BLOCK_SIZE];
 
         if (i >= j) {
             i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
             j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
 
             // cache data
             for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
                 ::sycl::group_barrier(nd_idx.get_group());
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                     const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
                     if (nd_idx.get_local_id(1) == idx) {
                         data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
                     }
                     const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
                     if (nd_idx.get_local_id(0) == idx_2) {
                         data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
                     }
                 }
                 ::sycl::group_barrier(nd_idx.get_group());
 
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                     data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
                 }
 
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                     const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
                     #pragma unroll INTERNAL_BLOCK_SIZE
                     for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                         matr[k][l] += data_i * data_j[k];
                     }
                 }
             }
 
             #pragma unroll INTERNAL_BLOCK_SIZE
             for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
                 real_type ret_jx = 0.0;
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                     const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
                     if (i + x > j + y) {
                         // upper triangular matrix
                         atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
                         ret_jx += temp * d_[i + y];
                     } else if (i + x == j + y) {
                         // diagonal
                         ret_jx += (temp + cost_ * add_) * d_[i + y];
                     }
                 }
                 atomic_op<real_type>{ ret_[j + x] } += ret_jx;
             }
         }
+        */
     }
 
   private:
-    local_accessor<real_type> data_intern_i_;
-    local_accessor<real_type> data_intern_j_;
+    ::sycl::queue& queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 97dab6674..dea4188e8 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -159,10 +159,13 @@ void csvm<T>::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e
             });
             break;
         case kernel_type::polynomial:
+        {
             PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
-            });
+            device_kernel_poly<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)();
+            //devices_[device].submit([&](::sycl::handler &cgh) {
+            //    cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
+            //});
+        }
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
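Patch 07 introduces sycl::private_memory, which gives each logical work-item a value that survives across the implicit barriers between consecutive parallel_for_work_item scopes; a plain local variable inside one lambda would not. A minimal, self-contained sketch ('out' is assumed to be a USM allocation with 8 ints; the names are placeholders):

#include <sycl/sycl.hpp>

void private_memory_demo(::sycl::queue &queue, int *out) {
    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(::sycl::range<1>{ 2 }, ::sycl::range<1>{ 4 }, [=](::sycl::group<1> group) {
            // one instance per logical work-item, alive for the whole work-group scope
            ::sycl::private_memory<int, 1> per_item{ group };
            group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                per_item(idx) = static_cast<int>(idx.get_global_id(0));  // first scope: write
            });
            // implicit group barrier here
            group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                out[idx.get_global_id(0)] = per_item(idx) + 1;  // second scope: the value survived
            });
        });
    });
    queue.wait();
}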
From 025609fda736ae9eb3947e1d521e55956ae1131a Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 10:58:22 +0100
Subject: [PATCH 08/56] Change get_group_id() to operator[] since the former
 currently isn't implemented in DPC++.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index 9ea3546aa..4492c0db9 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -206,9 +206,6 @@ class device_kernel_poly {
                 real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
                 real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-                //const std::size_t gi = group.get_group_id(0);
-                //const std::size_t gj = group.get_group_id(1);
-
                 ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
                 ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
 
@@ -221,8 +218,8 @@ class device_kernel_poly {
                 for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
 
                     group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                         if (i >= j) {
                             i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
                             j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
@@ -243,8 +240,8 @@ class device_kernel_poly {
                     group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                         if (i >= j) {
                             i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
                             j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
@@ -272,8 +269,8 @@ class device_kernel_poly {
                 }
 
                 group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE;
-                    kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                     if (i >= j) {
                         i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
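For reference, both spellings below name the same work-group index; per the commit message above, only the subscript form was accepted by both hipSYCL and DPC++ at the time. A hedged sketch (the function and parameter names are placeholders):

#include <cstddef>
#include <sycl/sycl.hpp>

void group_index_spellings(::sycl::group<2> group, std::size_t *out) {
    const std::size_t gi = group[0];               // subscript spelling, used after this patch
    const std::size_t gj = group.get_group_id(1);  // equivalent SYCL 2020 getter, where implemented
    out[0] = gi;
    out[1] = gj;
}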
From 9b611759a220eee4e742aeb00ff90b256b60cd41 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 14:07:47 +0100
Subject: [PATCH 09/56] Rewrite other SYCL SVM kernel to also use hierarchical
 form.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 722 ++++++++++----------
 src/plssvm/backends/SYCL/csvm.cpp           |  96 ++-
 2 files changed, 394 insertions(+), 424 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index 4492c0db9..1ac1df6b6 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -11,25 +11,17 @@
 
 #pragma once
 
-#include "plssvm/detail/execution_range.hpp"
+#include "plssvm/backends/SYCL/detail/atomics.hpp"    // plssvm::sycl::atomic_op
 #include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-#include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/execution_range.hpp"          // plssvm::detail::execution_range
 
-#include "sycl/sycl.hpp"  // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow,
-                          // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space
+#include "sycl/sycl.hpp"  // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp
 
 #include <cstddef>  // std::size_t
 
 namespace plssvm::sycl {
 
-// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations
-/**
- * @brief Shortcut alias for a SYCL local accessor.
- * @tparam T the type of the accessed values
- */
-template <typename T>
-using local_accessor = ::sycl::accessor<T, 2, ::sycl::access::mode::read_write, ::sycl::access::target::local>;
-
 /**
  * @brief Calculates the C-SVM kernel using the linear kernel function.
  * @details Supports multi-GPU execution.
@@ -43,7 +35,8 @@ class device_kernel_linear {
 
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
     * @param[out] ret the result vector
     * @param[in] d the right-hand side of the equation
     * @param[in] add denotes whether the values are added or subtracted from the result vector
     * @param[in] id the id of the device
     */
-    device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
+    device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
 
     /**
     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += data_i * data_j[k];
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    real_type temp;
-                    if (device_ == 0) {
-                        temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    } else {
-                        temp = matr[x][y] * add_;
-                    }
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        if (device_ == 0) {
-                            ret_jx += (temp + cost_ * add_) * d_[i + y];
-                        } else {
-                            ret_jx += temp * d_[i + y];
-                        }
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type feature_range = feature_range_;
+            const real_type add = add_;
+            const kernel_index_type device = device_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+
+                    // load data from shared in private memory and perform scalar product
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+                }
+
+                // kernel function
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                real_type temp;
+                                if (device == 0) {
+                                    temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                                } else {
+                                    temp = private_matr(idx)[x][y] * add;
+                                }
+                                if (private_i(idx) + x > private_j(idx) + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                    ret_jx += temp * d[private_i(idx) + y];
+                                } else if (private_i(idx) + x == private_j(idx) + y) {
+                                    // diagonal
+                                    if (device == 0) {
+                                        ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                                    } else {
+                                        ret_jx += temp * d[private_i(idx) + y];
+                                    }
+                                }
+                            }
+                            atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                        }
+                    }
+                });
+            });
+        });
+    }
 
   private:
-    local_accessor<real_type> data_intern_i_;
-    local_accessor<real_type> data_intern_j_;
+    ::sycl::queue &queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;
 
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
     * @param[out] ret the result vector
     * @param[in] d the right-hand side of the equation
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
     void operator()() const {
-
-        queue_.submit([&](::sycl::handler& cgh) {
+        queue_.submit([&](::sycl::handler &cgh) {
             const real_type *q = q_;
             real_type *ret = ret_;
             const real_type *d = d_;
             const real_type *data_d = data_d_;
             const real_type QA_cost = QA_cost_;
             const real_type cost = cost_;
             const kernel_index_type num_rows = num_rows_;
             const kernel_index_type num_cols = num_cols_;
             const real_type add = add_;
             const int
 degree = degree_;
             const real_type gamma = gamma_;
             const real_type coef0 = coef0_;
 
             cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
                 // allocate shared memory
                 real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
                 real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
-                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
-
-                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
-                            private_matr(idx)[i][j] = real_type{ 0.0 };
-                        }
-                    }
-                });
-
-                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
-
-                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                        if (i >= j) {
-                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
-                                if (idx.get_local_id(1) == idx_1) {
-                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
-                                }
-                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                                if (idx.get_local_id(0) == idx_2) {
-                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
-                                }
-                            }
-                        }
-                    });
-
-                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                        if (i >= j) {
-                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
-                            }
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
-                                #pragma unroll INTERNAL_BLOCK_SIZE
-                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
-                                }
-                            }
-                        }
-                    });
-
-                }
-
-                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                    kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                    if (i >= j) {
-                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                            real_type ret_jx = 0.0;
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                                const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
-                                if (i + x > j + y) {
-                                    // upper triangular matrix
-                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
-                                    ret_jx += temp * d[i + y];
-                                } else if (i + x == j + y) {
-                                    // diagonal
-                                    ret_jx += (temp + cost * add) * d[i + y];
-                                }
-                            }
-                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
-                        }
-                    }
-                });
-                /*
-
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - ::sycl::group_barrier(group); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; - } - } -// ::sycl::group_barrier(group); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret[i + y] } += temp * d[j + x]; - ret_jx += temp * d[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost * add) * d[i + y]; - } - } - atomic_op{ ret[j + x] } += ret_jx; - } - } - }); - */ - }); - }); - /* - kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { - ::sycl::group_barrier(nd_idx.get_group()); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(1) == idx) { - data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(0) == idx_2) { - data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; - } - } - ::sycl::group_barrier(nd_idx.get_group()); + }); + + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 
0; vec_index < num_cols * num_rows; vec_index += num_rows) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost_ * add_) * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; + } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } - */ + }); + }); + }); } private: - 
::sycl::queue& queue_; + ::sycl::queue &queue_; ::sycl::range<2> global_range_; ::sycl::range<2> local_range_; @@ -454,7 +380,8 @@ class device_kernel_radial { /** * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -466,79 +393,132 @@ class device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** * @brief Function call operator overload performing the actual calculation. 
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        ret_jx += (temp + cost_ * add_) * d_[i + y];
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const real_type gamma = gamma_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll
INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost_ * add_) * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; + } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } + }); + }); + }); } private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; + ::sycl::queue &queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type 
*q_; real_type *ret_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index dea4188e8..332a54975 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -151,27 +151,17 @@ void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); switch (kernel_) { case kernel_type::linear: - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); - }); + device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); break; case kernel_type::polynomial: - { PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); - //devices_[device].submit([&](::sycl::handler &cgh) { - // cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - //}); - } break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); - }); + device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); break; } } @@ -185,62 +175,62 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); - + switch (kernel_) { case kernel_type::linear: break; case kernel_type::polynomial: #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - { - ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; - ::sycl::range<2> local_range{ range.block[0], range.block[1] }; - devices_[0].submit([&](::sycl::handler& cgh) { - real_type *out_d_ptr = out_d.get(); - const real_type *data_d_ptr = data_d_[0].get(); - const real_type *data_last_d_ptr = data_last_d_[0].get(); - const real_type *alpha_d_ptr = alpha_d.get(); - const std::size_t num_data_points = num_data_points_; - const real_type *point_d_ptr = point_d.get(); - const std::size_t num_predict_points = p_num_predict_points; - const std::size_t num_features = num_features_; - const int degree = degree_; - const real_type gamma = gamma_; - const real_type coef0 = coef0_; - - cgh.parallel_for_work_group(global_range, 
local_range, [=](::sycl::group<2> group) { - group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); - }); + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler &cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); }); - } + }); + } #elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP devices_[0].parallel_for(execution_range, device_kernel_predict_poly>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_)); #endif - break; + break; case kernel_type::rbf: #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - { - ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; - ::sycl::range<2> local_range{ range.block[0], range.block[1] }; - devices_[0].submit([&](::sycl::handler& cgh) { - real_type *out_d_ptr = out_d.get(); - const real_type *data_d_ptr = data_d_[0].get(); - const real_type *data_last_d_ptr = data_last_d_[0].get(); - const real_type *alpha_d_ptr = alpha_d.get(); - const std::size_t num_data_points = num_data_points_; - const real_type *point_d_ptr = point_d.get(); - const std::size_t num_predict_points = p_num_predict_points; - const std::size_t num_features = num_features_; - const real_type gamma = gamma_; - - cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { - group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); - }); + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler &cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const real_type gamma = gamma_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); }); - } + }); + } #elif PLSSVM_SYCL_BACKEND_COMPILER
== PLSSVM_SYCL_BACKEND_COMPILER_DPCPP devices_[0].parallel_for(execution_range, device_kernel_predict_radial>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_)); #endif - break; + break; } } From a5c7600996713d335f5a2e433a7aee43c56b1b82 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 16:27:48 +0100 Subject: [PATCH 10/56] Add explicit CUDA target arch to DPC++ compile flags to prevent runtime jitting. --- src/plssvm/backends/SYCL/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b65956d58..cef2b4073 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -68,6 +68,10 @@ elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++") if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) + foreach(PLSSVM_NVIDIA_TARGET_ARCH_NAME ${PLSSVM_NVIDIA_TARGET_ARCHS}) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) + endforeach() endif() # amd targets if(DEFINED PLSSVM_AMD_TARGET_ARCHS) From 1ab61ff5823ed3c59497c6c88dc63f9583f0198b Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 4 Feb 2022 17:41:07 +0100 Subject: [PATCH 11/56] Change predict kernel to use hierarchical SYCL notation if hipSYCL is used (faster on the CPU) --- .../plssvm/backends/SYCL/predict_kernel.hpp | 28 +++++----- src/plssvm/backends/SYCL/csvm.cpp | 52 +++++++++++++++++-- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp index 98b666676..d8568facf 100644 --- a/include/plssvm/backends/SYCL/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/predict_kernel.hpp @@ -72,12 +72,15 @@ class device_kernel_w_linear { * @brief Predicts the labels for data points using the polynomial kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data points + * @tparam U the type of the `sycl::item` */ -template +template class device_kernel_predict_poly { public: /// The type of the data. using real_type = T; + /// The `sycl::item` type. + using sycl_item_type = U; /** * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function. @@ -99,12 +102,11 @@ class device_kernel_predict_poly { /** * @brief Function call operator overload performing the actual calculation. 
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - const kernel_index_type data_point_index = nd_idx.get_global_id(0); - const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + void operator()(sycl_item_type idx) const { + const kernel_index_type data_point_index = idx.get_global_id(0); + const kernel_index_type predict_point_index = idx.get_global_id(1); real_type temp = 0; if (predict_point_index < num_predict_points_) { @@ -140,12 +142,15 @@ class device_kernel_predict_poly { * @brief Predicts the labels for data points using the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data points + * @tparam U the type of the `sycl::item` */ -template +template class device_kernel_predict_radial { public: /// The type of the data. using real_type = T; + /// The `sycl::item` type + using sycl_item_type = U; /** * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function. @@ -165,12 +170,11 @@ class device_kernel_predict_radial { /** * @brief Function call operator overload performing the actual calculation. 
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - const kernel_index_type data_point_index = nd_idx.get_global_id(0); - const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + void operator()(sycl_item_type idx) const { + const kernel_index_type data_point_index = idx.get_global_id(0); + const kernel_index_type predict_point_index = idx.get_global_id(1); real_type temp = 0; if (predict_point_index < num_predict_points_) { diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 9f70416fc..106e70817 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -185,16 +185,62 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe } template -void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) { +void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); switch (kernel_) { case kernel_type::linear: break; case kernel_type::polynomial: - devices_[0].parallel_for(execution_range, device_kernel_predict_poly(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_)); +#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler& cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); + }); + }); + } +#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP + devices_[0].parallel_for(execution_range, device_kernel_predict_poly>(out_d.get(), data_d_[0].get(), 
data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_)); +#endif break; case kernel_type::rbf: - devices_[0].parallel_for(execution_range, device_kernel_predict_radial(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_)); +#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler& cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const real_type gamma = gamma_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); + }); + }); + } +#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP + devices_[0].parallel_for(execution_range, device_kernel_predict_radial>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_)); +#endif break; } } From cd5180e66c7306e86a3ee1f207839152e3f825a4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 6 Feb 2022 19:12:27 +0100 Subject: [PATCH 12/56] Add [[maybe_unused]] attribute. --- src/plssvm/backends/SYCL/csvm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 106e70817..674b1fea6 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -186,7 +186,8 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { - const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + switch (kernel_) { case kernel_type::linear: break; From 088afcc85461da939775bd20c03518009941e33f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 10:04:19 +0100 Subject: [PATCH 13/56] First try to reformulate SYCL SVM kernel using hierarchical kernels.
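This first hierarchical formulation replaces the flat `nd_range` kernel with one `parallel_for_work_group` invocation per work-group and several `parallel_for_work_item` scopes inside it; SYCL guarantees an implicit group barrier between two consecutive `parallel_for_work_item` scopes, and `sycl::private_memory` keeps per-work-item state alive across them. A minimal standalone sketch of the pattern (illustration only, not PLSSVM code: `queue`, `data`, and `n` are placeholder names, `data` is assumed to be a USM device allocation, and `n` a multiple of the work-group size):

    #include "sycl/sycl.hpp"

    #include <cstddef>  // std::size_t

    void hierarchical_scale(::sycl::queue &queue, float *data, const std::size_t n) {
        constexpr std::size_t local_size = 64;
        queue.submit([&](::sycl::handler &cgh) {
            // first range: the number of work-groups, second range: the work-items per group
            cgh.parallel_for_work_group(::sycl::range<1>{ n / local_size }, ::sycl::range<1>{ local_size }, [=](::sycl::group<1> group) {
                // per-work-item storage that survives across 'parallel_for_work_item' scopes
                ::sycl::private_memory<float, 1> private_val{ group };

                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    private_val(idx) = data[idx.get_global_id(0)];
                });

                // implicit group barrier

                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    data[idx.get_global_id(0)] = private_val(idx) * 2.0f;
                });
            });
        }).wait();
    }

The polynomial kernel below follows the same structure, keeping its accumulation matrix and the cached data row in `private_memory` so that both survive the scope changes.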
--- include/plssvm/backends/SYCL/svm_kernel.hpp | 190 +++++++++++++++++++- src/plssvm/backends/SYCL/csvm.cpp | 9 +- 2 files changed, 191 insertions(+), 8 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index a0b1a669b..9ea3546aa 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -11,6 +11,7 @@ #pragma once +#include "plssvm/detail/execution_range.hpp" #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE @@ -176,15 +177,192 @@ class device_kernel_poly { * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { + void operator()() const { + + queue_.submit([&](::sycl::handler& cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type num_cols = num_cols_; + const real_type add = add_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + //const std::size_t gi = group.get_group_id(0); + //const std::size_t gj = group.get_group_id(1); + + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } + } + }); + + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; + } + } + } + }); + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); + + } + + group.parallel_for_work_item([&](::sycl::h_item<2> 
idx) { + kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret[i + y] } += temp * d[j + x]; + ret_jx += temp * d[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost * add) * d[i + y]; + } + } + atomic_op{ ret[j + x] } += ret_jx; + } + } + }); + /* + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + ::sycl::group_barrier(group); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; + } + } +// ::sycl::group_barrier(group); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret[i + y] } += temp * d[j + x]; + ret_jx += temp * d[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost * add) * d[i + y]; + } + } + atomic_op{ ret[j + x] } += ret_jx; + } + } + }); + */ + }); + }); + /* kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; @@ -244,11 +422,13 @@ class device_kernel_poly { atomic_op{ ret_[j + x] } += ret_jx; } } + */ } private: - local_accessor data_intern_i_; - local_accessor 
data_intern_j_; + ::sycl::queue& queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type *q_; real_type *ret_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 674b1fea6..7d2786962 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -164,10 +164,13 @@ void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e }); break; case kernel_type::polynomial: + { PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - }); + device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + //devices_[device].submit([&](::sycl::handler &cgh) { + // cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + //}); + } break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); From 9c4088c8583a6512eef20aa6b8495e51574f3ef7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 10:58:22 +0100 Subject: [PATCH 14/56] Change get_group_id() to operator[] since the former currently isn't implemented in DPC++. --- include/plssvm/backends/SYCL/svm_kernel.hpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index 9ea3546aa..4492c0db9 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -206,9 +206,6 @@ class device_kernel_poly { real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - //const std::size_t gi = group.get_group_id(0); - //const std::size_t gj = group.get_group_id(1); - ::sycl::private_memory private_matr{ group }; ::sycl::private_memory private_data_j{ group }; @@ -224,8 +221,8 @@ class device_kernel_poly { for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; @@ -246,8 +243,8 @@ class device_kernel_poly { }); group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; @@ -272,8 
+269,8 @@ class device_kernel_poly { } group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; From 0cf9aa6eac9c185c6c1fa602bc10273d2492d795 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 14:07:47 +0100 Subject: [PATCH 15/56] Rewrite other SYCL SVM kernel to also use hierarchical form.
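The rewritten kernels keep accumulating their partial results through the `atomic_op` helper from the newly included atomics.hpp below. A rough sketch of what such a helper can look like in SYCL 2020 terms (an illustrative assumption; the actual PLSSVM header may differ):

    #include "sycl/sycl.hpp"

    // wraps a reference to a value in global memory and exposes an atomic +=
    template <typename T>
    class atomic_op {
      public:
        explicit atomic_op(T &value) :
            ref_{ value } {}

        // relaxed, device-scoped atomic add on the wrapped value
        T operator+=(const T val) { return ref_.fetch_add(val); }

      private:
        ::sycl::atomic_ref<T, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space> ref_;
    };

With class template argument deduction this supports call sites such as `atomic_op{ ret[j + x] } += ret_jx;` as used in the kernels below.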
--- include/plssvm/backends/SYCL/svm_kernel.hpp | 722 ++++++++++---------- src/plssvm/backends/SYCL/csvm.cpp | 96 ++- 2 files changed, 394 insertions(+), 424 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index 4492c0db9..1ac1df6b6 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -11,25 +11,17 @@ #pragma once -#include "plssvm/detail/execution_range.hpp" +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::atomic_op #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE +#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range -#include "sycl/sycl.hpp" // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow, - // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space +#include "sycl/sycl.hpp" // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp #include // std::size_t namespace plssvm::sycl { -// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations -/** - * @brief Shortcut alias for a SYCL local accessor. - * @tparam T the type of the accessed values - */ -template -using local_accessor = ::sycl::accessor; - /** * @brief Calculates the C-SVM kernel using the linear kernel function. * @details Supports multi-GPU execution. @@ -43,7 +35,8 @@ class device_kernel_linear { /** * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation * @param[in] data_d the one-dimension data matrix * @param[in] QA_cost the bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] num_rows the number of rows in the data matrix * @param[in] feature_range number of features used for the calculation * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] id the id of the device */ - device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} + device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} /** * @brief Function call operator overload performing the actual calculation.
- * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) { - ::sycl::group_barrier(nd_idx.get_group()); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(1) == idx) { - data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + void operator()() const { + queue_.submit([&](::sycl::handler &cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type feature_range = feature_range_; + const real_type add = add_; + const kernel_index_type device = device_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(0) == idx_2) { - data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + + // matrix + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } } - } - ::sycl::group_barrier(nd_idx.get_group()); + }); + + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { 
+ #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - real_type temp; - if (device_ == 0) { - temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - } else { - temp = matr[x][y] * add_; - } - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - if (device_ == 0) { - ret_jx += (temp + cost_ * add_) * d_[i + y]; - } else { - ret_jx += temp * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device == 0) { + temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + } else { + temp = private_matr(idx)[x][y] * add; + } + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + if (device == 0) { + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } else { + ret_jx += temp * d[private_i(idx) + y]; + } + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } + }); + }); + 
}); } private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; + ::sycl::queue &queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type *q_; real_type *ret_; @@ -163,7 +209,8 @@ class device_kernel_poly { /** * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -182,248 +229,127 @@ class device_kernel_poly { /** * @brief Function call operator overload performing the actual calculation. - * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()() const { - - queue_.submit([&](::sycl::handler& cgh) { - const real_type *q = q_; - real_type *ret = ret_; - const real_type *d = d_; - const real_type *data_d = data_d_; - const real_type QA_cost = QA_cost_; - const real_type cost = cost_; - const kernel_index_type num_rows = num_rows_; - const kernel_index_type num_cols = num_cols_; - const real_type add = add_; - const int degree = degree_; - const real_type gamma = gamma_; - const real_type coef0 = coef0_; - - cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { - // allocate shared memory - real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - - ::sycl::private_memory private_matr{ group }; - ::sycl::private_memory private_data_j{ group }; - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { - private_matr(idx)[i][j] = real_type{ 0.0 }; - } - } - }); - - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + queue_.submit([&](::sycl::handler &cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type num_cols = num_cols_; + const real_type add = add_; + const int 
degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; - } - } - } - }); - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } - } - }); - - } - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret[i + y] } += temp * d[j + x]; - ret_jx += temp * d[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost * add) * d[i + y]; + // matrix + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; } } - atomic_op{ ret[j + x] } += ret_jx; - } - } - }); - /* - 
group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-                real_type data_j[INTERNAL_BLOCK_SIZE];
-
-                if (i >= j) {
-                    i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                    j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                    // cache data
-                    for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
-                        ::sycl::group_barrier(group);
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                            const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
-                            if (idx.get_local_id(1) == idx_1) {
-                                data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
-                            }
-                            const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                            if (idx.get_local_id(0) == idx_2) {
-                                data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
-                            }
-                        }
-//                        ::sycl::group_barrier(group);
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                            data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
-                        }
-
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                            const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                                matr[k][l] += data_i * data_j[k];
-                            }
-                        }
-                    }
-
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                        real_type ret_jx = 0.0;
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                            const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
-                            if (i + x > j + y) {
-                                // upper triangular matrix
-                                atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
-                                ret_jx += temp * d[i + y];
-                            } else if (i + x == j + y) {
-                                // diagonal
-                                ret_jx += (temp + cost * add) * d[i + y];
-                            }
-                        }
-                        atomic_op<real_type>{ ret[j + x] } += ret_jx;
-                    }
-                }
-            });
-            */
-        });
-    });
-    /*
-    kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-    kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-    real_type data_j[INTERNAL_BLOCK_SIZE];
-
-    if (i >= j) {
-        i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-        j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-        // cache data
-        for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-            ::sycl::group_barrier(nd_idx.get_group());
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                if (nd_idx.get_local_id(1) == idx) {
-                    data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                }
-                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                if (nd_idx.get_local_id(0) == idx_2) {
-                    data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                }
-            }
-            ::sycl::group_barrier(nd_idx.get_group());
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                    matr[k][l] += data_i * data_j[k];
-                }
-            }
-        }
-
-        #pragma unroll INTERNAL_BLOCK_SIZE
-        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-            real_type ret_jx = 0.0;
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                if (i + x > j + y) {
-                    // upper triangular matrix
-                    atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                    ret_jx += temp * d_[i + y];
-                } else if (i + x == j + y) {
-                    // diagonal
-                    ret_jx += (temp + cost_ * add_) * d_[i + y];
-                }
-            }
-            atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-        }
-    }
-    */
+            });
+
+            // implicit group barrier
+
+            // load data from global in shared memory
+            for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                            const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                            if (idx.get_local_id(1) == idx_1) {
+                                data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                            }
+                            const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                            if (idx.get_local_id(0) == idx_2) {
+                                data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                            }
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from shared in private memory and perform scalar product
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                            private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                        }
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                            const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                            }
+                        }
+                    }
+                });
+
+                // implicit group barrier
+            }
+
+            // kernel function
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                        real_type ret_jx = 0.0;
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                            const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                            if (private_i(idx) + x > private_j(idx) + y) {
+                                // upper triangular matrix
+                                atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                ret_jx += temp * d[private_i(idx) + y];
+                            } else if (private_i(idx) + x == private_j(idx) + y) {
+                                // diagonal
+                                ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                            }
+                        }
+                        atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                    }
+                }
+            });
+        });
+    });
 }
 
   private:
-    ::sycl::queue& queue_;
+    ::sycl::queue &queue_;
     ::sycl::range<2> global_range_;
     ::sycl::range<2> local_range_;
@@ -454,7 +380,8 @@ class device_kernel_radial {
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
      * @param[out] ret the result vector
      * @param[in] d the right-hand side of the equation
     * @param[in] add denotes whether the values are added or subtracted from the result vector
     * @param[in] gamma the gamma parameter used in the rbf kernel function
     */
-    device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
+    device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        ret_jx += (temp + cost_ * add_) * d_[i + y];
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const real_type gamma = gamma_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+
+                    // load data from shared in private memory and perform scalar product
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]);
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+                }
+
+                // kernel function
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                                if (private_i(idx) + x > private_j(idx) + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                    ret_jx += temp * d[private_i(idx) + y];
+                                } else if (private_i(idx) + x == private_j(idx) + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                        }
+                    }
+                });
+            });
+        });
+    }
 
   private:
-    local_accessor<real_type, 2> data_intern_i_;
-    local_accessor<real_type, 2> data_intern_j_;
+    ::sycl::queue &queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;
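The rewritten kernels above use the SYCL 2020 hierarchical parallelism API: one parallel_for_work_group lambda that runs per work-group, several parallel_for_work_item scopes that run per work-item, implicit group barriers between consecutive scopes, and ::sycl::private_memory to keep per-work-item state alive across those scopes. A minimal, self-contained sketch of the pattern (independent of PLSSVM; every name below is invented for illustration) could look like this:

    #include <sycl/sycl.hpp>

    #include <vector>

    int main() {
        sycl::queue queue;
        std::vector<float> data(128, 1.0f);
        {
            sycl::buffer<float, 1> buf{ data.data(), sycl::range<1>{ data.size() } };
            queue.submit([&](sycl::handler &cgh) {
                sycl::accessor acc{ buf, cgh, sycl::read_write };
                // 4 work-groups, each with 32 work-items
                cgh.parallel_for_work_group(sycl::range<1>{ 4 }, sycl::range<1>{ 32 }, [=](sycl::group<1> group) {
                    // one instance per work-item, alive across both scopes below
                    sycl::private_memory<float, 1> priv{ group };

                    group.parallel_for_work_item([&](sycl::h_item<1> item) {
                        priv(item) = acc[item.get_global().get_linear_id()] * 2.0f;
                    });
                    // implicit group barrier between the two scopes
                    group.parallel_for_work_item([&](sycl::h_item<1> item) {
                        acc[item.get_global().get_linear_id()] = priv(item) + 1.0f;
                    });
                });
            });
        } // buffer goes out of scope, waits for the kernel, and copies the result back into 'data'
        return 0;
    }

Since a hierarchical kernel cannot return early for a single logical work-item, the patch instead guards the body of every scope with if (private_cond(idx)), which serves the same purpose as the early exit in the removed nd_range version.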
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 7d2786962..9c7d180df 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -156,27 +156,17 @@ void csvm<T>::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
     switch (kernel_) {
         case kernel_type::linear:
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_linear<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device));
-            });
+            device_kernel_linear<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)();
             break;
         case kernel_type::polynomial:
-        {
             PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
             device_kernel_poly<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)();
-            //devices_[device].submit([&](::sycl::handler &cgh) {
-            //    cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
-            //});
-        }
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_radial<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_));
-            });
+            device_kernel_radial<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)();
             break;
     }
 }
@@ -190,62 +180,62 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
     [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
-    
+
     switch (kernel_) {
         case kernel_type::linear:
             break;
         case kernel_type::polynomial:
 #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-        {
-        ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-        ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-        devices_[0].submit([&](::sycl::handler& cgh) {
-            real_type *out_d_ptr = out_d.get();
-            const real_type *data_d_ptr = data_d_[0].get();
-            const real_type *data_last_d_ptr = data_last_d_[0].get();
-            const real_type *alpha_d_ptr = alpha_d.get();
-            const std::size_t num_data_points = num_data_points_;
-            const real_type *point_d_ptr = point_d.get();
-            const std::size_t num_predict_points = p_num_predict_points;
-            const std::size_t num_features = num_features_;
-            const int degree = degree_;
-            const real_type gamma = gamma_;
-            const real_type coef0 = coef0_;
-
-            cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
-            });
-        });
-        }
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler &cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const int degree = degree_;
+                    const real_type gamma = gamma_;
+                    const real_type coef0 = coef0_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
+                    });
+                });
+            }
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
             devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
 #endif
-    break;
+            break;
         case kernel_type::rbf:
 #if PLSSVM_SYCL_BACKEND_COPMILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-        {
-        ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-        ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-        devices_[0].submit([&](::sycl::handler& cgh) {
-            real_type *out_d_ptr = out_d.get();
-            const real_type *data_d_ptr = data_t_[0].get();
-            const real_type *data_last_d_ptr = data_last_d_[0].get();
-            const real_type *alpha_d_ptr = alpha_d.get();
-            const std::size_t num_data_points = num_data_points_;
-            const real_type *point_d_ptr = point_d.get();
-            const std::size_t num_predict_points = p_num_predict_points;
-            const std::size_t num_features = num_features_;
-            const real_type gamma = gamma_;
-
-            cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
-            });
-        });
-        }
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler &cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_t_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const real_type gamma = gamma_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
+                    });
+                });
+            }
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
             devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
 #endif
-    break;
+            break;
     }
 }
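Both the polynomial and the radial basis function kernel funnel their concurrent updates to ret through the project's atomic_op wrapper, whose definition is not part of these patches. Assuming it is a thin convenience layer over sycl::atomic_ref (a guess for illustration, not the actual PLSSVM implementation), its essence could be sketched as:

    #include <sycl/sycl.hpp>

    // hypothetical sketch of an atomic_op-like helper for use inside device code
    template <typename T>
    struct atomic_add_sketch {
        T &value;  // global-memory element to update

        // '+=' maps to a relaxed, device-scoped atomic fetch-add
        void operator+=(const T other) const {
            sycl::atomic_ref<T, sycl::memory_order::relaxed,
                             sycl::memory_scope::device,
                             sycl::access::address_space::global_space>{ value } += other;
        }
    };

Whatever the exact definition, the important property is that an element such as ret[private_i(idx) + y] may be written by several work-items at once, so a plain += would be a data race.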
From 5cbcbc198c800bacfed7fcd678f32732893c59b2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Tue, 8 Feb 2022 13:01:38 +0100
Subject: [PATCH 16/56] Fix error in rewritten SYCL predict function.

---
 .../plssvm/backends/SYCL/predict_kernel.hpp | 26 ++++-----
 src/plssvm/backends/SYCL/csvm.cpp           | 56 ++-----------------
 2 files changed, 15 insertions(+), 67 deletions(-)

diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp
index d8568facf..c9fce1a0b 100644
--- a/include/plssvm/backends/SYCL/predict_kernel.hpp
+++ b/include/plssvm/backends/SYCL/predict_kernel.hpp
@@ -72,15 +72,12 @@ class device_kernel_w_linear {
  * @brief Predicts the labels for data points using the polynomial kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
- * @tparam U the type of the `sycl::item`
  */
-template <typename T, typename U>
+template <typename T>
 class device_kernel_predict_poly {
   public:
     /// The type of the data.
     using real_type = T;
-    /// The `sycl::item` type.
-    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function.
@@ -102,11 +99,11 @@ class device_kernel_predict_poly {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(sycl_item_type idx) const {
-        const kernel_index_type data_point_index = idx.get_global_id(0);
-        const kernel_index_type predict_point_index = idx.get_global_id(1);
+    void operator()(::sycl::id<2> idx) const {
+        const kernel_index_type data_point_index = idx[0];
+        const kernel_index_type predict_point_index = idx[1];
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
@@ -142,15 +139,12 @@ class device_kernel_predict_poly {
  * @brief Predicts the labels for data points using the radial basis functions kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
- * @tparam U the type of the `sycl::item`
  */
-template <typename T, typename U>
+template <typename T>
 class device_kernel_predict_radial {
   public:
     /// The type of the data.
     using real_type = T;
-    /// The `sycl::item` type
-    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function.
@@ -170,11 +164,11 @@ class device_kernel_predict_radial {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(sycl_item_type idx) const {
-        const kernel_index_type data_point_index = idx.get_global_id(0);
-        const kernel_index_type predict_point_index = idx.get_global_id(1);
+    void operator()(::sycl::id<2> idx) const {
+        const kernel_index_type data_point_index = idx[0];
+        const kernel_index_type predict_point_index = idx[1];
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 9c7d180df..eddfcf9b9 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -178,64 +178,18 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 }
 
 template <typename T>
-void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
+void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) {
     [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
 
     switch (kernel_) {
         case kernel_type::linear:
             break;
         case kernel_type::polynomial:
-#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-            {
-                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-                devices_[0].submit([&](::sycl::handler &cgh) {
-                    real_type *out_d_ptr = out_d.get();
-                    const real_type *data_d_ptr = data_d_[0].get();
-                    const real_type *data_last_d_ptr = data_last_d_[0].get();
-                    const real_type *alpha_d_ptr = alpha_d.get();
-                    const std::size_t num_data_points = num_data_points_;
-                    const real_type *point_d_ptr = point_d.get();
-                    const std::size_t num_predict_points = p_num_predict_points;
-                    const std::size_t num_features = num_features_;
-                    const int degree = degree_;
-                    const real_type gamma = gamma_;
-                    const real_type coef0 = coef0_;
-
-                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
-                    });
-                });
-            }
-#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
-            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
-#endif
-            break;
+            devices_[0].parallel_for(::sycl::range<2>{ num_data_points_, num_predict_points }, device_kernel_predict_poly<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_));
+            break;
         case kernel_type::rbf:
-#if PLSSVM_SYCL_BACKEND_COPMILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-            {
-                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-                devices_[0].submit([&](::sycl::handler &cgh) {
-                    real_type *out_d_ptr = out_d.get();
-                    const real_type *data_d_ptr = data_t_[0].get();
-                    const real_type *data_last_d_ptr = data_last_d_[0].get();
-                    const real_type *alpha_d_ptr = alpha_d.get();
-                    const std::size_t num_data_points = num_data_points_;
-                    const real_type *point_d_ptr = point_d.get();
-                    const std::size_t num_predict_points = p_num_predict_points;
-                    const std::size_t num_features = num_features_;
-                    const real_type gamma = gamma_;
-
-                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
-                    });
-                });
-            }
-#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
-            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
-#endif
-            break;
+            devices_[0].parallel_for(::sycl::range<2>{ num_data_points_, num_predict_points }, device_kernel_predict_radial<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_));
+            break;
     }
 }

From 52ab64064fe43847c8dfbdf449fe73379237b7ad Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 17 Feb 2022 09:19:47 +0100
Subject: [PATCH 17/56] Remove erroneous hipSYCL compile flag

---
 src/plssvm/backends/SYCL/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt
index cef2b4073..ebc235008 100644
--- a/src/plssvm/backends/SYCL/CMakeLists.txt
+++ b/src/plssvm/backends/SYCL/CMakeLists.txt
@@ -58,7 +58,7 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL")
     # set backend compiler to hipSYCL (= 1)
     target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1)
     # silence unknown options warnings
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -Wno-unknown-warning-option)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option)
 elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++")
     # enable DPC++ SYCL support
     target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl)

From 26a7d3650cacb710c612fa3047791a71053f317d Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 28 Feb 2022 10:37:59 +0100
Subject: [PATCH 18/56] Add timing output for OpenCL kernel JIT compilation.
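The diff below applies the same steady-clock timing idiom that the earlier patches added around training and prediction. Stripped of the surrounding OpenCL specifics, the pattern is simply the following (a sketch; do_jit_compilation() is a stand-in name, not a PLSSVM function):

    #include "fmt/chrono.h"  // lets fmt print std::chrono durations directly
    #include "fmt/core.h"

    #include <chrono>

    void do_jit_compilation();  // stand-in for the actual kernel build step

    void timed_jit_compilation() {
        const auto start_time = std::chrono::steady_clock::now();
        do_jit_compilation();
        const auto end_time = std::chrono::steady_clock::now();
        fmt::print("OpenCL kernel JIT compilation done in {}.\n",
                   std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
    }

Using std::chrono::steady_clock rather than system_clock matters here: it is monotonic, so the measured duration cannot be distorted by clock adjustments.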
---
 src/plssvm/backends/OpenCL/csvm.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp
index 71381a86e..fe1e27cd8 100644
--- a/src/plssvm/backends/OpenCL/csvm.cpp
+++ b/src/plssvm/backends/OpenCL/csvm.cpp
@@ -21,9 +21,11 @@
 #include "plssvm/parameter.hpp"        // plssvm::parameter
 #include "plssvm/target_platforms.hpp" // plssvm::target_platform
 
+#include "fmt/chrono.h"  // can directly print std::chrono literals
 #include "fmt/core.h"    // fmt::print, fmt::format
 #include "fmt/ostream.h" // can use fmt using operator<< overloads
 
+#include <chrono>    // std::chrono
 #include <exception> // std::terminate
 #include <string>    // std::string
 #include <utility>   // std::pair, std::make_pair, std::move
@@ -91,6 +93,8 @@ csvm<T>::csvm(const parameter<T> &params) :
         fmt::print("\n");
     }
 
+    auto jit_start_time = std::chrono::steady_clock::now();
+
     // get kernel names
     std::pair kernel_names = detail::kernel_type_to_function_name(kernel_);
     // build necessary kernel
@@ -110,6 +114,11 @@
             break;
     }
 
+    auto jit_end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("OpenCL kernel JIT compilation done in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(jit_end_time - jit_start_time));
+    }
+
     // sanity checks for the number of OpenCL kernels
     PLSSVM_ASSERT(devices_.size() == q_kernel_.size(), fmt::format("Number of kernels for the q kernel ({}) must match the number of devices ({})!", q_kernel_.size(), devices_.size()));
     PLSSVM_ASSERT(devices_.size() == svm_kernel_.size(), fmt::format("Number of kernels for the svm kernel ({}) must match the number of devices ({})!", svm_kernel_.size(), devices_.size()));

From de9bcf12db8b6a5bb1e524969c86e2f2f60422ec Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Wed, 2 Mar 2022 09:52:51 +0100
Subject: [PATCH 19/56] Update SYCL DPC++ CMake code to allow AOT compilation
 for CPUs and Intel GPUs. Fix a bug that disallowed multiple targets or
 multiple architectures per target.
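To make the flag assembly below concrete: for a hypothetical configuration such as -DPLSSVM_TARGET_PLATFORMS="cpu:avx512;nvidia:sm_70,sm_80", the CMake code in this patch would end up invoking DPC++ roughly as

    clang++ -fsycl -sycl-std=2020 \
        -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda \
        -Xsycl-target-backend=spir64_x86_64 "-march=avx512" \
        -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=sm_70,sm_80" \
        ...

that is, a single combined -fsycl-targets list plus one -Xsycl-target-backend=<triple> option per target. The bug mentioned in the commit message appears to stem from the previous code emitting a separate -fsycl-targets option and an untargeted -Xsycl-target-backend option per architecture, which cannot express several targets at once.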
--- CMakeLists.txt | 13 ++--- src/plssvm/backends/SYCL/CMakeLists.txt | 67 ++++++++++++++++++++----- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 967b5cfe5..2dfaf064b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,28 +255,29 @@ foreach(PLSSVM_PLATFORM ${PLSSVM_TARGET_PLATFORMS}) if(PLSSVM_PLATFORM MATCHES "^cpu") # parse provided CPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_CPU_TARGET_ARCHS PLSSVM_NUM_CPU_TARGET_ARCHS) - if(NOT PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"cpu\" must not have any architecture specifications!") + if(PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 1) + message(FATAL_ERROR "Target platform \"cpu\" must at most have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_CPU_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^nvidia") # parse provided NVIDIA GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_NVIDIA_TARGET_ARCHS PLSSVM_NUM_NVIDIA_TARGET_ARCHS) if(PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"nvidia\" must at least have one architecture specifications!") + message(FATAL_ERROR "Target platform \"nvidia\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_NVIDIA_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^amd") # parse provided AMD GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_AMD_TARGET_ARCHS PLSSVM_NUM_AMD_TARGET_ARCHS) if(PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"amd\" must at least have one architecture specifications!") + message(FATAL_ERROR "Target platform \"amd\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_AMD_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^intel") + # parse provided Intel GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_INTEL_TARGET_ARCHS PLSSVM_NUM_INTEL_TARGET_ARCHS) - if(NOT PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"intel\" must not have any architecture specifications!") + if(PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0) + message(FATAL_ERROR "Target platform \"intel\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_INTEL_TARGET) else() diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index ebc235008..2d0253b41 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -8,17 +8,27 @@ message(CHECK_START "Checking for SYCL backend") # reformat PLSSVM_TARGET_PLATFORMS to be usable with HIPSYCL_TARGETS (in case hipSYCL may be available) -set(HIPSYCL_TARGETS ${PLSSVM_TARGET_PLATFORMS} CACHE STRING "" FORCE) +set(HIPSYCL_TARGETS "${PLSSVM_TARGET_PLATFORMS}" CACHE STRING "" FORCE) list(TRANSFORM HIPSYCL_TARGETS REPLACE "cpu" "omp") list(TRANSFORM HIPSYCL_TARGETS REPLACE "nvidia" "cuda") list(TRANSFORM HIPSYCL_TARGETS REPLACE "amd" "hip") list(TRANSFORM HIPSYCL_TARGETS REPLACE "intel" "spirv") +# remove CPU and Intel GPU target architectures since they are not supported when using hipSYCL +if(DEFINED PLSSVM_CPU_TARGET_ARCHS AND PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 0) + string(REPLACE ";" "," PLSSVM_CPU_TARGET_ARCHS_COMMA "${PLSSVM_CPU_TARGET_ARCHS}") + string(REPLACE 
":${PLSSVM_CPU_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() +if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + string(REPLACE ";" "," PLSSVM_INTEL_TARGET_ARCHS_COMMA "${PLSSVM_INTEL_TARGET_ARCHS}") + string(REPLACE ":${PLSSVM_INTEL_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() # check if hipSYCL is used as SYCL compiler find_package(hipSYCL CONFIG) if(hipSYCL_FOUND) set(PLSSVM_SYCL_BACKEND_COMPILER "hipSYCL" CACHE STRING "" FORCE) message(CHECK_PASS "found hipSYCL") + message(STATUS "Setting HIPSYCL_TARGETS to \"${HIPSYCL_TARGETS}\".") else() # if not, check if DPC++ is used instead try_compile(PLSSVM_SYCL_BACKEND_CHECK_FOR_DPCPP_COMPILER @@ -59,29 +69,60 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL") target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1) # silence unknown options warnings target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option) + + # print note that Intel GPU architecture specifications are ignored when using hipSYCL + if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + message(STATUS "Ignoring specified Intel architectures \"${PLSSVM_INTEL_TARGET_ARCHS}\" in favor of SPIR-V when using hipSYCL!") + endif() elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++") # enable DPC++ SYCL support target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl) target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl) + set(PLSSVM_DPCPP_FSYCL_TARGETS "") + # cpu targets + if(DEFINED PLSSVM_CPU_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_x86_64") + # add target specific flags for AOT + if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + endif() + endif() # nvidia targets if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) - foreach(PLSSVM_NVIDIA_TARGET_ARCH_NAME ${PLSSVM_NVIDIA_TARGET_ARCHS}) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) - endforeach() + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "nvptx64-nvidia-cuda") + # add target specific flags for AOT + list(JOIN PLSSVM_NVIDIA_TARGET_ARCHS "," PLSSVM_NVIDIA_TARGET_ARCHS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}") endif() # amd targets if(DEFINED PLSSVM_AMD_TARGET_ARCHS) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa) - foreach(PLSSVM_AMD_TARGET_ARCH_NAME 
${PLSSVM_AMD_TARGET_ARCHS}) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME}) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME}) - endforeach() + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "amdgcn-amd-amdhsa") + # add target specific flags for AOT + if(NOT PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 1) + message(FATAL_ERROR "DPC++ currently only supports a single AMD architecture specification but ${PLSSVM_NUM_AMD_TARGET_ARCHS} were provided!") + endif() + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + endif() + # intel targets + if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_gen") + # add target specific flags for AOT + list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") endif() + # set -fsycl-targets + list(JOIN PLSSVM_DPCPP_FSYCL_TARGETS "," PLSSVM_DPCPP_FSYCL_TARGETS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING}) # set backend compiler to DPC++ (= 0) target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=0) From 99f07f462b67f8168b469d9991a6a950c87c4c28 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 09:53:56 +0100 Subject: [PATCH 20/56] Update summary string to also include CPU architecture if possible and remove architecture where not used (e.g., OpenCL since everything is JIT compiled). 
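As an illustration with hypothetical values: configured with -DPLSSVM_TARGET_PLATFORMS="cpu:avx2;intel:dg1", the summary line assembled below for a DPC++ build changes from

     - SYCL (DPC++): cpu, intel

to

     - SYCL (DPC++): cpu (avx2), intel (dg1)

while backends whose kernels are JIT compiled at runtime (e.g., OpenCL) strip the parenthesized architecture lists from their summary line again.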
--- cmake/assemble_summary_string.cmake | 8 ++++++-- src/plssvm/backends/OpenCL/CMakeLists.txt | 5 +++++ src/plssvm/backends/SYCL/CMakeLists.txt | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmake/assemble_summary_string.cmake b/cmake/assemble_summary_string.cmake index 80e73c24e..d4c31d6c7 100644 --- a/cmake/assemble_summary_string.cmake +++ b/cmake/assemble_summary_string.cmake @@ -8,7 +8,11 @@ function(assemble_summary_string out_var) set(PLSSVM_SUMMARY_STRING_ASSEMBLE "") if(DEFINED PLSSVM_CPU_TARGET_ARCHS) # add cpu platform - string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,") + if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0) + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,") + else() + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu (${PLSSVM_CPU_TARGET_ARCHS}),") + endif() endif() if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) # add nvidia platform @@ -20,7 +24,7 @@ function(assemble_summary_string out_var) endif() if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) # add intel platform - string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel,") + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel (${PLSSVM_INTEL_TARGET_ARCHS}),") endif() # remove last comma string(REGEX REPLACE ",$" "" PLSSVM_SUMMARY_STRING_ASSEMBLE "${PLSSVM_SUMMARY_STRING_ASSEMBLE}") diff --git a/src/plssvm/backends/OpenCL/CMakeLists.txt b/src/plssvm/backends/OpenCL/CMakeLists.txt index cd28a2913..0273ca652 100644 --- a/src/plssvm/backends/OpenCL/CMakeLists.txt +++ b/src/plssvm/backends/OpenCL/CMakeLists.txt @@ -62,4 +62,9 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE) set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER " - OpenCL:") include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake) assemble_summary_string(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS) +# do not print any special target architecture information +string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_NVIDIA_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_AMD_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE) \ No newline at end of file diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index 2d0253b41..ce64e03bd 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -155,5 +155,10 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE) set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER " - SYCL (${PLSSVM_SYCL_BACKEND_COMPILER}):") include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake) assemble_summary_string(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS) +# do not print CPU and Intel GPU target architectures when using hipSYCL +if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL") + string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}") + string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS 
"${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}") +endif() set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE) From f76f6bcdd36b6e367ad6b79530edade7db7de39b Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 10:16:34 +0100 Subject: [PATCH 21/56] Update gpu_name_to_arch script: new name, add more AMD targets, add support for Intel (i)GPUs, output possible -DPLSSVM_TARGET_PLATFORMS string. --- utility_scripts/gpu_name_to_arch.py | 162 ---------- utility_scripts/plssvm_target_platforms.py | 344 +++++++++++++++++++++ 2 files changed, 344 insertions(+), 162 deletions(-) delete mode 100644 utility_scripts/gpu_name_to_arch.py create mode 100644 utility_scripts/plssvm_target_platforms.py diff --git a/utility_scripts/gpu_name_to_arch.py b/utility_scripts/gpu_name_to_arch.py deleted file mode 100644 index 06bc30fc8..000000000 --- a/utility_scripts/gpu_name_to_arch.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -######################################################################################################################## -# Authors: Alexander Van Craen, Marcel Breyer # -# Copyright (C): 2018-today The PLSSVM project - All Rights Reserved # -# License: This file is part of the PLSSVM project which is released under the MIT license. # -# See the LICENSE.md file in the project root for full license information. # -######################################################################################################################## - -import argparse -import re - -# parse command line arguments -parser = argparse.ArgumentParser() -parser.add_argument( - "--name", help="the full name of the GPU (e.g. GeForce RTX 3080)") -args = parser.parse_args() - -if args.name is None: - # for nvidia GPUs - import GPUtil - # for AMD GPUs - import pyamdgpuinfo - - gpu_names = [] - # check for possible NVIDIA GPUs - gpu_names.extend([gpu.name for gpu in GPUtil.getGPUs()]) - # check for possible AMD GPUs - gpu_names.extend([pyamdgpuinfo.get_gpu( - gpu_id).name for gpu_id in range(pyamdgpuinfo.detect_gpus())]) - if not gpu_names: - # error if no GPUs where found - raise RuntimeError("Couldn't find any NVIDIA or AMD GPU(s)!") - else: - print("Found {} GPU(s):".format(len(gpu_names))) -else: - # use provided GPU name - gpu_names = [args.name] - -# mapping of NVIDIA compute capabilities given the GPU name -# only GPUs with a compute capability greater or equal than 6.0 are support -# https://developer.nvidia.com/cuda-gpus -nvidia_compute_capability_mapping = { - # Datacenter Products - "NVIDIA A100": "sm_80", - "NVIDIA A40": "sm_86", - "NVIDIA A30": "sm_80", - "NVIDIA A10": "sm_86", - "NVIDIA A16": "sm_86", - "NVIDIA T4": "sm_75", - "NVIDIA V100": "sm_70", - "Tesla P100": "sm_60", - "Tesla P40": "sm_61", - "Tesla P4": "sm_61", - # NVIDIA Quadro and NVIDIA RTX - "RTX A6000": "sm_86", - "RTX A5000": "sm_86", - "RTX A4000": "sm_86", - "T1000": "sm_75", - "T600": "sm_75", - "T400": "sm_75", - "Quadro RTX 8000": "sm_75", - "Quadro RTX 6000": "sm_75", - "Quadro RTX 5000": "sm_75", - "Quadro RTX 4000": "sm_75", - "Quadro GV100": "sm_70", - "Quadro GP100": "sm_60", - "Quadro P6000": "sm_61", - "Quadro P5000": "sm_61", - "Quadro P4000": "sm_61", - "Quadro P2200": "sm_61", - "Quadro P2000": "sm_61", - "Quadro P1000": "sm_61", - "Quadro P620": "sm_61", - "Quadro P600": "sm_61", - "Quadro P400": "sm_61", - "RTX A3000": "sm_86", - "RTX A2000": "sm_86", - "RTX 5000": "sm_75", - "RTX 
4000": "sm_75", - "RTX 3000": "sm_75", - "T2000": "sm_75", - "T1200": "sm_75", - "T500": "sm_75", - "P620": "sm_61", - "P520": "sm_61", - "Quadro P5200": "sm_61", - "Quadro P4200": "sm_61", - "Quadro P3200": "sm_61", - "Quadro P3000": "sm_61", - "Quadro P500": "sm_61", - # GeForce and TITAN Products - "GeForce RTX 3060 Ti": "sm_86", - "GeForce RTX 3060": "sm_86", - "GeForce RTX 3090": "sm_86", - "GeForce RTX 3080": "sm_86", - "GeForce RTX 3070": "sm_86", - "GeForce GTX 1650 Ti": "sm_75", - "NVIDIA TITAN RTX": "sm_75", - "GeForce RTX 2080 Ti": "sm_75", - "GeForce RTX 2080": "sm_75", - "GeForce RTX 2070": "sm_75", - "GeForce RTX 2060": "sm_75", - "NVIDIA TITAN V": "sm_70", - "NVIDIA TITAN Xp": "sm_61", - "NVIDIA TITAN X": "sm_61", - "GeForce GTX 1080 Ti": "sm_61", - "GeForce GTX 1080": "sm_61", - "GeForce GTX 1070 Ti": "sm_61", - "GeForce GTX 1070": "sm_61", - "GeForce GTX 1060": "sm_61", - "GeForce GTX 1050": "sm_61", - "GeForce RTX 3050 Ti": "sm_86", - "GeForce RTX 3050": "sm_86", - # Jetson Products - "Jetson AGX Xavier": "sm_72", -} - -# mapping of AMD architectures given the GPU name -# https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst#id145 -amd_arch_mapping = { - "Radeon Pro VII": "gfx906", - "Radeon VII": "gfx906", - "Radeon Instinct MI50": "gfx906", - "Radeon Instinct MI6": "gfx906", - "Ryzen 3 2200G": "gfx902", - "Ryzen 5 2400G": "gfx902", - "Radeon Vega Frontier Edition": "gfx900", - "Radeon RX Vega 56": "gfx900", - "Radeon RX Vega 64": "gfx900", - "Radeon RX Vega 64 Liquid Cooled": "gfx900", - "Radeon Instinct MI25": "gfx900", - "Radeon RX 460": "gfx803", - "Radeon RX 470": "gfx803", - "Radeon RX 480": "gfx803", - "Radeon R9 Nano": "gfx803", - "Radeon R9 Fury": "gfx803", - "Radeon R9 FuryX": "gfx803", - "Radeon Pro Duo FirePro S9300x2": "gfx803", - "Radeon Instinct MI8": "gfx803", -} - -# output mapped name -for name in gpu_names: - found_name = False - for key in nvidia_compute_capability_mapping: - if re.search(key, name, re.IGNORECASE): - print(" {}: {}".format(name, nvidia_compute_capability_mapping[key])) - found_name = True - break - - for key in amd_arch_mapping: - name_cleaned = name.replace("AMD", "").strip() - name_cleaned = name_cleaned.replace("(TM) ", "").strip() - if re.search(key, name_cleaned, re.IGNORECASE): - print(" {}: {}".format(name_cleaned, amd_arch_mapping[key])) - found_name = True - break - - if not found_name: - raise RuntimeError("Unrecognized GPU name '{}'".format(name)) diff --git a/utility_scripts/plssvm_target_platforms.py b/utility_scripts/plssvm_target_platforms.py new file mode 100644 index 000000000..3047513ce --- /dev/null +++ b/utility_scripts/plssvm_target_platforms.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +######################################################################################################################## +# Authors: Alexander Van Craen, Marcel Breyer # +# Copyright (C): 2018-today The PLSSVM project - All Rights Reserved # +# License: This file is part of the PLSSVM project which is released under the MIT license. # +# See the LICENSE.md file in the project root for full license information. 
# +######################################################################################################################## + +import argparse +import re + +import cpuinfo # get CPU SIMD information +import GPUtil # get NVIDIA GPU information +import pyamdgpuinfo # get AMD GPU information +import pylspci # get Intel GPU information + +# parse command line arguments +parser = argparse.ArgumentParser() +parser.add_argument("--quiet", help="only output the final PLSSVM_TARGET_PLATFORMS string", action="store_true") +args = parser.parse_args() + + +def cond_print(msg=""): + if not args.quiet: + print(msg) + + +# mapping of NVIDIA compute capabilities given the GPU name +# only GPUs with compute capability greater or equal than 6.0 are support +# https://developer.nvidia.com/cuda-gpus +nvidia_compute_capability_mapping = { + # Datacenter Products + "NVIDIA A100": "sm_80", + "NVIDIA A40": "sm_86", + "NVIDIA A30": "sm_80", + "NVIDIA A10": "sm_86", + "NVIDIA A16": "sm_86", + "NVIDIA T4": "sm_75", + "NVIDIA V100": "sm_70", + "Tesla P100": "sm_60", + "Tesla P40": "sm_61", + "Tesla P4": "sm_61", + # NVIDIA Quadro and NVIDIA RTX + "RTX A6000": "sm_86", + "RTX A5000": "sm_86", + "RTX A4000": "sm_86", + "T1000": "sm_75", + "T600": "sm_75", + "T400": "sm_75", + "Quadro RTX 8000": "sm_75", + "Quadro RTX 6000": "sm_75", + "Quadro RTX 5000": "sm_75", + "Quadro RTX 4000": "sm_75", + "Quadro GV100": "sm_70", + "Quadro GP100": "sm_60", + "Quadro P6000": "sm_61", + "Quadro P5000": "sm_61", + "Quadro P4000": "sm_61", + "Quadro P2200": "sm_61", + "Quadro P2000": "sm_61", + "Quadro P1000": "sm_61", + "Quadro P620": "sm_61", + "Quadro P600": "sm_61", + "Quadro P400": "sm_61", + "RTX A3000": "sm_86", + "RTX A2000": "sm_86", + "RTX 5000": "sm_75", + "RTX 4000": "sm_75", + "RTX 3000": "sm_75", + "T2000": "sm_75", + "T1200": "sm_75", + "T500": "sm_75", + "P620": "sm_61", + "P520": "sm_61", + "Quadro P5200": "sm_61", + "Quadro P4200": "sm_61", + "Quadro P3200": "sm_61", + "Quadro P3000": "sm_61", + "Quadro P500": "sm_61", + # GeForce and TITAN Products + "GeForce RTX 3060 Ti": "sm_86", + "GeForce RTX 3060": "sm_86", + "GeForce RTX 3090": "sm_86", + "GeForce RTX 3080": "sm_86", + "GeForce RTX 3070": "sm_86", + "GeForce GTX 1650 Ti": "sm_75", + "NVIDIA TITAN RTX": "sm_75", + "GeForce RTX 2080 Ti": "sm_75", + "GeForce RTX 2080": "sm_75", + "GeForce RTX 2070": "sm_75", + "GeForce RTX 2060": "sm_75", + "NVIDIA TITAN V": "sm_70", + "NVIDIA TITAN Xp": "sm_61", + "NVIDIA TITAN X": "sm_61", + "GeForce GTX 1080 Ti": "sm_61", + "GeForce GTX 1080": "sm_61", + "GeForce GTX 1070 Ti": "sm_61", + "GeForce GTX 1070": "sm_61", + "GeForce GTX 1060": "sm_61", + "GeForce GTX 1050": "sm_61", + "GeForce RTX 3050 Ti": "sm_86", + "GeForce RTX 3050": "sm_86", + # Jetson Products + "Jetson AGX Xavier": "sm_72", +} + +# mapping of AMD architectures given the GPU name +# https://llvm.org/docs/AMDGPUUsage.html +amd_arch_mapping = { + # AMD Radeon GPUs + "Radeon RX 6700 XT": "gfx1031", + "Radeon RX 6800": "gfx1030", + "Radeon RX 6800 XT": "gfx1030", + "Radeon RX 6900 XT": "gfx1030", + "Radeon RX 5500": "gfx1012", + "Radeon RX 5500 XT": "gfx1012", + "Radeon Pro V520": "gfx1011", + "Radeon RX 5700": "gfx1010", + "Radeon RX 5700 XT": "gfx1010", + "Radeon Pro 5600 XT": "gfx1010", + "Radeon Pro 5600M": "gfx1010", + "Radeon Instinct MI100 Accelerator": "gfx908", + "Radeon Pro VII": "gfx906", + "Radeon VII": "gfx906", + "Radeon Instinct MI50": "gfx906", + "Radeon Instinct MI60": "gfx906", + "Radeon Vega Frontier Edition": "gfx900", + "Radeon RX Vega 56": 
"gfx900", + "Radeon RX Vega 64": "gfx900", + "Radeon RX Vega 64 Liquid": "gfx900", + "Radeon Instinct MI25": "gfx900", + "Radeon RX 460": "gfx803", + "Radeon Instinct MI6": "gfx803", + "Radeon RX 470": "gfx803", + "Radeon RX 480": "gfx803", + "Radeon Instinct MI8": "gfx803", + "Radeon R9 Nano": "gfx803", + "Radeon R9 Fury": "gfx803", + "Radeon R9 FuryX": "gfx803", + "Radeon Pro Duo": "gfx803", + "Radeon R9 285": "gfx802", + "Radeon R9 380": "gfx802", + "Radeon R9 385": "gfx802", + # AMD Ryzen iGPUs + "Ryzen 7 4700G": "gfx90c", + "Ryzen 7 4700GE": "gfx90c", + "Ryzen 5 4600G": "gfx90c", + "Ryzen 5 4600GE": "gfx90c", + "Ryzen 3 4300G": "gfx90c", + "Ryzen 3 4300GE": "gfx90c", + "Ryzen Pro 4000G": "gfx90c", + "Ryzen 7 Pro 4700G": "gfx90c", + "Ryzen 7 Pro 4750GE": "gfx90c", + "Ryzen 5 Pro 4650G": "gfx90c", + "Ryzen 5 Pro 4650GE": "gfx90c", + "Ryzen 3 Pro 4350G": "gfx90c", + "Ryzen 3 Pro 4350GE": "gfx90c", + "Ryzen 3 2200G": "gfx902", + "Ryzen 5 2400G": "gfx902", + # other AMD targets + "FirePro S7150": "gfx805", + "FirePro S7100": "gfx805", + "FirePro W7100": "gfx805", + "Mobile FirePro M7170": "gfx805", + "FirePro S9300x2": "gfx803", + "A6-8500P": "gfx801", + "Pro A6-8500B": "gfx801", + "A8-8600P": "gfx801", + "Pro A8-8600B": "gfx801", + "FX-8800P": "gfx801", + "Pro A12-8800B": "gfx801", + "A10-8700P": "gfx801", + "Pro A10-8700B": "gfx801", + "A10-8780P": "gfx801", + "A10-9600P": "gfx801", + "A12-9700P": "gfx801", + "A12-9730P": "gfx801", + "FX-9800P": "gfx801", + "FX-9830P": "gfx801", + "E2-9010": "gfx801", + "A6-9210": "gfx801", + "A9-9410": "gfx801", +} + +# mapping of Intel architecture names +# https://dgpu-docs.intel.com/devices/hardware-table.html +# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html +intel_arch_mapping = { + # Skylake + "skl": ["192A", "1932", "193B", "193A", "193D", "1923", "1926", "1927", "192B", "192D", "1912", "191B", "1913", + "1915", "1917", "191A", "1916", "1921", "191D", "191E", "1902", "1906", "190B", "190A", "190E"], + # Gemini Lake + "glk": ["3185", "3184"], + # Apollo Lake + "Gen9": ["1A85", "5A85", "0A84", "1A84", "5A84"], + # Kaby Lake + "kbl": ["593B", "5923", "5926", "5927", "5917", "5912", "591B", "5916", "5921", "591A", "591D", "591E", "591C", + "87C0", "5913", "5915", "5902", "5906", "590B", "590A", "5908", "590E"], + # Coffee Lake + "cfl": ["3EA5", "3EA8", "3EA6", "3EA7", "3EA2", "3E90", "3E93", "3E99", "3E9C", "3EA1", "9BA5", "9BA8", "3EA4", + "9B21", "9BA0", "9BA2", "9BA4", "9BAA", "9BAB", "9BAC", "87CA", "3EA3", "9B41", "9BC0", "9BC2", "9BC4", + "9BCA", "9BCB", "9BCC", "3E91", "3E92", "3E98", "3E9B", "9BC5", "9BC8", "3E96", "3E9A", "3E94", "9BC6", + "9BE6", "9BF6", "3EA9", "3EA0"], + # Ice Lake + "icllp": ["8A71", "8A56", "8A58", "8A5B", "8A5D", "8A54", "8A5A", "8A5C", "8A57", "8A59", "8A50", "8A51", "8A52", + "8A53"], + # Tiger Lake + "tgllp": ["9A60", "9A68", "9A70", "9A40", "9A49", "9A78", "9AC0", "9AC9", "9AD9", "9AF8"], + # Xe MAX + "dg1": ["4905"], +} +intel_arch_to_name_mapping = { + "skl": "Skylake with Intel Processor Graphics Gen9", + "kbl": "Kaby Lake with Intel Processor Graphics Gen9", + "cfl": "Coffee Lake with Intel Processor Graphics Gen9", + "glk": "Gemini Lake with Intel Processor Graphics Gen9", + "icllp": "Ice Lake with Intel Processor Graphics Gen11", + "tgllp": "Tiger Lake with Intel Processor Graphics Gen12", + "dg1": "Intel Iris Xe MAX graphics", + "Gen9": "Intel Processor Graphics Gen9", + "Gen11": "Intel Processor 
Graphics Gen11", + "Gen12LP": "Intel Processor Graphics Gen12 (Lower Power)", +} + + +# construct PLSSVM_TARGET_PLATFORMS string +plssvm_target_platforms = "" + +# CPU SIMD information for cpu target +simd_version_support = { + "avx512": False, + "avx2": False, + "avx": False, + "sse4_2": False, +} + +cpu_info = cpuinfo.get_cpu_info() + +for flag in cpu_info["flags"]: + for key in simd_version_support: + if flag == key: + simd_version_support[key] = True + if flag.startswith("avx512"): + simd_version_support["avx512"] = True + +cond_print("{}: {}\n".format(cpu_info["brand_raw"], simd_version_support)) + +newest_simd_version = "" +for key in simd_version_support: + if simd_version_support[key]: + newest_simd_version = key + break +plssvm_target_platforms += "cpu" + ("" if "".__eq__(newest_simd_version) else ":") + newest_simd_version + + +# NVIDIA GPU information +nvidia_gpu_names = [gpu.name for gpu in GPUtil.getGPUs()] +nvidia_num_gpus = len(nvidia_gpu_names) + +if nvidia_num_gpus > 0: + nvidia_gpus = {x: nvidia_gpu_names.count(x) for x in nvidia_gpu_names} + nvidia_gpu_sm = {} + # get NVIDIA SM from GPU name + for name in nvidia_gpus: + found_name = False + for key in nvidia_compute_capability_mapping: + if re.search(key, name, re.IGNORECASE): + nvidia_gpu_sm[name] = nvidia_compute_capability_mapping[key] + found_name = True + break + + if not found_name: + raise RuntimeError("Unrecognized GPU name '{}'".format(name)) + + cond_print("Found {} NVIDIA GPU(s):".format(nvidia_num_gpus)) + for name in nvidia_gpus: + cond_print(" {}x {}: {}".format(nvidia_gpus[name], name, nvidia_gpu_sm[name])) + cond_print() + + plssvm_target_platforms += ";nvidia:" + ",".join({str(sm) for sm in nvidia_gpu_sm.values()}) + + +# AMD GPU information +amd_gpu_names = [pyamdgpuinfo.get_gpu(gpu_id).name for gpu_id in range(pyamdgpuinfo.detect_gpus())] +amd_num_gpus = len(amd_gpu_names) + +if amd_num_gpus > 0: + amd_gpus = {x: amd_gpu_names.count(x) for x in amd_gpu_names} + amd_gpu_arch = {} + # get AMD gfx from GPU name + for name in amd_gpus: + found_name = False + for key in amd_arch_mapping: + name_cleaned = name.replace("AMD", "").strip() + name_cleaned = name_cleaned.replace("(TM) ", "").strip() + if re.search(key, name_cleaned, re.IGNORECASE): + amd_gpu_arch[name] = amd_arch_mapping[key] + found_name = True + break + + if not found_name: + raise RuntimeError("Unrecognized GPU name '{}'".format(name)) + + cond_print("Found {} AMD GPU(s):".format(amd_num_gpus)) + for name in amd_gpus: + cond_print(" {}x {}: {}".format(amd_gpus[name], name, amd_gpu_arch[name])) + cond_print() + + plssvm_target_platforms += ";amd:" + ",".join({str(sm) for sm in amd_gpu_arch.values()}) + + +# Intel GPU information +intel_gpu_names = [] +for device in pylspci.parsers.SimpleParser().run(): + if re.search("VGA", str(device.cls), re.IGNORECASE): + for key in intel_arch_mapping: + if any(re.search(arch, str(device.device), re.IGNORECASE) for arch in intel_arch_mapping[key]): + intel_gpu_names.append(str(device.device)) + break +intel_num_gpus = len(intel_gpu_names) + +if intel_num_gpus > 0: + intel_gpus = {x: intel_gpu_names.count(x) for x in intel_gpu_names} + intel_gpu_arch = {} + for name in intel_gpus: + for key in intel_arch_mapping: + if any(re.search(arch, name, re.IGNORECASE) for arch in intel_arch_mapping[key]): + intel_gpu_arch[name] = key + break + + cond_print("Found {} Intel (i)GPU(s):".format(intel_num_gpus)) + for name in intel_gpus: + cond_print(" {}x {} ({}): {}".format(intel_gpus[name], name, + 
intel_arch_to_name_mapping[intel_gpu_arch[name]],
+                                            intel_gpu_arch[name]))
+    cond_print()
+
+    plssvm_target_platforms += ";intel:" + ",".join({str(sm) for sm in intel_gpu_arch.values()})
+
+
+cond_print("Possible -DPLSSVM_TARGET_PLATFORMS entries:")
+print("\"{}\"".format(plssvm_target_platforms))

From 05086fffc5f3c97426310682a23aff0861fc5161 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Wed, 2 Mar 2022 10:29:59 +0100
Subject: [PATCH 22/56] Update README to reflect changes in SYCL DPC++ target
 platforms handling and in the plssvm_target_platforms.py script.

---
 README.md | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 3f963626d..731844cfa 100644
--- a/README.md
+++ b/README.md
@@ -54,40 +54,50 @@ Building the library can be done using the normal CMake approach:
 The **required** CMake option `PLSSVM_TARGET_PLATFORMS` is used to determine for which targets the backends should be compiled.
 Valid targets are:
- - `cpu`: compile for the CPU; **no** architectural specifications is allowed
- - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g. `nvidia:sm_86,sm_70`
- - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g. `amd:gfx906`
- - `intel`: compile for Intel GPUs; **no** architectural specification is allowed
+ - `cpu`: compile for the CPU; an **optional** architectural specification is allowed but only used when compiling with DPC++, e.g., `cpu:avx2`
+ - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g., `nvidia:sm_86,sm_70`
+ - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g., `amd:gfx906`
+ - `intel`: compile for Intel GPUs; **at least one** architectural specification is necessary, e.g., `intel:skl`

 At least one of the above targets must be present.
-To retrieve the architectural specification, given an NVIDIA or AMD GPU name, a simple Python3 script `utility/gpu_name_to_arch.py` is provided
-(requiring Python3 [`argparse`](https://docs.python.org/3/library/argparse.html) as dependency):
+Note that when using DPC++ only a single architectural specification for `cpu` or `amd` is allowed.
+
+To retrieve the architectural specifications of the current system, a simple Python3 script `utility/plssvm_target_platforms.py` is provided
+(required Python3 dependencies:
+[`argparse`](https://docs.python.org/3/library/argparse.html), [`py-cpuinfo`](https://pypi.org/project/py-cpuinfo/),
+[`GPUtil`](https://pypi.org/project/GPUtil/), [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/), and
+[`pylspci`](https://pypi.org/project/pylspci/))

 ```bash
-> python3 utility/gpu_name_to_arch.py --help
-usage: gpu_name_to_arch.py [-h] [--name NAME]
+> python3 utility/plssvm_target_platforms.py --help
+usage: plssvm_target_platforms.py [-h] [--quiet]

 optional arguments:
-  -h, --help   show this help message and exit
-  --name NAME  the full name of the GPU (e.g.
GeForce RTX 3080) + -h, --help show this help message and exit + --quiet only output the final PLSSVM_TARGET_PLATFORMS string ``` Example invocations: ```bash -> python3 utility_scripts/gpu_name_to_arch.py --name "GeForce RTX 3080" -sm_86 -> python3 utility_scripts/gpu_name_to_arch.py --name "Radeon VII" -gfx906 -``` +> python3 utility_scripts/plssvm_target_platforms.py +Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz: {'avx512': True, 'avx2': True, 'avx': True, 'sse4_2': True} + +Found 1 NVIDIA GPU(s): + 1x NVIDIA GeForce RTX 3080: sm_86 -If no GPU name is provided, the script tries to automatically detect any NVIDIA or AMD GPU -(requires the Python3 dependencies [`GPUtil`](https://pypi.org/project/GPUtil/) and [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/)). +Possible -DPLSSVM_TARGET_PLATFORMS entries: +cpu:avx512;nvidia:sm_86 + +> python3 utility_scripts/plssvm_target_platforms.py --quiet +cpu:avx512;intel:dg1 +``` If the architectural information for the requested GPU could not be retrieved, one option would be to have a look at: - for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) - - for AMD GPUs: [ROCm Documentation](https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst) + - for AMD GPUs: [clang AMDGPU backend usage](https://llvm.org/docs/AMDGPUUsage.html) + - for Intel GPUs and CPUs: [Ahead of Time Compilation](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) and [Intel graphics processor table](https://dgpu-docs.intel.com/devices/hardware-table.html) #### Optional CMake Options From f240e25b4df18d50919d7ddc9ecd9e90e17d4d37 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 10:50:32 +0100 Subject: [PATCH 23/56] Update README: add DPC++ and hipSYCL to SYCL in the beginning, add Python requirements file, fix bug in code sample. --- README.md | 5 +++-- install/python_requirements.txt | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 install/python_requirements.txt diff --git a/README.md b/README.md index 731844cfa..8e9a1c184 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The currently available backends are: - [OpenMP](https://www.openmp.org/) - [CUDA](https://developer.nvidia.com/cuda-zone) - [OpenCL](https://www.khronos.org/opencl/) - - [SYCL](https://www.khronos.org/sycl/) + - [SYCL](https://www.khronos.org/sycl/) (tested implementations are [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL)) ## Getting Started @@ -20,6 +20,7 @@ General dependencies: - [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically build during the CMake configuration if `find_package(GTest)` wasn't successful) - [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed-up file parsing + - multiple Python3 modules used in the utility scripts;
to install all modules use `pip install --user -r install/python_requirements.txt` Additional dependencies for the OpenMP backend: - compiler with OpenMP support @@ -300,7 +301,7 @@ A simple C++ program (`main.cpp`) using this library could look like: #include #include -int main(i) { +int main() { try { // parse SVM parameter from command line plssvm::parameter params; diff --git a/install/python_requirements.txt b/install/python_requirements.txt new file mode 100644 index 000000000..4e86a4e9a --- /dev/null +++ b/install/python_requirements.txt @@ -0,0 +1,11 @@ +### optional and required python packages +argparse +sklearn +py-cpuinfo +GPUtil +pyamdgpuinfo +pylspci +numpy +pandas +arff +matplotlib \ No newline at end of file From 489915cc6b7a46156900a1fd4bd874509682a714 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 11:05:08 +0100 Subject: [PATCH 24/56] Remove leading > in code blocks. --- README.md | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8e9a1c184..d4ed4d1f2 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ Additional dependencies if `PLSSVM_ENABLE_TESTING` and `PLSSVM_GENERATE_TEST_FIL Building the library can be done using the normal CMake approach: ```bash -> git clone git@gitlab-sim.informatik.uni-stuttgart.de:vancraar/Bachelor-Code.git SVM -> cd SVM/SVM -> mkdir build && cd build -> cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] .. -> cmake --build . +git clone git@github.com:SC-SGS/PLSSVM.git +cd PLSSVM +mkdir build && cd build +cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] .. +cmake --build . ``` #### Target Platform Selection @@ -71,7 +71,7 @@ To retrieve the architectural specifications of the current system, a simple Pyt [`pylspci`](https://pypi.org/project/pylspci/)) ```bash -> python3 utility/plssvm_target_platforms.py --help +python3 utility/plssvm_target_platforms.py --help usage: plssvm_target_platforms.py [-h] [--quiet] optional arguments: @@ -82,7 +82,7 @@ optional arguments: Example invocations: ```bash -> python3 utility_scripts/plssvm_target_platforms.py +python3 utility_scripts/plssvm_target_platforms.py Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz: {'avx512': True, 'avx2': True, 'avx': True, 'sse4_2': True} Found 1 NVIDIA GPU(s): @@ -91,7 +91,7 @@ Found 1 NVIDIA GPU(s): Possible -DPLSSVM_TARGET_PLATFORMS entries: cpu:avx512;nvidia:sm_86 -> python3 utility_scripts/plssvm_target_platforms.py --quiet +python3 utility_scripts/plssvm_target_platforms.py --quiet cpu:avx512;intel:dg1 ``` @@ -146,7 +146,7 @@ To use DPC++ as compiler simply set the `CMAKE_CXX_COMPILER` to the respective D To run the tests after building the library (with `PLSSVM_ENABLE_TESTING` set to `ON`) use: ```bash -> ctest +ctest ``` ### Generating test coverage results @@ -155,10 +155,10 @@ To enable the generation of test coverage reports using `locv` the library must Additionally, it's advisable to use smaller test files to shorten the `ctest` step. ```bash -> cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \ - -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \ - -DPLSSVM_TEST_FILE_NUM_FEATURES=50 .. -> cmake --build . -- coverage +cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \ + -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \ + -DPLSSVM_TEST_FILE_NUM_FEATURES=50 .. +cmake --build . -- coverage ``` The resulting `html` coverage report is located in the `coverage` folder in the build directory. 
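For example, the report can then be opened directly in a browser (illustrative only; this assumes `lcov`'s usual `index.html` entry page and a Linux desktop providing `xdg-open`, neither of which is stated in the text above):

```bash
# from the build directory: open the top-level page of the HTML coverage report
xdg-open coverage/index.html
```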
@@ -167,7 +167,7 @@ The resulting `html` coverage report is located in the
 If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON` the documentation can be built using

 ```bash
-> make doc
+make doc
 ```

 The documentation of the current state of the main branch can be found [here](https://sc-sgs.github.io/PLSSVM/).
@@ -176,7 +176,7 @@ The documentation of the current state of the main branch can be found [here](ht
 The library supports the `install` target:

 ```bash
-> cmake --build . -- install
+cmake --build . -- install
 ```

 ## Usage
@@ -193,7 +193,7 @@ In order to use all functionality, the following Python3 modules must be install
 [`mpl_toolkits`](https://pypi.org/project/matplotlib/)

 ```bash
-> python3 utility_scripts/generate_data**.py --help
+python3 utility_scripts/generate_data.py --help
 usage: generate_data.py [-h] --output OUTPUT --format FORMAT [--problem PROBLEM] --samples SAMPLES [--test_samples TEST_SAMPLES] --features FEATURES [--plot]

 optional arguments:
@@ -211,13 +211,13 @@ optional arguments:
 An example invocation generating a data set consisting of blobs with 1000 data points with 200 features each could look like:

 ```bash
-> python3 generate_data.py --ouput data_file --format libsvm --problem blobs --samples 1000 --features 200
+python3 generate_data.py --output data_file --format libsvm --problem blobs --samples 1000 --features 200
 ```

 ### Training

 ```bash
-> ./svm-train --help
+./svm-train --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-train [OPTION...] training_set_file [model_file]

@@ -243,13 +243,13 @@ Usage:
 An example invocation using the CUDA backend could look like:

 ```bash
-> ./svm-train --backend cuda --input /path/to/data_file
+./svm-train --backend cuda --input /path/to/data_file
 ```

 Another example targeting NVIDIA GPUs using the SYCL backend looks like:

 ```bash
-> ./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
+./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
 ```

 The `--target_platform=automatic` flag works for the different backends as follows:
@@ -262,7 +262,7 @@ The `--target_platform=automatic` flags works for the different backends as foll
 ### Predicting

 ```bash
-> ./svm-predict --help
+./svm-predict --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-predict [OPTION...] test_file model_file [output_file]

@@ -279,13 +279,13 @@ Usage:
 An example invocation could look like:

 ```bash
-> ./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
 ```

 Another example targeting NVIDIA GPUs using the SYCL backend looks like:

 ```bash
-> ./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
 ```

 The `--target_platform=automatic` flag works like in the training (`./svm-train`) case.

From 97f28deef4b52a731fe395e225c1e18eb6a15396 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 09:44:27 +0100
Subject: [PATCH 25/56] Add SYCL kernel invocation type to allow switching from
 the nd_range formulation to the hierarchical one via a command line switch
 (SYCL only).
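A minimal sketch of the difference between the two invocation types (illustrative only, not code from this patch; `q` is a `sycl::queue`, `data` is a device-accessible pointer, and the range sizes are placeholders):

```cpp
#include "sycl/sycl.hpp"

// nd_range formulation: the global and local iteration ranges are given explicitly
void add_one_nd_range(::sycl::queue &q, double *data) {
    q.parallel_for(::sycl::nd_range<1>{ ::sycl::range<1>{ 1024 }, ::sycl::range<1>{ 128 } },
                   [=](::sycl::nd_item<1> idx) { data[idx.get_global_id(0)] += 1.0; });
}

// hierarchical formulation: an explicit work-group scope with a nested work-item scope
void add_one_hierarchical(::sycl::queue &q, double *data) {
    q.submit([&](::sycl::handler &cgh) {
        // 8 work-groups of 128 work-items each, i.e., the same 1024 global iterations
        cgh.parallel_for_work_group(::sycl::range<1>{ 8 }, ::sycl::range<1>{ 128 }, [=](::sycl::group<1> g) {
            g.parallel_for_work_item([&](::sycl::h_item<1> item) {
                data[item.get_global_id(0)] += 1.0;
            });
        });
    });
}
```

The hierarchical form does not rely on nd_range work-group barriers, which is why the automatic setting introduced in this patch prefers it for hipSYCL on the CPU.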
--- include/plssvm/backends/SYCL/csvm.hpp | 9 +- .../backends/SYCL/kernel_invocation_type.hpp | 46 +++ ...kernel.hpp => svm_kernel_hierarchical.hpp} | 12 +- .../backends/SYCL/svm_kernel_nd_range.hpp | 378 ++++++++++++++++++ include/plssvm/core.hpp | 2 + include/plssvm/parameter.hpp | 10 +- include/plssvm/parameter_train.hpp | 1 + src/main_predict.cpp | 10 +- src/main_train.cpp | 11 +- ...kernel.cpp => svm_kernel_hierarchical.cpp} | 0 src/plssvm/backends/SYCL/CMakeLists.txt | 1 + src/plssvm/backends/SYCL/csvm.cpp | 100 +++-- .../backends/SYCL/kernel_invocation_type.cpp | 49 +++ src/plssvm/parameter.cpp | 30 +- src/plssvm/parameter_train.cpp | 4 + 15 files changed, 609 insertions(+), 54 deletions(-) create mode 100644 include/plssvm/backends/SYCL/kernel_invocation_type.hpp rename include/plssvm/backends/SYCL/{svm_kernel.hpp => svm_kernel_hierarchical.hpp} (95%) create mode 100644 include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp rename src/plssvm/backends/OpenMP/{svm_kernel.cpp => svm_kernel_hierarchical.cpp} (100%) create mode 100644 src/plssvm/backends/SYCL/kernel_invocation_type.cpp diff --git a/include/plssvm/backends/SYCL/csvm.hpp b/include/plssvm/backends/SYCL/csvm.hpp index 1ecda3705..4d2018fff 100644 --- a/include/plssvm/backends/SYCL/csvm.hpp +++ b/include/plssvm/backends/SYCL/csvm.hpp @@ -11,8 +11,9 @@ #pragma once -#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr -#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm #include "sycl/sycl.hpp" // sycl::queue @@ -105,6 +106,10 @@ class csvm : public ::plssvm::detail::gpu_csvm; diff --git a/include/plssvm/backends/SYCL/kernel_invocation_type.hpp b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp new file mode 100644 index 000000000..1bd291d04 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp @@ -0,0 +1,46 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines all available kernel invoke types when using SYCL. + */ + +#pragma once + +#include // forward declare std::ostream and std::istream + +namespace plssvm::sycl { + +/** + * @brief Enum class for all possible SYCL kernel invocation types. + */ +enum class kernel_invocation_type { + /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ + automatic, + /** Use the [*nd_range* invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */ + nd_range, + /** Use the SYCL specific [hierarchical invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_hierarchical_invoke). */ + hierarchical +}; + +/** + * @brief Output the @p invocation type to the given output-stream @p out. 
+ * @param[in,out] out the output-stream to write the kernel invocation type to
+ * @param[in] invocation the SYCL kernel invocation type
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation);
+
+/**
+ * @brief Use the input-stream @p in to initialize the @p invocation type.
+ * @param[in,out] in input-stream to extract the kernel invocation type from
+ * @param[out] invocation the SYCL kernel invocation type
+ * @return the input-stream
+ */
+std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation);
+
+} // namespace plssvm::sycl
diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
similarity index 95%
rename from include/plssvm/backends/SYCL/svm_kernel.hpp
rename to include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
index 1ac1df6b6..bb6c0a75a 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
@@ -28,7 +28,7 @@ namespace plssvm::sycl {
  * @tparam T the type of the data
  */
 template <typename T>
-class device_kernel_linear {
+class hierarchical_device_kernel_linear {
   public:
     /// The type of the data.
     using real_type = T;
@@ -48,7 +48,7 @@ class device_kernel_linear {
      * @param[in] add denotes whether the values are added or subtracted from the result vector
      * @param[in] id the id of the device
      */
-    device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+    hierarchical_device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
         queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}

     /**
@@ -202,7 +202,7 @@ class device_kernel_linear {
  * @tparam T the type of the data
  */
 template <typename T>
-class device_kernel_poly {
+class hierarchical_device_kernel_poly {
   public:
     /// The type of the data.
using real_type = T; @@ -224,7 +224,7 @@ class device_kernel_poly { * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + hierarchical_device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** @@ -373,7 +373,7 @@ class device_kernel_poly { * @tparam T the type of the data */ template -class device_kernel_radial { +class hierarchical_device_kernel_radial { public: /// The type of the data. using real_type = T; @@ -393,7 +393,7 @@ class device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + hierarchical_device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** diff --git a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp new file mode 100644 index 000000000..7d39c9867 --- /dev/null +++ b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp @@ -0,0 +1,378 @@ +/** +* @file +* @author Alexander Van Craen +* @author Marcel Breyer +* @copyright 2018-today The PLSSVM project - All Rights Reserved +* @license This file is part of the PLSSVM project which is released under the MIT license. +* See the LICENSE.md file in the project root for full license information. +* +* @brief Defines the kernel functions for the C-SVM using the SYCL backend. 
+*/
+
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+#include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+
+#include "sycl/sycl.hpp"  // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow,
+                          // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space
+
+#include <cstddef>  // std::size_t
+
+namespace plssvm::sycl {
+
+// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations
+/**
+* @brief Shortcut alias for a SYCL local accessor.
+* @tparam T the type of the accessed values
+*/
+template <typename T>
+using local_accessor = ::sycl::accessor<T, 2, ::sycl::access::mode::read_write, ::sycl::access::target::local>;
+
+/**
+* @brief Calculates the C-SVM kernel using the linear kernel function.
+* @details Supports multi-GPU execution.
+* @tparam T the type of the data
+*/
+template <typename T>
+class nd_range_device_kernel_linear {
+  public:
+    /// The type of the data.
+    using real_type = T;
+
+    /**
+     * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel.
+     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimensional data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] feature_range number of features used for the calculation on the device @p id
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] id the id of the device
+     */
+    nd_range_device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+ * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device_ == 0) { + temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + } else { + temp = matr[x][y] * add_; + } + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + if (device_ == 0) { + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } else { + ret_jx += temp * d_[i + y]; + } + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type feature_range_; + const real_type add_; + const kernel_index_type device_; +}; + +/** +* @brief Calculates the C-SVM kernel using the polynomial kernel function. +* @details Currently only single GPU execution is supported. +* @tparam T the type of the data +*/ +template +class nd_range_device_kernel_poly { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. 
+ * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimension data matrix + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ + nd_range_device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const 
real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; + const real_type add_; + const int degree_; + const real_type gamma_; + const real_type coef0_; +}; + +/** +* @brief Calculates the C-SVM kernel using the radial basis functions kernel function. +* @details Currently only single GPU execution is supported. +* @tparam T the type of the data +*/ +template +class nd_range_device_kernel_radial { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. + * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimension data matrix + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ + nd_range_device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; + const real_type add_; + const real_type gamma_; +}; + +} // namespace plssvm::sycl diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 28e2c5d77..62336def6 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -25,6 +25,8 @@ #include "plssvm/exceptions/exceptions.hpp" #include "plssvm/version/version.hpp" +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" + /// The main namespace containing all public API functions. 
namespace plssvm {} diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index ccc102124..47d9ee323 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -11,9 +11,10 @@ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // forward declare std::ostream #include // std::shared_ptr @@ -191,6 +192,9 @@ class parameter { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD or Intel. target_platform target = target_platform::automatic; + /// The kernel invocation type when using SYCL as backend. + sycl::kernel_invocation_type sycl_kernel_invocation_type = sycl::kernel_invocation_type::automatic; + /// The name of the data/test file to parse. std::string input_filename{}; /// The name of the model file to write the learned support vectors to/to parse the saved model from. diff --git a/include/plssvm/parameter_train.hpp b/include/plssvm/parameter_train.hpp index 223b9c9d6..9bb4903dc 100644 --- a/include/plssvm/parameter_train.hpp +++ b/include/plssvm/parameter_train.hpp @@ -36,6 +36,7 @@ class parameter_train : public parameter { using base_type::kernel; using base_type::print_info; using base_type::target; + using base_type::sycl_kernel_invocation_type; using base_type::input_filename; using base_type::model_filename; diff --git a/src/main_predict.cpp b/src/main_predict.cpp index e83442ad9..ab2a42856 100644 --- a/src/main_predict.cpp +++ b/src/main_predict.cpp @@ -17,7 +17,7 @@ #include // std::chrono #include // std::exception #include // std::ofstream -#include // std::cerr, std::endl +#include // std::cerr, std::clog, std::endl #include // std::vector // perform calculations in single precision if requested @@ -32,6 +32,14 @@ int main(int argc, char *argv[]) { // parse SVM parameter from command line plssvm::parameter_predict params{ argc, argv }; + // warn if kernel invocation type nd_range or hierarchical are explicitly set but SYCL isn't the current backend + if (params.backend != plssvm::backend_type::sycl && params.sycl_kernel_invocation_type != plssvm::sycl::kernel_invocation_type::automatic) { + std::clog << fmt::format( + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}", + params.sycl_kernel_invocation_type) + << std::endl; + } + // output used parameter if (params.print_info) { fmt::print("\n"); diff --git a/src/main_train.cpp b/src/main_train.cpp index a14ae55a1..62e1e2ddf 100644 --- a/src/main_train.cpp +++ b/src/main_train.cpp @@ -10,10 +10,11 @@ #include "plssvm/core.hpp" +#include "fmt/core.h" // std::format #include "fmt/ostream.h" // use operator<< to output enum class #include // std::exception -#include // std::cerr, std::endl +#include // std::cerr, std::clog, std::endl // perform calculations in single precision if requested #ifdef PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION @@ -27,6 +28,14 @@ int main(int argc, char *argv[]) { // parse SVM parameter from command line plssvm::parameter_train params{ argc, argv }; + // warn if kernel invocation type nd_range or 
hierarchical are explicitly set but SYCL isn't the current backend + if (params.backend != plssvm::backend_type::sycl && params.sycl_kernel_invocation_type != plssvm::sycl::kernel_invocation_type::automatic) { + std::clog << fmt::format( + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}", + params.sycl_kernel_invocation_type) + << std::endl; + } + // output used parameter if (params.print_info) { fmt::print("\n"); diff --git a/src/plssvm/backends/OpenMP/svm_kernel.cpp b/src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp similarity index 100% rename from src/plssvm/backends/OpenMP/svm_kernel.cpp rename to src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index cef2b4073..b38db8494 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -45,6 +45,7 @@ set(PLSSVM_SYCL_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp + ${CMAKE_CURRENT_LIST_DIR}/kernel_invocation_type.cpp ${CMAKE_CURRENT_LIST_DIR}/../gpu_csvm.cpp ) diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index eddfcf9b9..5f6dc5890 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -8,20 +8,21 @@ #include "plssvm/backends/SYCL/csvm.hpp" -#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::detail::sycl::device_ptr -#include "plssvm/backends/SYCL/detail/utility.hpp" // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception -#include "plssvm/backends/SYCL/predict_kernel.hpp" // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf -#include "plssvm/backends/SYCL/q_kernel.hpp" // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial -#include "plssvm/backends/SYCL/svm_kernel.hpp" // plssvm::sycl::device_kernel_linear, plssvm::sycl::device_kernel_poly, plssvm::sycl::device_kernel_radial -#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm -#include "plssvm/constants.hpp" // plssvm::kernel_index_type -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::detail::sycl::device_ptr +#include "plssvm/backends/SYCL/detail/utility.hpp" // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception +#include "plssvm/backends/SYCL/predict_kernel.hpp" // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf +#include "plssvm/backends/SYCL/q_kernel.hpp" // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial +#include "plssvm/backends/SYCL/svm_kernel_hierarchical.hpp" // plssvm::sycl::hierarchical_device_kernel_linear, plssvm::sycl::hierarchical_device_kernel_poly, 
plssvm::sycl::hierarchical_device_kernel_radial +#include "plssvm/backends/SYCL/svm_kernel_nd_range.hpp" // plssvm::sycl::nd_range_device_kernel_linear, plssvm::sycl::nd_range_device_kernel_poly, plssvm::sycl::nd_range_device_kernel_radial +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/constants.hpp" // plssvm::kernel_index_type +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/core.h" // fmt::print, fmt::format #include "fmt/ostream.h" // can use fmt using operator<< overloads @@ -35,7 +36,7 @@ namespace plssvm::sycl { template csvm::csvm(const parameter ¶ms) : - base_type{ params } { + base_type{ params }, invocation_type_{ params.sycl_kernel_invocation_type } { // check whether the requested target platform has been enabled switch (target_) { case target_platform::automatic: @@ -62,13 +63,25 @@ csvm::csvm(const parameter ¶ms) : break; } + // set correct kernel invocation type if "automatic" has been provided + if (invocation_type_ == kernel_invocation_type::automatic) { + // always use nd_range except for hipSYCL on the CPU + // TODO: automatic target_platform + if (target_ == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { + invocation_type_ = kernel_invocation_type::hierarchical; + } else { + invocation_type_ = kernel_invocation_type::nd_range; + } + } + if (print_info_) { #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - fmt::print("Using SYCL (hipSYCL) as backend.\n"); + constexpr std::string_view sycl_implementation_name = "hipSYCL"; #endif #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP - fmt::print("Using SYCL (DPC++) as backend.\n"); + constexpr std::string_view sycl_implementation_name = "DPC++"; #endif + fmt::print("Using SYCL ({}) as backend with the kernel invocation type \"{}\".\n", sycl_implementation_name, invocation_type_); } // get all available devices wrt the requested target platform @@ -118,17 +131,31 @@ void csvm::device_synchronize(queue_type &queue) { } template -::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_range &range) { +::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_range &range, const kernel_invocation_type invocation_type) { + PLSSVM_ASSERT(invocation_type != kernel_invocation_type::automatic, "The SYCL kernel invocation type may not be automatic anymore at this point!"); + + // set grid value based on used kernel invocation type + const auto fill_grid = [&](const std::size_t i) { + switch (invocation_type) { + case kernel_invocation_type::nd_range: + return range.grid[i] * range.block[i]; + case kernel_invocation_type::hierarchical: + return range.grid[i]; + case kernel_invocation_type::automatic: + throw backend_exception{ "Can't create native execution range from kernel invocation type automatic!" 
}; + } + }; + if constexpr (I == 1) { - ::sycl::range<1> grid{ range.grid[0] * range.block[0] }; + ::sycl::range<1> grid{ fill_grid(0) }; ::sycl::range<1> block{ range.block[0] }; return ::sycl::nd_range<1>{ grid, block }; } else if constexpr (I == 2) { - ::sycl::range<2> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1] }; + ::sycl::range<2> grid{ fill_grid(0), fill_grid(1) }; ::sycl::range<2> block{ range.block[0], range.block[1] }; return ::sycl::nd_range<2>{ grid, block }; } else if constexpr (I == 3) { - ::sycl::range<3> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1], range.grid[2] * range.block[2] }; + ::sycl::range<3> grid{ fill_grid(0), fill_grid(1), fill_grid(2) }; ::sycl::range<3> block{ range.block[0], range.block[1], range.block[2] }; return ::sycl::nd_range<3>{ grid, block }; } else { @@ -138,7 +165,7 @@ ::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_ template void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<1>(range); + const ::sycl::nd_range execution_range = execution_range_to_native<1>(range, kernel_invocation_type::nd_range); switch (kernel_) { case kernel_type::linear: devices_[device].parallel_for(execution_range, device_kernel_q_linear(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_features)); @@ -156,30 +183,49 @@ void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) { + const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, invocation_type_); switch (kernel_) { case kernel_type::linear: - device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); + }); + } else { + hierarchical_device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); + } break; case kernel_type::polynomial: PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + }); + } else { + hierarchical_device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + } break; case kernel_type::rbf: 
PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); + }); + } else { + hierarchical_device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); + } break; } } template void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<1>(range); + const ::sycl::nd_range execution_range = execution_range_to_native<1>(range, kernel_invocation_type::nd_range); devices_[device].parallel_for(execution_range, device_kernel_w_linear(w_d.get(), data_d_[device].get(), data_last_d_[device].get(), alpha_d.get(), num_data_points_, num_features)); } template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) { - [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, kernel_invocation_type::nd_range); switch (kernel_) { case kernel_type::linear: diff --git a/src/plssvm/backends/SYCL/kernel_invocation_type.cpp b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp new file mode 100644 index 000000000..89a8f2348 --- /dev/null +++ b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp @@ -0,0 +1,49 @@ +/** +* @author Alexander Van Craen +* @author Marcel Breyer +* @copyright 2018-today The PLSSVM project - All Rights Reserved +* @license This file is part of the PLSSVM project which is released under the MIT license. +* See the LICENSE.md file in the project root for full license information. 
+*/
+
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"
+
+#include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case
+
+#include <ios> // std::ios::failbit
+#include <istream> // std::istream
+#include <ostream> // std::ostream
+#include <string> // std::string
+
+namespace plssvm::sycl {
+
+std::ostream &operator<<(std::ostream &out, const kernel_invocation_type target) {
+ switch (target) {
+ case kernel_invocation_type::automatic:
+ return out << "automatic";
+ case kernel_invocation_type::nd_range:
+ return out << "nd_range";
+ case kernel_invocation_type::hierarchical:
+ return out << "hierarchical";
+ }
+ return out << "unknown";
+}
+
+std::istream &operator>>(std::istream &in, kernel_invocation_type &target) {
+ std::string str;
+ in >> str;
+ detail::to_lower_case(str);
+
+ if (str == "automatic") {
+ target = kernel_invocation_type::automatic;
+ } else if (str == "nd_range") {
+ target = kernel_invocation_type::nd_range;
+ } else if (str == "hierarchical") {
+ target = kernel_invocation_type::hierarchical;
+ } else {
+ in.setstate(std::ios::failbit);
+ }
+ return in;
+}
+
+} // namespace plssvm::sycl
\ No newline at end of file
diff --git a/src/plssvm/parameter.cpp b/src/plssvm/parameter.cpp
index aa2d5a760..bc1adf4d4 100644
--- a/src/plssvm/parameter.cpp
+++ b/src/plssvm/parameter.cpp
@@ -535,20 +535,21 @@ void parameter::parse_test_file(const std::string &filename) {
template
std::ostream &operator<<(std::ostream &out, const parameter &params) {
return out << fmt::format(
- "kernel_type {}\n"
- "degree {}\n"
- "gamma {}\n"
- "coef0 {}\n"
- "cost {}\n"
- "epsilon {}\n"
- "print_info {}\n"
- "backend {}\n"
- "target platform {}\n"
- "input_filename '{}'\n"
- "model_filename '{}'\n"
- "predict_filename '{}'\n"
- "rho {}\n"
- "real_type {}\n",
+ "kernel_type {}\n"
+ "degree {}\n"
+ "gamma {}\n"
+ "coef0 {}\n"
+ "cost {}\n"
+ "epsilon {}\n"
+ "print_info {}\n"
+ "backend {}\n"
+ "target platform {}\n"
+ "SYCL kernel invocation type {}\n"
+ "input_filename '{}'\n"
+ "model_filename '{}'\n"
+ "predict_filename '{}'\n"
+ "rho {}\n"
+ "real_type {}\n",
params.kernel,
params.degree,
params.gamma,
@@ -558,6 +559,7 @@ std::ostream &operator<<(std::ostream &out, const parameter &params) {
params.print_info,
params.backend,
params.target,
+ params.sycl_kernel_invocation_type,
params.input_filename,
params.model_filename,
params.predict_filename,
diff --git a/src/plssvm/parameter_train.cpp b/src/plssvm/parameter_train.cpp
index 29e716f2c..b4cf3c55c 100644
--- a/src/plssvm/parameter_train.cpp
+++ b/src/plssvm/parameter_train.cpp
@@ -50,6 +50,7 @@ parameter_train::parameter_train(int argc, char **argv) {
("e,epsilon", "set the tolerance of termination criterion", cxxopts::value()->default_value(fmt::format("{}", epsilon)))
("b,backend", "choose the backend: openmp|cuda|opencl|sycl", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", backend))))
("p,target_platform", "choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", target))))
+ ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range|hierarchical", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", sycl_kernel_invocation_type))))
("q,quiet", "quiet mode (no outputs)", cxxopts::value(print_info)->default_value(fmt::format("{}", !print_info)))
("h,help", "print this help message", cxxopts::value())
("input", "", cxxopts::value(), "training_set_file")
@@ -105,6 +106,9 
@@ parameter_train::parameter_train(int argc, char **argv) {
// parse target_platform and cast the value to the respective enum
target = result["target_platform"].as();
+ // parse kernel invocation type when using SYCL as backend
+ sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as();
+
// parse print info
print_info = !print_info;

From 461919a22925e14d89a3782c80e1290bbee47e5b Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 10:13:13 +0100
Subject: [PATCH 26/56] SYCL kernel invocation type now also works with
 automatic target platforms. For OpenCL and SYCL, the actually used target
 platform is now output if only automatic was provided.

---
.../plssvm/backends/OpenCL/detail/utility.hpp | 7 +++---
.../plssvm/backends/SYCL/detail/utility.hpp | 10 ++++----
src/plssvm/backends/OpenCL/csvm.cpp | 13 +++++++----
src/plssvm/backends/OpenCL/detail/utility.cpp | 20 +++++++++-------
src/plssvm/backends/SYCL/csvm.cpp | 16 ++++++++-----
src/plssvm/backends/SYCL/detail/utility.cpp | 23 +++++++++++--------
6 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp
index fad3153dc..fcbdee701 100644
--- a/include/plssvm/backends/OpenCL/detail/utility.hpp
+++ b/include/plssvm/backends/OpenCL/detail/utility.hpp
@@ -46,7 +46,8 @@ namespace plssvm::opencl::detail {
void device_assert(error_code code, std::string_view msg = "");

/**
- * @brief Returns the list devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ * (only interesting if the provided @p target was automatic).
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
* 1. NVIDIA GPUs
* 2. AMD GPUs
* 3. Intel GPUs
* 4. CPUs
*
* @param[in] target the target platform for which the devices must match
- * @return the command queues (`[[nodiscard]]`)
+ * @return the command queues and used target platform (`[[nodiscard]]`)
*/
-[[nodiscard]] std::vector get_command_queues(target_platform target);
+[[nodiscard]] std::pair, target_platform> get_command_queues(target_platform target);

/**
* @brief Wait for the compute device associated with @p queue to finish.
diff --git a/include/plssvm/backends/SYCL/detail/utility.hpp b/include/plssvm/backends/SYCL/detail/utility.hpp
index d6afe8bb1..17f0b60e3 100644
--- a/include/plssvm/backends/SYCL/detail/utility.hpp
+++ b/include/plssvm/backends/SYCL/detail/utility.hpp
@@ -15,12 +15,14 @@

#include "sycl/sycl.hpp" // sycl::queue

-#include <vector> // std::vector
+#include <utility> // std::pair
+#include <vector> // std::vector

namespace plssvm::sycl::detail {

/**
- * @brief Returns the list devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ * (only interesting if the provided @p target was automatic).
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
* 1. NVIDIA GPUs
* 2. AMD GPUs
* 3. Intel GPUs
* 4. 
CPUs * * @param[in] target the target platform for which the devices must match - * @return the devices (`[[nodiscard]]`) + * @return the devices and used target platform (`[[nodiscard]]`) */ -[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target); +[[nodiscard]] std::pair, target_platform> get_device_list(target_platform target); /** * @brief Wait for the compute device associated with @p queue to finish. * @param[in] queue the SYCL queue to synchronize diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 71381a86e..03964ca7f 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -26,6 +26,7 @@ #include // std::terminate #include // std::string +#include // std::tie #include // std::pair, std::make_pair, std::move #include // std::vector @@ -60,14 +61,18 @@ csvm::csvm(const parameter ¶ms) : break; } + // get all available devices wrt the requested target platform + target_platform used_target; + std::tie(devices_, used_target) = detail::get_command_queues(target_); + devices_.resize(std::min(devices_.size(), num_features_)); + if (print_info_) { fmt::print("Using OpenCL as backend.\n"); + if (target_ == target_platform::automatic) { + fmt::print("Using {} as automatic target platform.\n", used_target); + } } - // get all available devices wrt the requested target platform - devices_ = detail::get_command_queues(target_); - devices_.resize(std::min(devices_.size(), num_features_)); - // throw exception if no devices for the requested target could be found if (devices_.empty()) { throw backend_exception{ fmt::format("OpenCL backend selected but no devices for the target {} were found!", target_) }; diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 1ba75d35e..afb4a7d0f 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -45,7 +45,7 @@ void device_assert(const error_code ec, const std::string_view msg) { } } -std::vector get_command_queues_impl(const target_platform target) { +[[nodiscard]] std::vector get_command_queues_impl(const target_platform target) { std::map> platform_devices; // get number of platforms @@ -124,21 +124,25 @@ std::vector get_command_queues_impl(const target_platform target) return command_queues; } -std::vector get_command_queues(const target_platform target) { +std::pair, target_platform> get_command_queues(const target_platform target) { if (target != target_platform::automatic) { - return get_command_queues_impl(target); + return std::make_pair(get_command_queues_impl(target), target); } else { - std::vector target_devices = get_command_queues_impl(target_platform::gpu_nvidia); + target_platform used_target = target_platform::gpu_nvidia; + std::vector target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::gpu_amd); + used_target = target_platform::gpu_amd; + target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::gpu_intel); + used_target = target_platform::gpu_intel; + target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::cpu); + used_target = target_platform::cpu; + target_devices = get_command_queues_impl(used_target); } } } - return target_devices; + return 
std::make_pair(std::move(target_devices), used_target); } } diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 5f6dc5890..669b3508a 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -30,6 +30,7 @@ #include // std::size_t #include // std::terminate +#include // std::tie #include // std::vector namespace plssvm::sycl { @@ -63,11 +64,15 @@ csvm::csvm(const parameter ¶ms) : break; } + // get all available devices wrt the requested target platform + target_platform used_target; + std::tie(devices_, used_target) = detail::get_device_list(target_); + devices_.resize(std::min(devices_.size(), num_features_)); + // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == kernel_invocation_type::automatic) { // always use nd_range except for hipSYCL on the CPU - // TODO: automatic target_platform - if (target_ == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { + if (used_target == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { invocation_type_ = kernel_invocation_type::hierarchical; } else { invocation_type_ = kernel_invocation_type::nd_range; @@ -82,12 +87,11 @@ csvm::csvm(const parameter ¶ms) : constexpr std::string_view sycl_implementation_name = "DPC++"; #endif fmt::print("Using SYCL ({}) as backend with the kernel invocation type \"{}\".\n", sycl_implementation_name, invocation_type_); + if (target_ == target_platform::automatic) { + fmt::print("Using {} as automatic target platform.\n", used_target); + } } - // get all available devices wrt the requested target platform - devices_ = detail::get_device_list(target_); - devices_.resize(std::min(devices_.size(), num_features_)); - // throw exception if no devices for the requested target could be found if (devices_.empty()) { throw backend_exception{ fmt::format("SYCL backend selected but no devices for the target {} were found!", target_) }; diff --git a/src/plssvm/backends/SYCL/detail/utility.cpp b/src/plssvm/backends/SYCL/detail/utility.cpp index cac051ef0..d47550582 100644 --- a/src/plssvm/backends/SYCL/detail/utility.cpp +++ b/src/plssvm/backends/SYCL/detail/utility.cpp @@ -14,8 +14,9 @@ #include "sycl/sycl.hpp" // sycl::queue, sycl::platform, sycl::device, sycl::property::queue, sycl::info, sycl::gpu_selector -#include // std::string -#include // std::vector +#include // std::string +#include // std::pair, std::make_pair +#include // std::vector namespace plssvm::sycl::detail { @@ -70,21 +71,25 @@ namespace plssvm::sycl::detail { return target_devices; } -[[nodiscard]] std::vector<::sycl::queue> get_device_list(const target_platform target) { +std::pair, ::plssvm::target_platform> get_device_list(const target_platform target) { if (target != target_platform::automatic) { - return get_device_list_impl(target); + return std::make_pair(get_device_list_impl(target), target); } else { - std::vector<::sycl::queue> target_devices = get_device_list_impl(target_platform::gpu_nvidia); + target_platform used_target = target_platform::gpu_nvidia; + std::vector<::sycl::queue> target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::gpu_amd); + used_target = target_platform::gpu_amd; + target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::gpu_intel); + used_target = 
target_platform::gpu_intel; + target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::cpu); + used_target = target_platform::cpu; + target_devices = get_device_list_impl(used_target); } } } - return target_devices; + return std::make_pair(std::move(target_devices), used_target); } } From 5b84ea9c1e4efef6a215db455dfe4edc998c1f19 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:31:33 +0100 Subject: [PATCH 27/56] Fix wrong OpenMP svm kernel file name. --- .../OpenMP/{svm_kernel_hierarchical.cpp => svm_kernel.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/plssvm/backends/OpenMP/{svm_kernel_hierarchical.cpp => svm_kernel.cpp} (100%) diff --git a/src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp b/src/plssvm/backends/OpenMP/svm_kernel.cpp similarity index 100% rename from src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp rename to src/plssvm/backends/OpenMP/svm_kernel.cpp From dd704cd81b889d47765ab0e02d90b20fecdd097d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:33:32 +0100 Subject: [PATCH 28/56] Update documentation. --- .../plssvm/backends/SYCL/svm_kernel_nd_range.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp index 7d39c9867..634e1b9bb 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp @@ -6,11 +6,12 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * -* @brief Defines the kernel functions for the C-SVM using the SYCL backend. +* @brief Defines the kernel functions for the C-SVM in the nd_range formulation using the SYCL backend. */ #pragma once +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::atomic_op #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE @@ -30,7 +31,7 @@ template using local_accessor = ::sycl::accessor; /** -* @brief Calculates the C-SVM kernel using the linear kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the linear kernel function. * @details Supports multi-GPU execution. * @tparam T the type of the data */ @@ -41,7 +42,7 @@ class nd_range_device_kernel_linear { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector @@ -150,7 +151,7 @@ class nd_range_device_kernel_linear { }; /** -* @brief Calculates the C-SVM kernel using the polynomial kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the polynomial kernel function. * @details Currently only single GPU execution is supported. 
* @tparam T the type of the data */ @@ -161,7 +162,7 @@ class nd_range_device_kernel_poly { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector @@ -265,7 +266,7 @@ class nd_range_device_kernel_poly { }; /** -* @brief Calculates the C-SVM kernel using the radial basis functions kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ @@ -276,7 +277,7 @@ class nd_range_device_kernel_radial { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector From 64c0a99a141aacffb5b9333e0e998c493fa60730 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:34:58 +0100 Subject: [PATCH 29/56] Update documentation. --- include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp index bb6c0a75a..3202e852c 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp @@ -23,7 +23,7 @@ namespace plssvm::sycl { /** - * @brief Calculates the C-SVM kernel using the linear kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the linear kernel function. * @details Supports multi-GPU execution. * @tparam T the type of the data */ @@ -197,7 +197,7 @@ class hierarchical_device_kernel_linear { }; /** - * @brief Calculates the C-SVM kernel using the polynomial kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the polynomial kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ @@ -368,7 +368,7 @@ class hierarchical_device_kernel_poly { }; /** - * @brief Calculates the C-SVM kernel using the radial basis functions kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ From 7c16ef01f76973d76c736d27e1d55a115fb8a7fa Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:02:59 +0100 Subject: [PATCH 30/56] Update hierarchical svm kernel invocation to be more in line with the nd_range version. 
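

Both formulations are now launched through the same queue::submit() call: the
nd_range kernels via handler::parallel_for and the hierarchical kernels via
handler::parallel_for_work_group, whose first range argument is the number of
work-groups rather than the global size. The following is a minimal,
self-contained sketch of the two launch paths; the trivial kernel functors and
the launch() helper are illustrative placeholders only (the real functors
carry the SVM buffers, see the diff below):

    #include "sycl/sycl.hpp"

    struct nd_range_kernel {
        // called once per work-item
        void operator()(::sycl::nd_item<2>) const { /* per-work-item code */ }
    };

    struct hierarchical_kernel {
        // called once per work-group; work-item code runs in the nested scope
        void operator()(::sycl::group<2> group) const {
            group.parallel_for_work_item([&](::sycl::h_item<2>) { /* per-work-item code */ });
        }
    };

    void launch(::sycl::queue &queue, ::sycl::range<2> grid, ::sycl::range<2> block, bool use_nd_range) {
        queue.submit([&](::sycl::handler &cgh) {
            if (use_nd_range) {
                // nd_range: global size = #work-groups * work-group size
                cgh.parallel_for(::sycl::nd_range<2>{ grid * block, block }, nd_range_kernel{});
            } else {
                // hierarchical: first argument is the number of work-groups
                cgh.parallel_for_work_group(grid, block, hierarchical_kernel{});
            }
        });
    }

Since the hierarchical functors no longer submit to a queue themselves, both
formulations share a single submit path and the functors stay plain callables.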
--- .../backends/SYCL/svm_kernel_hierarchical.hpp | 630 ++++++++---------- src/plssvm/backends/SYCL/csvm.cpp | 36 +- 2 files changed, 305 insertions(+), 361 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp index 3202e852c..f367f45a8 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp @@ -34,9 +34,7 @@ class hierarchical_device_kernel_linear { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. - * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel. * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -48,142 +46,126 @@ class hierarchical_device_kernel_linear { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] id the id of the device */ - hierarchical_device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : - queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} + hierarchical_device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : + q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} /** * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
 */
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type feature_range = feature_range_;
- const real_type add = add_;
- const kernel_index_type device = device_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+ // allocate memory for work-item local variables
+ // -> accessible across different 'parallel_for_work_item' invocations
+ ::sycl::private_memory private_matr{ group };
+ ::sycl::private_memory private_data_j{ group };
+ ::sycl::private_memory private_i{ group };
+ ::sycl::private_memory private_j{ group };
+ ::sycl::private_memory private_cond{ group };
+
+ // initialize private variables
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ // indices and diagonal condition
+ private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+ private_cond(idx) = private_i(idx) >= private_j(idx);
+ if (private_cond(idx)) {
+ private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+ }
+
+ // matrix
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ private_matr(idx)[i][j] = real_type{ 0.0 };
+ }
+ }
+ });

- // matrix
- for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
- private_matr(idx)[i][j] = real_type{ 0.0 };
- }
- }
- });

- // implicit group barrier

- // load data from global in shared memory
- for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- if (private_cond(idx)) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
- const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(1) == idx_1) {
- data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
- }
- const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(0) == idx_2) {
- data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
- }
- }
- }
- });

+ // implicit group barrier
+
+ // load data from global in shared memory
+ for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) 
{ + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - real_type temp; - if (device == 0) { - temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - } else { - temp = private_matr(idx)[x][y] * add; - } - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + 
x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - if (device == 0) { - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } else { - ret_jx += temp * d[private_i(idx) + y]; - } - } + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device_ == 0) { + temp = (private_matr(idx)[x][y] + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + } else { + temp = private_matr(idx)[x][y] * add_; + } + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + if (device_ == 0) { + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; + } else { + ret_jx += temp * d_[private_i(idx) + y]; } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; @@ -208,9 +190,7 @@ class hierarchical_device_kernel_poly { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. - * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel. 
* @param[in] q the `q` vector
* @param[out] ret the result vector
* @param[in] d the right-hand side of the equation
@@ -224,135 +204,117 @@ class hierarchical_device_kernel_poly {
* @param[in] gamma the gamma parameter used in the polynomial kernel function
* @param[in] coef0 the coef0 parameter used in the polynomial kernel function
*/
- hierarchical_device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
- queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
+ hierarchical_device_kernel_poly(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
+ q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}

/**
* @brief Function call operator overload performing the actual calculation.
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
*/
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type num_cols = num_cols_;
- const real_type add = add_;
- const int degree = degree_;
- const real_type gamma = gamma_;
- const real_type coef0 = coef0_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type 
data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + } + + // matrix + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } + } + }); - // matrix - for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { - private_matr(idx)[i][j] = real_type{ 0.0 }; + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform 
scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } - } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma_ * private_matr(idx)[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; @@ -379,9 +341,7 @@ class hierarchical_device_kernel_radial { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. 
- * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions kernel function. * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -393,133 +353,117 @@ class hierarchical_device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - hierarchical_device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : - queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + hierarchical_device_kernel_radial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
 */
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type num_cols = num_cols_;
- const real_type add = add_;
- const real_type gamma = gamma_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+ // allocate memory for work-item local variables
+ // -> accessible across different 'parallel_for_work_item' invocations
+ ::sycl::private_memory private_matr{ group };
+ ::sycl::private_memory private_data_j{ group };
+ ::sycl::private_memory private_i{ group };
+ ::sycl::private_memory private_j{ group };
+ ::sycl::private_memory private_cond{ group };
+
+ // initialize private variables
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ // indices and diagonal condition
+ private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+ private_cond(idx) = private_i(idx) >= private_j(idx);
+ if (private_cond(idx)) {
+ private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+ }
+
+ // matrix
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ private_matr(idx)[i][j] = real_type{ 0.0 };
+ }
+ }
+ });

- // matrix
- for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
- private_matr(idx)[i][j] = real_type{ 0.0 };
- }
- }
- });

- // implicit group barrier

- // load data from global in shared memory
- for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- if (private_cond(idx)) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
- const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(1) == idx_1) {
- data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
- }
- const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(0) == idx_2) {
- data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
- }
- }
- }
- });

+ // implicit group barrier
+
+ // load data from global in shared memory
+ for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ if 
(private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * 
d[private_j(idx) + x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } - } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma_ * private_matr(idx)[x][y]) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 669b3508a..e218fde52 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -190,33 +190,33 @@ void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, invocation_type_); switch (kernel_) { case kernel_type::linear: - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); - }); - } else { - hierarchical_device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); - } + } else if (invocation_type_ == kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_linear(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); + } + }); break; case kernel_type::polynomial: PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - }); - } else { - hierarchical_device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); - } + } else if (invocation_type_ == 
kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_poly(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + } + }); break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); - }); - } else { - hierarchical_device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); - } + } else if (invocation_type_ == kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_radial(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); + } + }); break; } } From ec0ee3728968aed7fc0cc97d7a68097b0d73c341 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:34:57 +0100 Subject: [PATCH 31/56] Move source file from SYCL to base library. --- CMakeLists.txt | 1 + src/plssvm/backends/SYCL/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 967b5cfe5..44eca562b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_predict.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/plssvm/backends/SYCL/kernel_invocation_type.cpp ) ## create base library: linked against all backend libraries diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b38db8494..cef2b4073 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -45,7 +45,6 @@ set(PLSSVM_SYCL_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp - ${CMAKE_CURRENT_LIST_DIR}/kernel_invocation_type.cpp ${CMAKE_CURRENT_LIST_DIR}/../gpu_csvm.cpp ) From ff7432b45fb14d4b79bc96db42ff716e50d8048e Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:56:57 +0100 Subject: [PATCH 32/56] Fix parameter output test after adding sycl_kernel_invocation_type. 
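

The expected output string now also has to contain the new
"SYCL kernel invocation type" line between the target platform and the input
filename. A condensed, hypothetical form of the check (the actual test compares
the full expected string, see the diff below):

    // sketch only: assumes a default-constructed plssvm::parameter object
    std::ostringstream out;  // requires <sstream>
    out << params;
    EXPECT_TRUE(out.str().find("SYCL kernel invocation type automatic\n") != std::string::npos);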
---
 tests/parameter_test.cpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tests/parameter_test.cpp b/tests/parameter_test.cpp
index e997ca9b9..362f3a328 100644
--- a/tests/parameter_test.cpp
+++ b/tests/parameter_test.cpp
@@ -639,20 +639,21 @@ TYPED_TEST(Parameter, output_operator) {
 
     // correct output string
     std::string correct_output =
-        fmt::format("kernel_type       linear\n"
-                    "degree            3\n"
-                    "gamma             0\n"
-                    "coef0             0\n"
-                    "cost              1\n"
-                    "epsilon           0.001\n"
-                    "print_info        true\n"
-                    "backend           openmp\n"
-                    "target platform   automatic\n"
-                    "input_filename    ''\n"
-                    "model_filename    ''\n"
-                    "predict_filename  ''\n"
-                    "rho               0\n"
-                    "real_type         {}\n",
+        fmt::format("kernel_type                  linear\n"
+                    "degree                       3\n"
+                    "gamma                        0\n"
+                    "coef0                        0\n"
+                    "cost                         1\n"
+                    "epsilon                      0.001\n"
+                    "print_info                   true\n"
+                    "backend                      openmp\n"
+                    "target platform              automatic\n"
+                    "SYCL kernel invocation type  automatic\n"
+                    "input_filename               ''\n"
+                    "model_filename               ''\n"
+                    "predict_filename             ''\n"
+                    "rho                          0\n"
+                    "real_type                    {}\n",
                     plssvm::detail::arithmetic_type_name<TypeParam>());
 
     // check for equality

From 2baa30e7b65322447a99732a90e1a7d17c7984e6 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 11:57:19 +0100
Subject: [PATCH 33/56] Add tests for SYCL nd_range AND hierarchical kernel formulations.

---
 tests/backends/SYCL/test.cpp     | 17 +++++++++++------
 tests/backends/generic_tests.hpp | 16 +++++++++-------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tests/backends/SYCL/test.cpp b/tests/backends/SYCL/test.cpp
index 89d8d9d00..727ff0e12 100644
--- a/tests/backends/SYCL/test.cpp
+++ b/tests/backends/SYCL/test.cpp
@@ -13,9 +13,10 @@
 #include "backends/generic_tests.hpp"  // generic::write_model_test, generic::generate_q_test, generic::device_kernel_test, generic::predict_test, generic::accuracy_test
 #include "utility.hpp"                 // util::google_test::parameter_definition, util::google_test::parameter_definition_to_name
 
-#include "plssvm/backends/SYCL/csvm.hpp"  // plssvm::sycl::csvm
-#include "plssvm/kernel_types.hpp"        // plssvm::kernel_type
-#include "plssvm/parameter.hpp"           // plssvm::parameter
+#include "plssvm/backends/SYCL/csvm.hpp"                    // plssvm::sycl::csvm
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
 
 #include "gtest/gtest.h"  // ::testing::StaticAssertTypeEq, ::testing::Test, ::testing::Types, TYPED_TEST_SUITE, TYPED_TEST
 
@@ -47,9 +48,13 @@ TYPED_TEST(SYCL_CSVM, generate_q) {
     generic::generate_q_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm>();
 }
 
-// check whether the device kernels are correct
-TYPED_TEST(SYCL_CSVM, device_kernel) {
-    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm>();
+// check whether the nd_range device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_nd_range) {
+    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm, plssvm::sycl::kernel_invocation_type::nd_range>();
+}
+// check whether the hierarchical device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_hierarchical) {
+    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm, plssvm::sycl::kernel_invocation_type::hierarchical>();
 }
 
 // check whether the correct labels are predicted
diff --git a/tests/backends/generic_tests.hpp b/tests/backends/generic_tests.hpp
index 3fadd734d..1cdf24b33 100644
--- a/tests/backends/generic_tests.hpp
+++ b/tests/backends/generic_tests.hpp
@@ -15,12 +15,13 @@
 #include "mock_csvm.hpp"  // mock_csvm
 #include "utility.hpp"    // util::gtest_assert_floating_point_near, util::gtest_assert_floating_point_eq, util::gtest_expect_correct_csvm_factory, util::create_temp_file
 
-#include "plssvm/backend_types.hpp"             // plssvm::backend_type
-#include "plssvm/constants.hpp"                 // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
-#include "plssvm/detail/string_conversion.hpp"  // plssvm::detail::convert_to
-#include "plssvm/exceptions/exceptions.hpp"     // plssvm::exception
-#include "plssvm/kernel_types.hpp"              // plssvm::kernel_type
-#include "plssvm/parameter.hpp"                 // plssvm::parameter
+#include "plssvm/backend_types.hpp"                         // plssvm::backend_type
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/constants.hpp"                             // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/string_conversion.hpp"              // plssvm::detail::convert_to
+#include "plssvm/exceptions/exceptions.hpp"                 // plssvm::exception
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
 
 #include "fmt/format.h"   // fmt::format
 #include "fmt/ostream.h"  // can use fmt using operator<< overloads
 
@@ -124,12 +125,13 @@ inline void generate_q_test() {
     }
 }
 
-template