From 31733892c97a0eb790e09fa390a80f1d97a7a87b Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:27:56 +0100
Subject: [PATCH 01/56] Add currently used SYCL implementation (hipSYCL or
 DPC++) to output.

---
 src/plssvm/backends/SYCL/csvm.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 4a8777dec..9f70416fc 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -63,7 +63,12 @@ csvm<T>::csvm(const parameter<T> &params) :
     }
 
     if (print_info_) {
-        fmt::print("Using SYCL as backend.\n");
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+        fmt::print("Using SYCL (hipSYCL) as backend.\n");
+#endif
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+        fmt::print("Using SYCL (DPC++) as backend.\n");
+#endif
     }
 
     // get all available devices wrt the requested target platform
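A minimal, self-contained sketch of the compile-time dispatch pattern used in patch 01, collapsed into a single helper. The PLSSVM_SYCL_BACKEND_COMPILER* macros are normally set by the build system (patch 03's CMake uses 1 for hipSYCL); the fallback values below are placeholders for illustration only, not taken from the project.

#include <string_view>

#ifndef PLSSVM_SYCL_BACKEND_COMPILER
    // placeholder definitions so the sketch compiles standalone; real values come from CMake
    #define PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL 1
    #define PLSSVM_SYCL_BACKEND_COMPILER_DPCPP 2
    #define PLSSVM_SYCL_BACKEND_COMPILER PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
#endif

// resolves to a string literal at compile time; unmatched values fall back to "unknown"
constexpr std::string_view sycl_implementation_name() {
#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
    return "hipSYCL";
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
    return "DPC++";
#else
    return "unknown";
#endif
}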
From 33fb1d261425d3c6f3d646c3b8753465865923d5 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:39:30 +0100
Subject: [PATCH 02/56] Add timing information to predict functionality.

---
 src/plssvm/backends/gpu_csvm.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/gpu_csvm.cpp b/src/plssvm/backends/gpu_csvm.cpp
index 8a50aedf4..79cf2c012 100644
--- a/src/plssvm/backends/gpu_csvm.cpp
+++ b/src/plssvm/backends/gpu_csvm.cpp
@@ -39,6 +39,9 @@ gpu_csvm<T, device_ptr_t, queue_t>::gpu_csvm(const parameter<T> &params) :
 
 template <typename T, typename device_ptr_t, typename queue_t>
 auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<real_type>> &points) -> std::vector<real_type> {
+    // time prediction
+    auto start_time = std::chrono::steady_clock::now();
+
     using namespace plssvm::operators;
 
     PLSSVM_ASSERT(data_ptr_ != nullptr, "No data is provided!");  // exception in constructor
@@ -103,6 +106,11 @@ auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<
         }
     }
 
+    auto end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("Predicted in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+    }
+
     return out;
 }
 
@@ -380,4 +388,4 @@ template class gpu_csvm<float, ::plssvm::sycl::detail::device_ptr<float>, ::sycl
 template class gpu_csvm<double, ::plssvm::sycl::detail::device_ptr<double>, ::sycl::queue>;
 #endif
 
-}  // namespace plssvm::detail
\ No newline at end of file
+}  // namespace plssvm::detail
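The timing added in patch 02 follows a simple, reusable pattern; here is a hedged, standalone sketch of it. 'do_prediction' and 'print_info' are placeholders, not plssvm symbols.

#include <chrono>
#include <cstdio>

void do_prediction() { /* stand-in for the actual prediction work */ }

void timed_predict(const bool print_info) {
    // steady_clock is monotonic and unaffected by system clock adjustments,
    // which makes it the right clock for measuring elapsed time
    const auto start_time = std::chrono::steady_clock::now();
    do_prediction();
    const auto end_time = std::chrono::steady_clock::now();
    if (print_info) {
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
        std::printf("Predicted in %lld ms.\n", static_cast<long long>(elapsed.count()));
    }
}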
From 3a5f7ab3a374b25dd951dba51f8b6dd5c80cd024 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Feb 2022 16:46:34 +0100
Subject: [PATCH 03/56] Add -sycl-std=2020 flags to hipSYCL and DPC++.

---
 src/plssvm/backends/SYCL/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt
index 8d484c644..b65956d58 100644
--- a/src/plssvm/backends/SYCL/CMakeLists.txt
+++ b/src/plssvm/backends/SYCL/CMakeLists.txt
@@ -58,10 +58,10 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL")
     # set backend compiler to hipSYCL (= 1)
     target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1)
     # silence unknown options warnings
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -Wno-unknown-warning-option)
 elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++")
     # enable DPC++ SYCL support
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl)
     target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)
 
     # nvidia targets
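One way to check, per translation unit, that the -sycl-std=2020 flag from patch 03 actually took effect is to inspect the SYCL_LANGUAGE_VERSION macro that SYCL implementations define. The 202000 threshold below is an assumption (SYCL 2020 revisions use values of the form 2020xx, but the exact value is implementation-specific), so treat this as a hedged sketch rather than a portable guarantee.

#include <sycl/sycl.hpp>

#if !defined(SYCL_LANGUAGE_VERSION) || SYCL_LANGUAGE_VERSION < 202000
    #error "expected a SYCL 2020 (or newer) implementation"
#endif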
From fa8ee450c3c9a7cc81e640e41a902b41b9135788 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Fri, 4 Feb 2022 17:41:07 +0100
Subject: [PATCH 04/56] Change predict kernel to use hierarchical SYCL notation
 if hipSYCL is used (faster on the CPU)

---
 .../plssvm/backends/SYCL/predict_kernel.hpp | 28 +++++-----
 src/plssvm/backends/SYCL/csvm.cpp           | 52 +++++++++++++++++--
 2 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp
index 98b666676..d8568facf 100644
--- a/include/plssvm/backends/SYCL/predict_kernel.hpp
+++ b/include/plssvm/backends/SYCL/predict_kernel.hpp
@@ -72,12 +72,15 @@ class device_kernel_w_linear {
  * @brief Predicts the labels for data points using the polynomial kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
+ * @tparam U the type of the `sycl::item`
  */
-template <typename T>
+template <typename T, typename U>
 class device_kernel_predict_poly {
   public:
     /// The type of the data.
     using real_type = T;
+    /// The `sycl::item` type.
+    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function.
@@ -99,12 +102,11 @@ class device_kernel_predict_poly {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-        const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+    void operator()(sycl_item_type idx) const {
+        const kernel_index_type data_point_index = idx.get_global_id(0);
+        const kernel_index_type predict_point_index = idx.get_global_id(1);
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
@@ -140,12 +142,15 @@ class device_kernel_predict_poly {
  * @brief Predicts the labels for data points using the radial basis functions kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
+ * @tparam U the type of the `sycl::item`
  */
-template <typename T>
+template <typename T, typename U>
 class device_kernel_predict_radial {
   public:
     /// The type of the data.
     using real_type = T;
+    /// The `sycl::item` type
+    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function.
@@ -165,12 +170,11 @@ class device_kernel_predict_radial {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-        const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+    void operator()(sycl_item_type idx) const {
+        const kernel_index_type data_point_index = idx.get_global_id(0);
+        const kernel_index_type predict_point_index = idx.get_global_id(1);
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 4a8777dec..c5f7282fb 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -180,16 +180,62 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 }
 
 template <typename T>
-void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) {
+void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
     const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
     switch (kernel_) {
         case kernel_type::linear:
            break;
         case kernel_type::polynomial:
-            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_));
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler& cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const int degree = degree_;
+                    const real_type gamma = gamma_;
+                    const real_type coef0 = coef0_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
+                    });
+                });
+            }
+#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
+#endif
            break;
         case kernel_type::rbf:
-            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_));
+#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler& cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const real_type gamma = gamma_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
+                    });
+                });
+            }
+#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
+            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
+#endif
            break;
     }
 }
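The hipSYCL branch above uses SYCL's hierarchical parallelism. A minimal, self-contained sketch of that submission pattern, with placeholder sizes and a trivial per-element operation (none of the names below are plssvm symbols; 'data' is assumed to be a USM device-accessible allocation of 4*8 by 4*8 floats):

#include <cstddef>
#include <sycl/sycl.hpp>

void hierarchical_double(::sycl::queue &queue, float *data) {
    const ::sycl::range<2> num_groups{ 4, 4 };  // first argument: number of work-groups ...
    const ::sycl::range<2> group_size{ 8, 8 };  // ... second argument: work-items per group
    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(num_groups, group_size, [=](::sycl::group<2> group) {
            // this scope executes once per work-group
            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
                // this scope executes once per work-item; consecutive
                // parallel_for_work_item calls are separated by implicit group barriers
                const std::size_t row = idx.get_global_id(0);
                const std::size_t col = idx.get_global_id(1);
                data[row * (4 * 8) + col] *= 2.0f;  // 4 * 8 == columns in the global range
            });
        });
    });
    queue.wait();
}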
From 727e7002027d30edc9f2918527b0bd1cd1ae1135 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Sun, 6 Feb 2022 18:49:07 +0100
Subject: [PATCH 05/56] Add missing prediction timing output to OpenMP backend
 and add number of predict points to GPU backends.

---
 src/plssvm/backends/OpenMP/csvm.cpp | 8 ++++++++
 src/plssvm/backends/gpu_csvm.cpp    | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp
index e9d1ccaa0..f0fcb961d 100644
--- a/src/plssvm/backends/OpenMP/csvm.cpp
+++ b/src/plssvm/backends/OpenMP/csvm.cpp
@@ -184,6 +184,9 @@ void csvm<T>::update_w() {
 
 template <typename T>
 auto csvm<T>::predict(const std::vector<std::vector<real_type>> &points) -> std::vector<real_type> {
+    // time prediction
+    auto start_time = std::chrono::steady_clock::now();
+
     using namespace plssvm::operators;
 
     PLSSVM_ASSERT(data_ptr_ != nullptr, "No data is provided!");  // exception in constructor
@@ -228,6 +231,11 @@ auto csvm<T>::predict(const std::vector<std::vector<real_type>> &points) -> std:
         }
     }
 
+    auto end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("Predicted {} data points in {}.\n", points.size(), std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+    }
+
     return out;
 }
 
diff --git a/src/plssvm/backends/gpu_csvm.cpp b/src/plssvm/backends/gpu_csvm.cpp
index 79cf2c012..b07a12e89 100644
--- a/src/plssvm/backends/gpu_csvm.cpp
+++ b/src/plssvm/backends/gpu_csvm.cpp
@@ -108,7 +108,7 @@ auto gpu_csvm<T, device_ptr_t, queue_t>::predict(const std::vector<std::vector<
 
     auto end_time = std::chrono::steady_clock::now();
     if (print_info_) {
-        fmt::print("Predicted in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
+        fmt::print("Predicted {} data points in {}.\n", points.size(), std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
     }
 
     return out;
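A small sketch of the logging call used above: {fmt} can format a std::chrono::duration directly (printing it with its unit suffix, e.g. "42ms"), but only when fmt/chrono.h is included. The point count (1024) is a placeholder value.

#include <chrono>

#include "fmt/chrono.h"  // required for formatting std::chrono types; without it, the fmt::print below does not compile
#include "fmt/core.h"

int main() {
    const auto start_time = std::chrono::steady_clock::now();
    const auto end_time = std::chrono::steady_clock::now();
    fmt::print("Predicted {} data points in {}.\n", 1024, std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
}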
From 195d39120940e90a8e483a4188974b991bcbab0c Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Sun, 6 Feb 2022 19:12:27 +0100
Subject: [PATCH 06/56] Add [[maybe_unused]] attribute.

---
 src/plssvm/backends/SYCL/csvm.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index c5f7282fb..97dab6674 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -181,7 +181,8 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+    [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+
     switch (kernel_) {
         case kernel_type::linear:
             break;
From 8bba28abf470af03c61eff2d8740142557149034 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 10:04:19 +0100
Subject: [PATCH 07/56] First try to reformulate SYCL SVM kernel using
 hierarchical kernels.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 190 +++++++++++++++++++-
 src/plssvm/backends/SYCL/csvm.cpp           |   9 +-
 2 files changed, 191 insertions(+), 8 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index a0b1a669b..9ea3546aa 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include "plssvm/detail/execution_range.hpp"
 #include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
 #include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
 
@@ -176,15 +177,192 @@ class device_kernel_poly {
      * @param[in] gamma the gamma parameter used in the polynomial kernel function
      * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
      */
-    device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
+    device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
 
     /**
      * @brief Function call operator overload performing the actual calculation.
      * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
      * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
+    void operator()() const {
+
+        queue_.submit([&](::sycl::handler& cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const int degree = degree_;
+            const real_type gamma = gamma_;
+            const real_type coef0 = coef0_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                //const std::size_t gi = group.get_group_id(0);
+                //const std::size_t gj = group.get_group_id(1);
+
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                        if (i >= j) {
+                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
+                                }
+                            }
+                        }
+                    });
+
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                        if (i >= j) {
+                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                                }
+                            }
+                        }
+                    });
+
+                }
+
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE;
+
+                    if (i >= j) {
+                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
+                                if (i + x > j + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
+                                    ret_jx += temp * d[i + y];
+                                } else if (i + x == j + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[i + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
+                        }
+                    }
+                });
+                /*
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+
+                    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
+                    real_type data_j[INTERNAL_BLOCK_SIZE];
+
+                    if (i >= j) {
+                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+
+                        // cache data
+                        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                            ::sycl::group_barrier(group);
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
+                                }
+                            }
+//                            ::sycl::group_barrier(group);
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    matr[k][l] += data_i * data_j[k];
+                                }
+                            }
+                        }
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
+                                if (i + x > j + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
+                                    ret_jx += temp * d[i + y];
+                                } else if (i + x == j + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[i + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
+                        }
+                    }
+                });
+                */
+            });
+        });
+        /*
         kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
         kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
         real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
         real_type data_j[INTERNAL_BLOCK_SIZE];
 
         if (i >= j) {
             i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
             j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
 
             // cache data
             for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
                 ::sycl::group_barrier(nd_idx.get_group());
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                     const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
                     if (nd_idx.get_local_id(1) == idx) {
                         data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
                     }
                     const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
                     if (nd_idx.get_local_id(0) == idx_2) {
                         data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
                     }
                 }
                 ::sycl::group_barrier(nd_idx.get_group());
 
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                     data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
                 }
 
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                     const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
                     #pragma unroll INTERNAL_BLOCK_SIZE
                     for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                         matr[k][l] += data_i * data_j[k];
                     }
                 }
             }
 
             #pragma unroll INTERNAL_BLOCK_SIZE
             for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
                 real_type ret_jx = 0.0;
                 #pragma unroll INTERNAL_BLOCK_SIZE
                 for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                     const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
                     if (i + x > j + y) {
                         // upper triangular matrix
                         atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
                         ret_jx += temp * d_[i + y];
                     } else if (i + x == j + y) {
                         // diagonal
                         ret_jx += (temp + cost_ * add_) * d_[i + y];
                     }
                 }
                 atomic_op<real_type>{ ret_[j + x] } += ret_jx;
             }
         }
+        */
     }
 
   private:
-    local_accessor<real_type> data_intern_i_;
-    local_accessor<real_type> data_intern_j_;
+    ::sycl::queue& queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;

diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 97dab6674..dea4188e8 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -159,10 +159,13 @@ void csvm<T>::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e
             });
             break;
         case kernel_type::polynomial:
+        {
             PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
-            });
+            device_kernel_poly<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)();
+            //devices_[device].submit([&](::sycl::handler &cgh) {
+            //    cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
+            //});
+        }
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
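Patch 07 introduces sycl::private_memory, which gives each logical work-item a value that survives across the implicit barriers between consecutive parallel_for_work_item scopes; a plain local variable inside one lambda would not. A minimal, self-contained sketch ('out' is assumed to be a USM allocation with 8 ints; the names are placeholders):

#include <sycl/sycl.hpp>

void private_memory_demo(::sycl::queue &queue, int *out) {
    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(::sycl::range<1>{ 2 }, ::sycl::range<1>{ 4 }, [=](::sycl::group<1> group) {
            // one instance per logical work-item, alive for the whole work-group scope
            ::sycl::private_memory<int, 1> per_item{ group };
            group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                per_item(idx) = static_cast<int>(idx.get_global_id(0));  // first scope: write
            });
            // implicit group barrier here
            group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                out[idx.get_global_id(0)] = per_item(idx) + 1;  // second scope: the value survived
            });
        });
    });
    queue.wait();
}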
From 025609fda736ae9eb3947e1d521e55956ae1131a Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 10:58:22 +0100
Subject: [PATCH 08/56] Change get_group_id() to operator[] since the former
 currently isn't implemented in DPC++.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index 9ea3546aa..4492c0db9 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -206,9 +206,6 @@ class device_kernel_poly {
                 real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
                 real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-                //const std::size_t gi = group.get_group_id(0);
-                //const std::size_t gj = group.get_group_id(1);
-
                 ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
                 ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
 
@@ -221,8 +218,8 @@ class device_kernel_poly {
                 for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
 
                     group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                         if (i >= j) {
                             i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
                             j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
@@ -243,8 +240,8 @@ class device_kernel_poly {
                     group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                         if (i >= j) {
                             i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
                             j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
@@ -272,8 +269,8 @@ class device_kernel_poly {
                 }
 
                 group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE;
-                    kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
 
                     if (i >= j) {
                         i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
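For reference, both spellings below name the same work-group index; per the commit message above, only the subscript form was accepted by both hipSYCL and DPC++ at the time. A hedged sketch (the function and parameter names are placeholders):

#include <cstddef>
#include <sycl/sycl.hpp>

void group_index_spellings(::sycl::group<2> group, std::size_t *out) {
    const std::size_t gi = group[0];               // subscript spelling, used after this patch
    const std::size_t gj = group.get_group_id(1);  // equivalent SYCL 2020 getter, where implemented
    out[0] = gi;
    out[1] = gj;
}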
From 9b611759a220eee4e742aeb00ff90b256b60cd41 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 7 Feb 2022 14:07:47 +0100
Subject: [PATCH 09/56] Rewrite other SYCL SVM kernel to also use hierarchical
 form.

---
 include/plssvm/backends/SYCL/svm_kernel.hpp | 722 ++++++++++----------
 src/plssvm/backends/SYCL/csvm.cpp           |  96 ++-
 2 files changed, 394 insertions(+), 424 deletions(-)

diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp
index 4492c0db9..1ac1df6b6 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel.hpp
@@ -11,25 +11,17 @@
 
 #pragma once
 
-#include "plssvm/detail/execution_range.hpp"
+#include "plssvm/backends/SYCL/detail/atomics.hpp"    // plssvm::sycl::atomic_op
 #include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-#include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/execution_range.hpp"          // plssvm::detail::execution_range
 
-#include "sycl/sycl.hpp"  // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow,
-                          // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space
+#include "sycl/sycl.hpp"  // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp
 
 #include <cstddef>  // std::size_t
 
 namespace plssvm::sycl {
 
-// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations
-/**
- * @brief Shortcut alias for a SYCL local accessor.
- * @tparam T the type of the accessed values
- */
-template <typename T>
-using local_accessor = ::sycl::accessor<T, 2, ::sycl::access::mode::read_write, ::sycl::access::target::local>;
-
 /**
  * @brief Calculates the C-SVM kernel using the linear kernel function.
  * @details Supports multi-GPU execution.
@@ -43,7 +35,8 @@ class device_kernel_linear {
 
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
     * @param[out] ret the result vector
     * @param[in] d the right-hand side of the equation
     * @param[in] add denotes whether the values are added or subtracted from the result vector
     * @param[in] id the id of the device
     */
-    device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
+    device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
 
     /**
     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += data_i * data_j[k];
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    real_type temp;
-                    if (device_ == 0) {
-                        temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    } else {
-                        temp = matr[x][y] * add_;
-                    }
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        if (device_ == 0) {
-                            ret_jx += (temp + cost_ * add_) * d_[i + y];
-                        } else {
-                            ret_jx += temp * d_[i + y];
-                        }
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type feature_range = feature_range_;
+            const real_type add = add_;
+            const kernel_index_type device = device_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+
+                    // load data from shared in private memory and perform scalar product
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+                }
+
+                // kernel function
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                real_type temp;
+                                if (device == 0) {
+                                    temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                                } else {
+                                    temp = private_matr(idx)[x][y] * add;
+                                }
+                                if (private_i(idx) + x > private_j(idx) + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                    ret_jx += temp * d[private_i(idx) + y];
+                                } else if (private_i(idx) + x == private_j(idx) + y) {
+                                    // diagonal
+                                    if (device == 0) {
+                                        ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                                    } else {
+                                        ret_jx += temp * d[private_i(idx) + y];
+                                    }
+                                }
+                            }
+                            atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                        }
+                    }
+                });
+            });
+        });
+    }
 
   private:
-    local_accessor<real_type> data_intern_i_;
-    local_accessor<real_type> data_intern_j_;
+    ::sycl::queue &queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;
 
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
     * @param[out] ret the result vector
     * @param[in] d the right-hand side of the equation
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
     void operator()() const {
-
-        queue_.submit([&](::sycl::handler& cgh) {
+        queue_.submit([&](::sycl::handler &cgh) {
             const real_type *q = q_;
             real_type *ret = ret_;
             const real_type *d = d_;
             const real_type *data_d = data_d_;
             const real_type QA_cost = QA_cost_;
             const real_type cost = cost_;
             const kernel_index_type num_rows = num_rows_;
             const kernel_index_type num_cols = num_cols_;
             const real_type add = add_;
             const int
 degree = degree_;
             const real_type gamma = gamma_;
             const real_type coef0 = coef0_;
 
             cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
                 // allocate shared memory
                 real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
                 real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
-                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
-
-                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
-                            private_matr(idx)[i][j] = real_type{ 0.0 };
-                        }
-                    }
-                });
-
-                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
-
-                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                        if (i >= j) {
-                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
-                                if (idx.get_local_id(1) == idx_1) {
-                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
-                                }
-                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                                if (idx.get_local_id(0) == idx_2) {
-                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
-                                }
-                            }
-                        }
-                    });
-
-                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                        kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                        kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                        if (i >= j) {
-                            i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                            j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
-                            }
-
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
-                                #pragma unroll INTERNAL_BLOCK_SIZE
-                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                                    private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
-                                }
-                            }
-                        }
-                    });
-
-                }
-
-                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                    kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                    kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                    if (i >= j) {
-                        i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                        j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                            real_type ret_jx = 0.0;
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                                const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
-                                if (i + x > j + y) {
-                                    // upper triangular matrix
-                                    atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
-                                    ret_jx += temp * d[i + y];
-                                } else if (i + x == j + y) {
-                                    // diagonal
-                                    ret_jx += (temp + cost * add) * d[i + y];
-                                }
-                            }
-                            atomic_op<real_type>{ ret[j + x] } += ret_jx;
-                        }
-                    }
-                });
-                /*
-
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - ::sycl::group_barrier(group); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; - } - } -// ::sycl::group_barrier(group); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret[i + y] } += temp * d[j + x]; - ret_jx += temp * d[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost * add) * d[i + y]; - } - } - atomic_op{ ret[j + x] } += ret_jx; - } - } - }); - */ - }); - }); - /* - kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { - ::sycl::group_barrier(nd_idx.get_group()); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(1) == idx) { - data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(0) == idx_2) { - data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; - } - } - ::sycl::group_barrier(nd_idx.get_group()); + }); + + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 
0; vec_index < num_cols * num_rows; vec_index += num_rows) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost_ * add_) * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; + } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } - */ + }); + }); + }); } private: - 
::sycl::queue& queue_; + ::sycl::queue &queue_; ::sycl::range<2> global_range_; ::sycl::range<2> local_range_; @@ -454,7 +380,8 @@ class device_kernel_radial { /** * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -466,79 +393,132 @@ class device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** * @brief Function call operator overload performing the actual calculation. 
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
     */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        ret_jx += (temp + cost_ * add_) * d_[i + y];
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const real_type gamma = gamma_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll
INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost_ * add_) * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; + } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } + }); + }); + }); } private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; + ::sycl::queue &queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type 
*q_; real_type *ret_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index dea4188e8..332a54975 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -151,27 +151,17 @@ void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); switch (kernel_) { case kernel_type::linear: - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); - }); + device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); break; case kernel_type::polynomial: - { PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); - //devices_[device].submit([&](::sycl::handler &cgh) { - // cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - //}); - } break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); - }); + device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); break; } } @@ -185,62 +175,62 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); - + switch (kernel_) { case kernel_type::linear: break; case kernel_type::polynomial: #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - { - ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; - ::sycl::range<2> local_range{ range.block[0], range.block[1] }; - devices_[0].submit([&](::sycl::handler& cgh) { - real_type *out_d_ptr = out_d.get(); - const real_type *data_d_ptr = data_d_[0].get(); - const real_type *data_last_d_ptr = data_last_d_[0].get(); - const real_type *alpha_d_ptr = alpha_d.get(); - const std::size_t num_data_points = num_data_points_; - const real_type *point_d_ptr = point_d.get(); - const std::size_t num_predict_points = p_num_predict_points; - const std::size_t num_features = num_features_; - const int degree = degree_; - const real_type gamma = gamma_; - const real_type coef0 = coef0_; - - cgh.parallel_for_work_group(global_range, 
local_range, [=](::sycl::group<2> group) { - group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); - }); + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler &cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); }); - } + }); + } #elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP devices_[0].parallel_for(execution_range, device_kernel_predict_poly>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_)); #endif - break; + break; case kernel_type::rbf: #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - { - ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; - ::sycl::range<2> local_range{ range.block[0], range.block[1] }; - devices_[0].submit([&](::sycl::handler& cgh) { - real_type *out_d_ptr = out_d.get(); - const real_type *data_d_ptr = data_d_[0].get(); - const real_type *data_last_d_ptr = data_last_d_[0].get(); - const real_type *alpha_d_ptr = alpha_d.get(); - const std::size_t num_data_points = num_data_points_; - const real_type *point_d_ptr = point_d.get(); - const std::size_t num_predict_points = p_num_predict_points; - const std::size_t num_features = num_features_; - const real_type gamma = gamma_; - - cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { - group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); - }); + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler &cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const real_type gamma = gamma_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); }); - } + }); + } #elif PLSSVM_SYCL_BACKEND_COMPILER
== PLSSVM_SYCL_BACKEND_COMPILER_DPCPP devices_[0].parallel_for(execution_range, device_kernel_predict_radial>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_)); #endif - break; + break; } } From a5c7600996713d335f5a2e433a7aee43c56b1b82 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 16:27:48 +0100 Subject: [PATCH 10/56] Add explicit CUDA target arch to DPC++ compile flags to prevent runtime jitting. --- src/plssvm/backends/SYCL/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b65956d58..cef2b4073 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -68,6 +68,10 @@ elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++") if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) + foreach(PLSSVM_NVIDIA_TARGET_ARCH_NAME ${PLSSVM_NVIDIA_TARGET_ARCHS}) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) + endforeach() endif() # amd targets if(DEFINED PLSSVM_AMD_TARGET_ARCHS) From 1ab61ff5823ed3c59497c6c88dc63f9583f0198b Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 4 Feb 2022 17:41:07 +0100 Subject: [PATCH 11/56] Change predict kernel to use hierarchical SYCL notation if hipSYCL is used (faster on the CPU) --- .../plssvm/backends/SYCL/predict_kernel.hpp | 28 +++++----- src/plssvm/backends/SYCL/csvm.cpp | 52 +++++++++++++++++-- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp index 98b666676..d8568facf 100644 --- a/include/plssvm/backends/SYCL/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/predict_kernel.hpp @@ -72,12 +72,15 @@ class device_kernel_w_linear { * @brief Predicts the labels for data points using the polynomial kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data points + * @tparam U the type of the `sycl::item` */ -template +template class device_kernel_predict_poly { public: /// The type of the data. using real_type = T; + /// The `sycl::item` type. + using sycl_item_type = U; /** * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function. @@ -99,12 +102,11 @@ class device_kernel_predict_poly { /** * @brief Function call operator overload performing the actual calculation. 
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - const kernel_index_type data_point_index = nd_idx.get_global_id(0); - const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + void operator()(sycl_item_type idx) const { + const kernel_index_type data_point_index = idx.get_global_id(0); + const kernel_index_type predict_point_index = idx.get_global_id(1); real_type temp = 0; if (predict_point_index < num_predict_points_) { @@ -140,12 +142,15 @@ class device_kernel_predict_poly { * @brief Predicts the labels for data points using the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data points + * @tparam U the type of the `sycl::item` */ -template +template class device_kernel_predict_radial { public: /// The type of the data. using real_type = T; + /// The `sycl::item` type + using sycl_item_type = U; /** * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function. @@ -165,12 +170,11 @@ class device_kernel_predict_radial { /** * @brief Function call operator overload performing the actual calculation. 
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - const kernel_index_type data_point_index = nd_idx.get_global_id(0); - const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + void operator()(sycl_item_type idx) const { + const kernel_index_type data_point_index = idx.get_global_id(0); + const kernel_index_type predict_point_index = idx.get_global_id(1); real_type temp = 0; if (predict_point_index < num_predict_points_) { diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 9f70416fc..106e70817 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -185,16 +185,62 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe } template -void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) { +void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); switch (kernel_) { case kernel_type::linear: break; case kernel_type::polynomial: - devices_[0].parallel_for(execution_range, device_kernel_predict_poly(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_)); +#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler& cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_poly>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0)); + }); + }); + } +#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP + devices_[0].parallel_for(execution_range, device_kernel_predict_poly>(out_d.get(), data_d_[0].get(), 
data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_)); +#endif break; case kernel_type::rbf: - devices_[0].parallel_for(execution_range, device_kernel_predict_radial(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_)); +#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL + { + ::sycl::range<2> global_range{ range.grid[0], range.grid[1] }; + ::sycl::range<2> local_range{ range.block[0], range.block[1] }; + devices_[0].submit([&](::sycl::handler& cgh) { + real_type *out_d_ptr = out_d.get(); + const real_type *data_d_ptr = data_d_[0].get(); + const real_type *data_last_d_ptr = data_last_d_[0].get(); + const real_type *alpha_d_ptr = alpha_d.get(); + const std::size_t num_data_points = num_data_points_; + const real_type *point_d_ptr = point_d.get(); + const std::size_t num_predict_points = p_num_predict_points; + const std::size_t num_features = num_features_; + const real_type gamma = gamma_; + + cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) { + group.parallel_for_work_item(device_kernel_predict_radial>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma)); + }); + }); + } +#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP + devices_[0].parallel_for(execution_range, device_kernel_predict_radial>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_)); +#endif break; } } From cd5180e66c7306e86a3ee1f207839152e3f825a4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 6 Feb 2022 19:12:27 +0100 Subject: [PATCH 12/56] Add [[maybe_unused]] attribute. --- src/plssvm/backends/SYCL/csvm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 106e70817..674b1fea6 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -186,7 +186,8 @@ void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) { - const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + switch (kernel_) { case kernel_type::linear: break; From 088afcc85461da939775bd20c03518009941e33f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 10:04:19 +0100 Subject: [PATCH 13/56] First try to reformulate SYCL SVM kernel using hierarchical kernels.
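This first hierarchical formulation replaces the flat `nd_range` kernel with one `parallel_for_work_group` invocation per work-group and several `parallel_for_work_item` scopes inside it; SYCL guarantees an implicit group barrier between two consecutive `parallel_for_work_item` scopes, and `sycl::private_memory` keeps per-work-item state alive across them. A minimal standalone sketch of the pattern (illustration only, not PLSSVM code: `queue`, `data`, and `n` are placeholder names, `data` is assumed to be a USM device allocation, and `n` a multiple of the work-group size):

    #include "sycl/sycl.hpp"

    #include <cstddef>  // std::size_t

    void hierarchical_scale(::sycl::queue &queue, float *data, const std::size_t n) {
        constexpr std::size_t local_size = 64;
        queue.submit([&](::sycl::handler &cgh) {
            // first range: the number of work-groups, second range: the work-items per group
            cgh.parallel_for_work_group(::sycl::range<1>{ n / local_size }, ::sycl::range<1>{ local_size }, [=](::sycl::group<1> group) {
                // per-work-item storage that survives across 'parallel_for_work_item' scopes
                ::sycl::private_memory<float, 1> private_val{ group };

                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    private_val(idx) = data[idx.get_global_id(0)];
                });

                // implicit group barrier

                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    data[idx.get_global_id(0)] = private_val(idx) * 2.0f;
                });
            });
        }).wait();
    }

The polynomial kernel below follows the same structure, keeping its accumulation matrix and the cached data row in `private_memory` so that both survive the scope changes.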
--- include/plssvm/backends/SYCL/svm_kernel.hpp | 190 +++++++++++++++++++- src/plssvm/backends/SYCL/csvm.cpp | 9 +- 2 files changed, 191 insertions(+), 8 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index a0b1a669b..9ea3546aa 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -11,6 +11,7 @@ #pragma once +#include "plssvm/detail/execution_range.hpp" #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE @@ -176,15 +177,192 @@ class device_kernel_poly { * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { + void operator()() const { + + queue_.submit([&](::sycl::handler& cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type num_cols = num_cols_; + const real_type add = add_; + const int degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + //const std::size_t gi = group.get_group_id(0); + //const std::size_t gj = group.get_group_id(1); + + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } + } + }); + + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; + } + } + } + }); + + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); + + } + + group.parallel_for_work_item([&](::sycl::h_item<2> 
idx) { + kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret[i + y] } += temp * d[j + x]; + ret_jx += temp * d[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost * add) * d[i + y]; + } + } + atomic_op{ ret[j + x] } += ret_jx; + } + } + }); + /* + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + ::sycl::group_barrier(group); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; + } + } +// ::sycl::group_barrier(group); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret[i + y] } += temp * d[j + x]; + ret_jx += temp * d[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost * add) * d[i + y]; + } + } + atomic_op{ ret[j + x] } += ret_jx; + } + } + }); + */ + }); + }); + /* kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; @@ -244,11 +422,13 @@ class device_kernel_poly { atomic_op{ ret_[j + x] } += ret_jx; } } + */ } private: - local_accessor data_intern_i_; - local_accessor 
data_intern_j_; + ::sycl::queue& queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type *q_; real_type *ret_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 674b1fea6..7d2786962 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -164,10 +164,13 @@ void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e }); break; case kernel_type::polynomial: + { PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - devices_[device].submit([&](::sycl::handler &cgh) { - cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - }); + device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + //devices_[device].submit([&](::sycl::handler &cgh) { + // cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + //}); + } break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); From 9c4088c8583a6512eef20aa6b8495e51574f3ef7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 10:58:22 +0100 Subject: [PATCH 14/56] Change get_group_id() to operator[] since the former currently isn't implemented in DPC++. --- include/plssvm/backends/SYCL/svm_kernel.hpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index 9ea3546aa..4492c0db9 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -206,9 +206,6 @@ class device_kernel_poly { real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - //const std::size_t gi = group.get_group_id(0); - //const std::size_t gj = group.get_group_id(1); - ::sycl::private_memory private_matr{ group }; ::sycl::private_memory private_data_j{ group }; @@ -224,8 +221,8 @@ class device_kernel_poly { for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; @@ -246,8 +243,8 @@ class device_kernel_poly { }); group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; @@ -272,8 
+269,8 @@ class device_kernel_poly { } group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group.get_group_id(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group.get_group_id(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; if (i >= j) { i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; From 0cf9aa6eac9c185c6c1fa602bc10273d2492d795 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Feb 2022 14:07:47 +0100 Subject: [PATCH 15/56] Rewrite other SYCL SVM kernel to also use hierarchical form.
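The rewritten kernels keep accumulating their partial results through the `atomic_op` helper from the newly included atomics.hpp below. A rough sketch of what such a helper can look like in SYCL 2020 terms (an illustrative assumption; the actual PLSSVM header may differ):

    #include "sycl/sycl.hpp"

    // wraps a reference to a value in global memory and exposes an atomic +=
    template <typename T>
    class atomic_op {
      public:
        explicit atomic_op(T &value) :
            ref_{ value } {}

        // relaxed, device-scoped atomic add on the wrapped value
        T operator+=(const T val) { return ref_.fetch_add(val); }

      private:
        ::sycl::atomic_ref<T, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space> ref_;
    };

With class template argument deduction this supports call sites such as `atomic_op{ ret[j + x] } += ret_jx;` as used in the kernels below.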
--- include/plssvm/backends/SYCL/svm_kernel.hpp | 722 ++++++++++---------- src/plssvm/backends/SYCL/csvm.cpp | 96 ++- 2 files changed, 394 insertions(+), 424 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index 4492c0db9..1ac1df6b6 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -11,25 +11,17 @@ #pragma once -#include "plssvm/detail/execution_range.hpp" +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::atomic_op #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE +#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range -#include "sycl/sycl.hpp" // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow, - // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space +#include "sycl/sycl.hpp" // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp #include // std::size_t namespace plssvm::sycl { -// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations -/** - * @brief Shortcut alias for a SYCL local accessor. - * @tparam T the type of the accessed values - */ -template -using local_accessor = ::sycl::accessor; - /** * @brief Calculates the C-SVM kernel using the linear kernel function. * @details Supports multi-GPU execution. @@ -43,7 +35,8 @@ class device_kernel_linear { /** * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation * @param[in] data_d the one-dimension data matrix * @param[in] QA_cost the bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] num_rows the number of rows in the data matrix * @param[in] feature_range number of features used for the calculation * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] id the id of the device */ - device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} + device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : + queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} /** * @brief Function call operator overload performing the actual calculation.
- * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ - void operator()(::sycl::nd_item<2> nd_idx) const { - kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; - real_type data_j[INTERNAL_BLOCK_SIZE]; - - if (i >= j) { - i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - // cache data - for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) { - ::sycl::group_barrier(nd_idx.get_group()); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(1) == idx) { - data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + void operator()() const { + queue_.submit([&](::sycl::handler &cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type feature_range = feature_range_; + const real_type add = add_; + const kernel_index_type device = device_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (nd_idx.get_local_id(0) == idx_2) { - data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + + // matrix + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } } - } - ::sycl::group_barrier(nd_idx.get_group()); + }); + + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { 
+ #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; + } + } + } + }); + + // implicit group barrier + + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; + } + } + } + }); - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + // implicit group barrier } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - matr[k][l] += data_i * data_j[k]; - } - } - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - real_type temp; - if (device_ == 0) { - temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_; - } else { - temp = matr[x][y] * add_; - } - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret_[i + y] } += temp * d_[j + x]; - ret_jx += temp * d_[i + y]; - } else if (i + x == j + y) { - // diagonal - if (device_ == 0) { - ret_jx += (temp + cost_ * add_) * d_[i + y]; - } else { - ret_jx += temp * d_[i + y]; + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device == 0) { + temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; + } else { + temp = private_matr(idx)[x][y] * add; + } + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; + ret_jx += temp * d[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + if (device == 0) { + ret_jx += (temp + cost * add) * d[private_i(idx) + y]; + } else { + ret_jx += temp * d[private_i(idx) + y]; + } + } + } + atomic_op{ ret[private_j(idx) + x] } += ret_jx; } } - } - atomic_op{ ret_[j + x] } += ret_jx; - } - } + }); + }); + 
}); } private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; + ::sycl::queue &queue_; + ::sycl::range<2> global_range_; + ::sycl::range<2> local_range_; const real_type *q_; real_type *ret_; @@ -163,7 +209,8 @@ class device_kernel_poly { /** * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. - * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued + * @param[in] range the execution range of the kernel * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -182,248 +229,127 @@ class device_kernel_poly { /** * @brief Function call operator overload performing the actual calculation. - * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) - * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()() const { - - queue_.submit([&](::sycl::handler& cgh) { - const real_type *q = q_; - real_type *ret = ret_; - const real_type *d = d_; - const real_type *data_d = data_d_; - const real_type QA_cost = QA_cost_; - const real_type cost = cost_; - const kernel_index_type num_rows = num_rows_; - const kernel_index_type num_cols = num_cols_; - const real_type add = add_; - const int degree = degree_; - const real_type gamma = gamma_; - const real_type coef0 = coef0_; - - cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { - // allocate shared memory - real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; - - ::sycl::private_memory private_matr{ group }; - ::sycl::private_memory private_data_j{ group }; - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { - private_matr(idx)[i][j] = real_type{ 0.0 }; - } - } - }); - - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; + queue_.submit([&](::sycl::handler &cgh) { + const real_type *q = q_; + real_type *ret = ret_; + const real_type *d = d_; + const real_type *data_d = data_d_; + const real_type QA_cost = QA_cost_; + const real_type cost = cost_; + const kernel_index_type num_rows = num_rows_; + const kernel_index_type num_cols = num_cols_; + const real_type add = add_; + const int 
degree = degree_; + const real_type gamma = gamma_; + const real_type coef0 = coef0_; + + cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; - } - } - } - }); - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } - } - }); - - } - - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - kernel_index_type i = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - kernel_index_type j = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; - - if (i >= j) { - i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; - - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[i + y] - q[j + x]) * add; - if (i + x > j + y) { - // upper triangular matrix - atomic_op{ ret[i + y] } += temp * d[j + x]; - ret_jx += temp * d[i + y]; - } else if (i + x == j + y) { - // diagonal - ret_jx += (temp + cost * add) * d[i + y]; + // matrix + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; } } - atomic_op{ ret[j + x] } += ret_jx; - } - } - }); - /* - 
group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
-                kernel_index_type i = gi * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-                kernel_index_type j = gj * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-                real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-                real_type data_j[INTERNAL_BLOCK_SIZE];
-
-                if (i >= j) {
-                    i += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-                    j += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-                    // cache data
-                    for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
-                        ::sycl::group_barrier(group);
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                            const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
-                            if (idx.get_local_id(1) == idx_1) {
-                                data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + i];
-                            }
-                            const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                            if (idx.get_local_id(0) == idx_2) {
-                                data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + j];
-                            }
-                        }
-//                        ::sycl::group_barrier(group);
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                            data_j[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
-                        }
-
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                            const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
-                            #pragma unroll INTERNAL_BLOCK_SIZE
-                            for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                                matr[k][l] += data_i * data_j[k];
-                            }
-                        }
-                    }
-
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                        real_type ret_jx = 0.0;
-                        #pragma unroll INTERNAL_BLOCK_SIZE
-                        for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                            const real_type temp = (::sycl::pow(gamma * matr[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[i + y] - q[j + x]) * add;
-                            if (i + x > j + y) {
-                                // upper triangular matrix
-                                atomic_op<real_type>{ ret[i + y] } += temp * d[j + x];
-                                ret_jx += temp * d[i + y];
-                            } else if (i + x == j + y) {
-                                // diagonal
-                                ret_jx += (temp + cost * add) * d[i + y];
-                            }
-                        }
-                        atomic_op<real_type>{ ret[j + x] } += ret_jx;
-                    }
-                }
-            });
-            */
-        });
-    });
-    /*
-    kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-    kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-    real_type data_j[INTERNAL_BLOCK_SIZE];
-
-    if (i >= j) {
-        i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-        j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-        // cache data
-        for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-            ::sycl::group_barrier(nd_idx.get_group());
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                if (nd_idx.get_local_id(1) == idx) {
-                    data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                }
-                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                if (nd_idx.get_local_id(0) == idx_2) {
-                    data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                }
-            }
-            ::sycl::group_barrier(nd_idx.get_group());
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                    matr[k][l] += data_i * data_j[k];
-                }
-            }
-        }
-
-        #pragma unroll INTERNAL_BLOCK_SIZE
-        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-            real_type ret_jx = 0.0;
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                if (i + x > j + y) {
-                    // upper triangular matrix
-                    atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                    ret_jx += temp * d_[i + y];
-                } else if (i + x == j + y) {
-                    // diagonal
-                    ret_jx += (temp + cost_ * add_) * d_[i + y];
-                }
-            }
-            atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-        }
-    }
-    */
+            });
+
+            // implicit group barrier
+
+            // load data from global in shared memory
+            for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                            const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                            if (idx.get_local_id(1) == idx_1) {
+                                data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                            }
+                            const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                            if (idx.get_local_id(0) == idx_2) {
+                                data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                            }
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from shared in private memory and perform scalar product
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                            private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                        }
+
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                            const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                            }
+                        }
+                    }
+                });
+
+                // implicit group barrier
+            }
+
+            // kernel function
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                        real_type ret_jx = 0.0;
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                            const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast<real_type>(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                            if (private_i(idx) + x > private_j(idx) + y) {
+                                // upper triangular matrix
+                                atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                ret_jx += temp * d[private_i(idx) + y];
+                            } else if (private_i(idx) + x == private_j(idx) + y) {
+                                // diagonal
+                                ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                            }
+                        }
+                        atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                    }
+                }
+            });
+        });
+    });
 }
 
   private:
-    ::sycl::queue& queue_;
+    ::sycl::queue &queue_;
     ::sycl::range<2> global_range_;
     ::sycl::range<2> local_range_;
@@ -454,7 +380,8 @@ class device_kernel_radial {
     /**
      * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued
+     * @param[in] range the execution range of the kernel
      * @param[in] q the `q` vector
      * @param[out] ret the result vector
      * @param[in] d the right-hand side of the equation
     * @param[in] add denotes whether the values are added or subtracted from the result vector
     * @param[in] gamma the gamma parameter used in the rbf kernel function
     */
-    device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
-        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
+    device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
+        queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(::sycl::nd_item<2> nd_idx) const {
-        kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
-        kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
-
-        real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
-        real_type data_j[INTERNAL_BLOCK_SIZE];
-
-        if (i >= j) {
-            i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
-            j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
-
-            // cache data
-            for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
-                ::sycl::group_barrier(nd_idx.get_group());
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
-                    const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(1) == idx) {
-                        data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
-                    }
-                    const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
-                    if (nd_idx.get_local_id(0) == idx_2) {
-                        data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
-                    }
-                }
-                ::sycl::group_barrier(nd_idx.get_group());
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
-                    data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
-                }
-
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
-                    const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
-                    #pragma unroll INTERNAL_BLOCK_SIZE
-                    for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
-                        matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
-                    }
-                }
-            }
-
-            #pragma unroll INTERNAL_BLOCK_SIZE
-            for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
-                real_type ret_jx = 0.0;
-                #pragma unroll INTERNAL_BLOCK_SIZE
-                for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
-                    const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
-                    if (i + x > j + y) {
-                        // upper triangular matrix
-                        atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
-                        ret_jx += temp * d_[i + y];
-                    } else if (i + x == j + y) {
-                        // diagonal
-                        ret_jx += (temp + cost_ * add_) * d_[i + y];
-                    }
-                }
-                atomic_op<real_type>{ ret_[j + x] } += ret_jx;
-            }
-        }
-    }
+    void operator()() const {
+        queue_.submit([&](::sycl::handler &cgh) {
+            const real_type *q = q_;
+            real_type *ret = ret_;
+            const real_type *d = d_;
+            const real_type *data_d = data_d_;
+            const real_type QA_cost = QA_cost_;
+            const real_type cost = cost_;
+            const kernel_index_type num_rows = num_rows_;
+            const kernel_index_type num_cols = num_cols_;
+            const real_type add = add_;
+            const real_type gamma = gamma_;
+
+            cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
+                // allocate shared memory
+                real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+                real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+                // allocate memory for work-item local variables
+                // -> accessible across different 'parallel_for_work_item' invocations
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+                ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+                ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+                ::sycl::private_memory<bool, 2> private_cond{ group };
+
+                // initialize private variables
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    // indices and diagonal condition
+                    private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+                    private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+                    private_cond(idx) = private_i(idx) >= private_j(idx);
+                    if (private_cond(idx)) {
+                        private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                        private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+                    }
+
+                    // matrix
+                    for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                            private_matr(idx)[i][j] = real_type{ 0.0 };
+                        }
+                    }
+                });
+
+                // implicit group barrier
+
+                // load data from global in shared memory
+                for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                                const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(1) == idx_1) {
+                                    data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
+                                }
+                                const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                                if (idx.get_local_id(0) == idx_2) {
+                                    data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+
+                    // load data from shared in private memory and perform scalar product
+                    group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                        if (private_cond(idx)) {
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                                private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                            }
+
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                                const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                                #pragma unroll INTERNAL_BLOCK_SIZE
+                                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                                    private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]);
+                                }
+                            }
+                        }
+                    });
+
+                    // implicit group barrier
+                }
+
+                // kernel function
+                group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                    if (private_cond(idx)) {
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                            real_type ret_jx = 0.0;
+                            #pragma unroll INTERNAL_BLOCK_SIZE
+                            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                                const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add;
+                                if (private_i(idx) + x > private_j(idx) + y) {
+                                    // upper triangular matrix
+                                    atomic_op<real_type>{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x];
+                                    ret_jx += temp * d[private_i(idx) + y];
+                                } else if (private_i(idx) + x == private_j(idx) + y) {
+                                    // diagonal
+                                    ret_jx += (temp + cost * add) * d[private_i(idx) + y];
+                                }
+                            }
+                            atomic_op<real_type>{ ret[private_j(idx) + x] } += ret_jx;
+                        }
+                    }
+                });
+            });
+        });
+    }
 
   private:
-    local_accessor<real_type, 2> data_intern_i_;
-    local_accessor<real_type, 2> data_intern_j_;
+    ::sycl::queue &queue_;
+    ::sycl::range<2> global_range_;
+    ::sycl::range<2> local_range_;
 
     const real_type *q_;
     real_type *ret_;
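The rewritten kernels above use the SYCL 2020 hierarchical parallelism API: one parallel_for_work_group lambda that runs per work-group, several parallel_for_work_item scopes that run per work-item, implicit group barriers between consecutive scopes, and ::sycl::private_memory to keep per-work-item state alive across those scopes. A minimal, self-contained sketch of the pattern (independent of PLSSVM; every name below is invented for illustration) could look like this:

    #include <sycl/sycl.hpp>

    #include <vector>

    int main() {
        sycl::queue queue;
        std::vector<float> data(128, 1.0f);
        {
            sycl::buffer<float, 1> buf{ data.data(), sycl::range<1>{ data.size() } };
            queue.submit([&](sycl::handler &cgh) {
                sycl::accessor acc{ buf, cgh, sycl::read_write };
                // 4 work-groups, each with 32 work-items
                cgh.parallel_for_work_group(sycl::range<1>{ 4 }, sycl::range<1>{ 32 }, [=](sycl::group<1> group) {
                    // one instance per work-item, alive across both scopes below
                    sycl::private_memory<float, 1> priv{ group };

                    group.parallel_for_work_item([&](sycl::h_item<1> item) {
                        priv(item) = acc[item.get_global().get_linear_id()] * 2.0f;
                    });
                    // implicit group barrier between the two scopes
                    group.parallel_for_work_item([&](sycl::h_item<1> item) {
                        acc[item.get_global().get_linear_id()] = priv(item) + 1.0f;
                    });
                });
            });
        } // buffer goes out of scope, waits for the kernel, and copies the result back into 'data'
        return 0;
    }

Since a hierarchical kernel cannot return early for a single logical work-item, the patch instead guards the body of every scope with if (private_cond(idx)), which serves the same purpose as the early exit in the removed nd_range version.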
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 7d2786962..9c7d180df 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -156,27 +156,17 @@ void csvm<T>::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
     switch (kernel_) {
         case kernel_type::linear:
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_linear<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device));
-            });
+            device_kernel_linear<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)();
             break;
         case kernel_type::polynomial:
-        {
             PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
             device_kernel_poly<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)();
-            //devices_[device].submit([&](::sycl::handler &cgh) {
-            //    cgh.parallel_for(execution_range, device_kernel_poly<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
-            //});
-        }
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
-            devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_radial<real_type>(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_));
-            });
+            device_kernel_radial<real_type>(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)();
             break;
     }
 }
@@ -190,62 +180,62 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 
 template <typename T>
 void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
     [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
-    
+
     switch (kernel_) {
         case kernel_type::linear:
             break;
         case kernel_type::polynomial:
 #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-        {
-        ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-        ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-        devices_[0].submit([&](::sycl::handler& cgh) {
-            real_type *out_d_ptr = out_d.get();
-            const real_type *data_d_ptr = data_d_[0].get();
-            const real_type *data_last_d_ptr = data_last_d_[0].get();
-            const real_type *alpha_d_ptr = alpha_d.get();
-            const std::size_t num_data_points = num_data_points_;
-            const real_type *point_d_ptr = point_d.get();
-            const std::size_t num_predict_points = p_num_predict_points;
-            const std::size_t num_features = num_features_;
-            const int degree = degree_;
-            const real_type gamma = gamma_;
-            const real_type coef0 = coef0_;
-
-            cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
-            });
-        });
-        }
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler &cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_d_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const int degree = degree_;
+                    const real_type gamma = gamma_;
+                    const real_type coef0 = coef0_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
+                    });
+                });
+            }
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
             devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
 #endif
-    break;
+            break;
         case kernel_type::rbf:
 #if PLSSVM_SYCL_BACKEND_COPMILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-        {
-        ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-        ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-        devices_[0].submit([&](::sycl::handler& cgh) {
-            real_type *out_d_ptr = out_d.get();
-            const real_type *data_d_ptr = data_t_[0].get();
-            const real_type *data_last_d_ptr = data_last_d_[0].get();
-            const real_type *alpha_d_ptr = alpha_d.get();
-            const std::size_t num_data_points = num_data_points_;
-            const real_type *point_d_ptr = point_d.get();
-            const std::size_t num_predict_points = p_num_predict_points;
-            const std::size_t num_features = num_features_;
-            const real_type gamma = gamma_;
-
-            cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
-            });
-        });
-        }
+            {
+                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
+                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
+                devices_[0].submit([&](::sycl::handler &cgh) {
+                    real_type *out_d_ptr = out_d.get();
+                    const real_type *data_d_ptr = data_t_[0].get();
+                    const real_type *data_last_d_ptr = data_last_d_[0].get();
+                    const real_type *alpha_d_ptr = alpha_d.get();
+                    const std::size_t num_data_points = num_data_points_;
+                    const real_type *point_d_ptr = point_d.get();
+                    const std::size_t num_predict_points = p_num_predict_points;
+                    const std::size_t num_features = num_features_;
+                    const real_type gamma = gamma_;
+
+                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
+                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
+                    });
+                });
+            }
#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
             devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
 #endif
-    break;
+            break;
     }
 }
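Both the polynomial and the radial basis function kernel funnel their concurrent updates to ret through the project's atomic_op wrapper, whose definition is not part of these patches. Assuming it is a thin convenience layer over sycl::atomic_ref (a guess for illustration, not the actual PLSSVM implementation), its essence could be sketched as:

    #include <sycl/sycl.hpp>

    // hypothetical sketch of an atomic_op-like helper for use inside device code
    template <typename T>
    struct atomic_add_sketch {
        T &value;  // global-memory element to update

        // '+=' maps to a relaxed, device-scoped atomic fetch-add
        void operator+=(const T other) const {
            sycl::atomic_ref<T, sycl::memory_order::relaxed,
                             sycl::memory_scope::device,
                             sycl::access::address_space::global_space>{ value } += other;
        }
    };

Whatever the exact definition, the important property is that an element such as ret[private_i(idx) + y] may be written by several work-items at once, so a plain += would be a data race.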
From 5cbcbc198c800bacfed7fcd678f32732893c59b2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Tue, 8 Feb 2022 13:01:38 +0100
Subject: [PATCH 16/56] Fix error in rewritten SYCL predict function.

---
 .../plssvm/backends/SYCL/predict_kernel.hpp | 26 ++++-----
 src/plssvm/backends/SYCL/csvm.cpp           | 56 ++-----------------
 2 files changed, 15 insertions(+), 67 deletions(-)

diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp
index d8568facf..c9fce1a0b 100644
--- a/include/plssvm/backends/SYCL/predict_kernel.hpp
+++ b/include/plssvm/backends/SYCL/predict_kernel.hpp
@@ -72,15 +72,12 @@ class device_kernel_w_linear {
  * @brief Predicts the labels for data points using the polynomial kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
- * @tparam U the type of the `sycl::item`
  */
-template <typename T, typename U>
+template <typename T>
 class device_kernel_predict_poly {
   public:
     /// The type of the data.
     using real_type = T;
-    /// The `sycl::item` type.
-    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function.
@@ -102,11 +99,11 @@ class device_kernel_predict_poly {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(sycl_item_type idx) const {
-        const kernel_index_type data_point_index = idx.get_global_id(0);
-        const kernel_index_type predict_point_index = idx.get_global_id(1);
+    void operator()(::sycl::id<2> idx) const {
+        const kernel_index_type data_point_index = idx[0];
+        const kernel_index_type predict_point_index = idx[1];
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
@@ -142,15 +139,12 @@ class device_kernel_predict_poly {
  * @brief Predicts the labels for data points using the radial basis functions kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam T the type of the data points
- * @tparam U the type of the `sycl::item`
  */
-template <typename T, typename U>
+template <typename T>
 class device_kernel_predict_radial {
   public:
     /// The type of the data.
     using real_type = T;
-    /// The `sycl::item` type
-    using sycl_item_type = U;
 
     /**
      * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function.
@@ -170,11 +164,11 @@ class device_kernel_predict_radial {
 
     /**
      * @brief Function call operator overload performing the actual calculation.
-     * @param[in] idx the [`sycl::h_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#hitem-class) (hipSYCL) or [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) (DPC++) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     * @param[in] idx the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
      */
-    void operator()(sycl_item_type idx) const {
-        const kernel_index_type data_point_index = idx.get_global_id(0);
-        const kernel_index_type predict_point_index = idx.get_global_id(1);
+    void operator()(::sycl::id<2> idx) const {
+        const kernel_index_type data_point_index = idx[0];
+        const kernel_index_type predict_point_index = idx[1];
 
         real_type temp = 0;
         if (predict_point_index < num_predict_points_) {
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 9c7d180df..eddfcf9b9 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -178,64 +178,18 @@ void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::exe
 }
 
 template <typename T>
-void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t p_num_predict_points) {
+void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) {
     [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
 
     switch (kernel_) {
         case kernel_type::linear:
             break;
         case kernel_type::polynomial:
-#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-            {
-                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-                devices_[0].submit([&](::sycl::handler &cgh) {
-                    real_type *out_d_ptr = out_d.get();
-                    const real_type *data_d_ptr = data_d_[0].get();
-                    const real_type *data_last_d_ptr = data_last_d_[0].get();
-                    const real_type *alpha_d_ptr = alpha_d.get();
-                    const std::size_t num_data_points = num_data_points_;
-                    const real_type *point_d_ptr = point_d.get();
-                    const std::size_t num_predict_points = p_num_predict_points;
-                    const std::size_t num_features = num_features_;
-                    const int degree = degree_;
-                    const real_type gamma = gamma_;
-                    const real_type coef0 = coef0_;
-
-                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                        group.parallel_for_work_item(device_kernel_predict_poly<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, degree, gamma, coef0));
-                    });
-                });
-            }
-#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
-            devices_[0].parallel_for(execution_range, device_kernel_predict_poly<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, degree_, gamma_, coef0_));
-#endif
-            break;
+            devices_[0].parallel_for(::sycl::range<2>{ num_data_points_, num_predict_points }, device_kernel_predict_poly<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_));
+            break;
         case kernel_type::rbf:
-#if PLSSVM_SYCL_BACKEND_COPMILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
-            {
-                ::sycl::range<2> global_range{ range.grid[0], range.grid[1] };
-                ::sycl::range<2> local_range{ range.block[0], range.block[1] };
-                devices_[0].submit([&](::sycl::handler &cgh) {
-                    real_type *out_d_ptr = out_d.get();
-                    const real_type *data_d_ptr = data_t_[0].get();
-                    const real_type *data_last_d_ptr = data_last_d_[0].get();
-                    const real_type *alpha_d_ptr = alpha_d.get();
-                    const std::size_t num_data_points = num_data_points_;
-                    const real_type *point_d_ptr = point_d.get();
-                    const std::size_t num_predict_points = p_num_predict_points;
-                    const std::size_t num_features = num_features_;
-                    const real_type gamma = gamma_;
-
-                    cgh.parallel_for_work_group(global_range, local_range, [=](::sycl::group<2> group) {
-                        group.parallel_for_work_item(device_kernel_predict_radial<real_type, ::sycl::h_item<2>>(out_d_ptr, data_d_ptr, data_last_d_ptr, alpha_d_ptr, num_data_points, point_d_ptr, num_predict_points, num_features, gamma));
-                    });
-                });
-            }
-#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP
-            devices_[0].parallel_for(execution_range, device_kernel_predict_radial<real_type, ::sycl::nd_item<2>>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), p_num_predict_points, num_features_, gamma_));
-#endif
-            break;
+            devices_[0].parallel_for(::sycl::range<2>{ num_data_points_, num_predict_points }, device_kernel_predict_radial<real_type>(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_));
+            break;
     }
 }

From 52ab64064fe43847c8dfbdf449fe73379237b7ad Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 17 Feb 2022 09:19:47 +0100
Subject: [PATCH 17/56] Remove erroneous hipSYCL compile flag

---
 src/plssvm/backends/SYCL/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt
index cef2b4073..ebc235008 100644
--- a/src/plssvm/backends/SYCL/CMakeLists.txt
+++ b/src/plssvm/backends/SYCL/CMakeLists.txt
@@ -58,7 +58,7 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL")
     # set backend compiler to hipSYCL (= 1)
     target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1)
     # silence unknown options warnings
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -Wno-unknown-warning-option)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option)
 elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++")
     # enable DPC++ SYCL support
     target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl)

From 26a7d3650cacb710c612fa3047791a71053f317d Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Mon, 28 Feb 2022 10:37:59 +0100
Subject: [PATCH 18/56] Add timing output for OpenCL kernel JIT compilation.
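The diff below applies the same steady-clock timing idiom that the earlier patches added around training and prediction. Stripped of the surrounding OpenCL specifics, the pattern is simply the following (a sketch; do_jit_compilation() is a stand-in name, not a PLSSVM function):

    #include "fmt/chrono.h"  // lets fmt print std::chrono durations directly
    #include "fmt/core.h"

    #include <chrono>

    void do_jit_compilation();  // stand-in for the actual kernel build step

    void timed_jit_compilation() {
        const auto start_time = std::chrono::steady_clock::now();
        do_jit_compilation();
        const auto end_time = std::chrono::steady_clock::now();
        fmt::print("OpenCL kernel JIT compilation done in {}.\n",
                   std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time));
    }

Using std::chrono::steady_clock rather than system_clock matters here: it is monotonic, so the measured duration cannot be distorted by clock adjustments.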
---
 src/plssvm/backends/OpenCL/csvm.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp
index 71381a86e..fe1e27cd8 100644
--- a/src/plssvm/backends/OpenCL/csvm.cpp
+++ b/src/plssvm/backends/OpenCL/csvm.cpp
@@ -21,9 +21,11 @@
 #include "plssvm/parameter.hpp"        // plssvm::parameter
 #include "plssvm/target_platforms.hpp" // plssvm::target_platform
 
+#include "fmt/chrono.h"  // can directly print std::chrono literals
 #include "fmt/core.h"    // fmt::print, fmt::format
 #include "fmt/ostream.h" // can use fmt using operator<< overloads
 
+#include <chrono>    // std::chrono
 #include <exception> // std::terminate
 #include <string>    // std::string
 #include <utility>   // std::pair, std::make_pair, std::move
@@ -91,6 +93,8 @@ csvm<T>::csvm(const parameter<T> &params) :
         fmt::print("\n");
     }
 
+    auto jit_start_time = std::chrono::steady_clock::now();
+
     // get kernel names
     std::pair kernel_names = detail::kernel_type_to_function_name(kernel_);
     // build necessary kernel
@@ -110,6 +114,11 @@
             break;
     }
 
+    auto jit_end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("OpenCL kernel JIT compilation done in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(jit_end_time - jit_start_time));
+    }
+
     // sanity checks for the number of OpenCL kernels
     PLSSVM_ASSERT(devices_.size() == q_kernel_.size(), fmt::format("Number of kernels for the q kernel ({}) must match the number of devices ({})!", q_kernel_.size(), devices_.size()));
     PLSSVM_ASSERT(devices_.size() == svm_kernel_.size(), fmt::format("Number of kernels for the svm kernel ({}) must match the number of devices ({})!", svm_kernel_.size(), devices_.size()));

From de9bcf12db8b6a5bb1e524969c86e2f2f60422ec Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Wed, 2 Mar 2022 09:52:51 +0100
Subject: [PATCH 19/56] Update SYCL DPC++ CMake code to allow AOT compilation
 for CPUs and Intel GPUs. Fix a bug that disallowed multiple targets or
 multiple architectures per target.
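To make the flag assembly below concrete: for a hypothetical configuration such as -DPLSSVM_TARGET_PLATFORMS="cpu:avx512;nvidia:sm_70,sm_80", the CMake code in this patch would end up invoking DPC++ roughly as

    clang++ -fsycl -sycl-std=2020 \
        -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda \
        -Xsycl-target-backend=spir64_x86_64 "-march=avx512" \
        -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=sm_70,sm_80" \
        ...

that is, a single combined -fsycl-targets list plus one -Xsycl-target-backend=<triple> option per target. The bug mentioned in the commit message appears to stem from the previous code emitting a separate -fsycl-targets option and an untargeted -Xsycl-target-backend option per architecture, which cannot express several targets at once.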
--- CMakeLists.txt | 13 ++--- src/plssvm/backends/SYCL/CMakeLists.txt | 67 ++++++++++++++++++++----- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 967b5cfe5..2dfaf064b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,28 +255,29 @@ foreach(PLSSVM_PLATFORM ${PLSSVM_TARGET_PLATFORMS}) if(PLSSVM_PLATFORM MATCHES "^cpu") # parse provided CPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_CPU_TARGET_ARCHS PLSSVM_NUM_CPU_TARGET_ARCHS) - if(NOT PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"cpu\" must not have any architecture specifications!") + if(PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 1) + message(FATAL_ERROR "Target platform \"cpu\" must at most have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_CPU_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^nvidia") # parse provided NVIDIA GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_NVIDIA_TARGET_ARCHS PLSSVM_NUM_NVIDIA_TARGET_ARCHS) if(PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"nvidia\" must at least have one architecture specifications!") + message(FATAL_ERROR "Target platform \"nvidia\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_NVIDIA_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^amd") # parse provided AMD GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_AMD_TARGET_ARCHS PLSSVM_NUM_AMD_TARGET_ARCHS) if(PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"amd\" must at least have one architecture specifications!") + message(FATAL_ERROR "Target platform \"amd\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_AMD_TARGET) elseif(PLSSVM_PLATFORM MATCHES "^intel") + # parse provided Intel GPU architectures parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_INTEL_TARGET_ARCHS PLSSVM_NUM_INTEL_TARGET_ARCHS) - if(NOT PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0) - message(FATAL_ERROR "Target platform \"intel\" must not have any architecture specifications!") + if(PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0) + message(FATAL_ERROR "Target platform \"intel\" must at least have one architecture specification!") endif() target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_INTEL_TARGET) else() diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index ebc235008..2d0253b41 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -8,17 +8,27 @@ message(CHECK_START "Checking for SYCL backend") # reformat PLSSVM_TARGET_PLATFORMS to be usable with HIPSYCL_TARGETS (in case hipSYCL may be available) -set(HIPSYCL_TARGETS ${PLSSVM_TARGET_PLATFORMS} CACHE STRING "" FORCE) +set(HIPSYCL_TARGETS "${PLSSVM_TARGET_PLATFORMS}" CACHE STRING "" FORCE) list(TRANSFORM HIPSYCL_TARGETS REPLACE "cpu" "omp") list(TRANSFORM HIPSYCL_TARGETS REPLACE "nvidia" "cuda") list(TRANSFORM HIPSYCL_TARGETS REPLACE "amd" "hip") list(TRANSFORM HIPSYCL_TARGETS REPLACE "intel" "spirv") +# remove CPU and Intel GPU target architectures since they are not supported when using hipSYCL +if(DEFINED PLSSVM_CPU_TARGET_ARCHS AND PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 0) + string(REPLACE ";" "," PLSSVM_CPU_TARGET_ARCHS_COMMA "${PLSSVM_CPU_TARGET_ARCHS}") + string(REPLACE 
":${PLSSVM_CPU_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() +if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + string(REPLACE ";" "," PLSSVM_INTEL_TARGET_ARCHS_COMMA "${PLSSVM_INTEL_TARGET_ARCHS}") + string(REPLACE ":${PLSSVM_INTEL_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() # check if hipSYCL is used as SYCL compiler find_package(hipSYCL CONFIG) if(hipSYCL_FOUND) set(PLSSVM_SYCL_BACKEND_COMPILER "hipSYCL" CACHE STRING "" FORCE) message(CHECK_PASS "found hipSYCL") + message(STATUS "Setting HIPSYCL_TARGETS to \"${HIPSYCL_TARGETS}\".") else() # if not, check if DPC++ is used instead try_compile(PLSSVM_SYCL_BACKEND_CHECK_FOR_DPCPP_COMPILER @@ -59,29 +69,60 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL") target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1) # silence unknown options warnings target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option) + + # print note that Intel GPU architecture specifications are ignored when using hipSYCL + if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + message(STATUS "Ignoring specified Intel architectures \"${PLSSVM_INTEL_TARGET_ARCHS}\" in favor of SPIR-V when using hipSYCL!") + endif() elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++") # enable DPC++ SYCL support target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl) target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl) + set(PLSSVM_DPCPP_FSYCL_TARGETS "") + # cpu targets + if(DEFINED PLSSVM_CPU_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_x86_64") + # add target specific flags for AOT + if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + endif() + endif() # nvidia targets if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda) - foreach(PLSSVM_NVIDIA_TARGET_ARCH_NAME ${PLSSVM_NVIDIA_TARGET_ARCHS}) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCH_NAME}) - endforeach() + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "nvptx64-nvidia-cuda") + # add target specific flags for AOT + list(JOIN PLSSVM_NVIDIA_TARGET_ARCHS "," PLSSVM_NVIDIA_TARGET_ARCHS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}") endif() # amd targets if(DEFINED PLSSVM_AMD_TARGET_ARCHS) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa) - foreach(PLSSVM_AMD_TARGET_ARCH_NAME 
${PLSSVM_AMD_TARGET_ARCHS}) - target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME}) - target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME}) - endforeach() + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "amdgcn-amd-amdhsa") + # add target specific flags for AOT + if(NOT PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 1) + message(FATAL_ERROR "DPC++ currently only supports a single AMD architecture specification but ${PLSSVM_NUM_AMD_TARGET_ARCHS} were provided!") + endif() + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + endif() + # intel targets + if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_gen") + # add target specific flags for AOT + list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") endif() + # set -fsycl-targets + list(JOIN PLSSVM_DPCPP_FSYCL_TARGETS "," PLSSVM_DPCPP_FSYCL_TARGETS_STRING) + target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING}) + target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING}) # set backend compiler to DPC++ (= 0) target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=0) From 99f07f462b67f8168b469d9991a6a950c87c4c28 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 09:53:56 +0100 Subject: [PATCH 20/56] Update summary string to also include CPU architecture if possible and remove architecture where not used (e.g., OpenCL since everything is JIT compiled). 
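As an illustration with hypothetical values: configured with -DPLSSVM_TARGET_PLATFORMS="cpu:avx2;intel:dg1", the summary line assembled below for a DPC++ build changes from

     - SYCL (DPC++): cpu, intel

to

     - SYCL (DPC++): cpu (avx2), intel (dg1)

while backends whose kernels are JIT compiled at runtime (e.g., OpenCL) strip the parenthesized architecture lists from their summary line again.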
--- cmake/assemble_summary_string.cmake | 8 ++++++-- src/plssvm/backends/OpenCL/CMakeLists.txt | 5 +++++ src/plssvm/backends/SYCL/CMakeLists.txt | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmake/assemble_summary_string.cmake b/cmake/assemble_summary_string.cmake index 80e73c24e..d4c31d6c7 100644 --- a/cmake/assemble_summary_string.cmake +++ b/cmake/assemble_summary_string.cmake @@ -8,7 +8,11 @@ function(assemble_summary_string out_var) set(PLSSVM_SUMMARY_STRING_ASSEMBLE "") if(DEFINED PLSSVM_CPU_TARGET_ARCHS) # add cpu platform - string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,") + if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0) + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,") + else() + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu (${PLSSVM_CPU_TARGET_ARCHS}),") + endif() endif() if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) # add nvidia platform @@ -20,7 +24,7 @@ function(assemble_summary_string out_var) endif() if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) # add intel platform - string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel,") + string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel (${PLSSVM_INTEL_TARGET_ARCHS}),") endif() # remove last comma string(REGEX REPLACE ",$" "" PLSSVM_SUMMARY_STRING_ASSEMBLE "${PLSSVM_SUMMARY_STRING_ASSEMBLE}") diff --git a/src/plssvm/backends/OpenCL/CMakeLists.txt b/src/plssvm/backends/OpenCL/CMakeLists.txt index cd28a2913..0273ca652 100644 --- a/src/plssvm/backends/OpenCL/CMakeLists.txt +++ b/src/plssvm/backends/OpenCL/CMakeLists.txt @@ -62,4 +62,9 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE) set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER " - OpenCL:") include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake) assemble_summary_string(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS) +# do not print any special target architecture information +string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_NVIDIA_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_AMD_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}") set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE) \ No newline at end of file diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index 2d0253b41..ce64e03bd 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -155,5 +155,10 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE) set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER " - SYCL (${PLSSVM_SYCL_BACKEND_COMPILER}):") include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake) assemble_summary_string(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS) +# do not print CPU and Intel GPU target architectures when using hipSYCL +if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL") + string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}") + string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS 
"${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}") +endif() set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE) From f76f6bcdd36b6e367ad6b79530edade7db7de39b Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 10:16:34 +0100 Subject: [PATCH 21/56] Update gpu_name_to_arch script: new name, add more AMD targets, add support for Intel (i)GPUs, output possible -DPLSSVM_TARGET_PLATFORMS string. --- utility_scripts/gpu_name_to_arch.py | 162 ---------- utility_scripts/plssvm_target_platforms.py | 344 +++++++++++++++++++++ 2 files changed, 344 insertions(+), 162 deletions(-) delete mode 100644 utility_scripts/gpu_name_to_arch.py create mode 100644 utility_scripts/plssvm_target_platforms.py diff --git a/utility_scripts/gpu_name_to_arch.py b/utility_scripts/gpu_name_to_arch.py deleted file mode 100644 index 06bc30fc8..000000000 --- a/utility_scripts/gpu_name_to_arch.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -######################################################################################################################## -# Authors: Alexander Van Craen, Marcel Breyer # -# Copyright (C): 2018-today The PLSSVM project - All Rights Reserved # -# License: This file is part of the PLSSVM project which is released under the MIT license. # -# See the LICENSE.md file in the project root for full license information. # -######################################################################################################################## - -import argparse -import re - -# parse command line arguments -parser = argparse.ArgumentParser() -parser.add_argument( - "--name", help="the full name of the GPU (e.g. GeForce RTX 3080)") -args = parser.parse_args() - -if args.name is None: - # for nvidia GPUs - import GPUtil - # for AMD GPUs - import pyamdgpuinfo - - gpu_names = [] - # check for possible NVIDIA GPUs - gpu_names.extend([gpu.name for gpu in GPUtil.getGPUs()]) - # check for possible AMD GPUs - gpu_names.extend([pyamdgpuinfo.get_gpu( - gpu_id).name for gpu_id in range(pyamdgpuinfo.detect_gpus())]) - if not gpu_names: - # error if no GPUs where found - raise RuntimeError("Couldn't find any NVIDIA or AMD GPU(s)!") - else: - print("Found {} GPU(s):".format(len(gpu_names))) -else: - # use provided GPU name - gpu_names = [args.name] - -# mapping of NVIDIA compute capabilities given the GPU name -# only GPUs with a compute capability greater or equal than 6.0 are support -# https://developer.nvidia.com/cuda-gpus -nvidia_compute_capability_mapping = { - # Datacenter Products - "NVIDIA A100": "sm_80", - "NVIDIA A40": "sm_86", - "NVIDIA A30": "sm_80", - "NVIDIA A10": "sm_86", - "NVIDIA A16": "sm_86", - "NVIDIA T4": "sm_75", - "NVIDIA V100": "sm_70", - "Tesla P100": "sm_60", - "Tesla P40": "sm_61", - "Tesla P4": "sm_61", - # NVIDIA Quadro and NVIDIA RTX - "RTX A6000": "sm_86", - "RTX A5000": "sm_86", - "RTX A4000": "sm_86", - "T1000": "sm_75", - "T600": "sm_75", - "T400": "sm_75", - "Quadro RTX 8000": "sm_75", - "Quadro RTX 6000": "sm_75", - "Quadro RTX 5000": "sm_75", - "Quadro RTX 4000": "sm_75", - "Quadro GV100": "sm_70", - "Quadro GP100": "sm_60", - "Quadro P6000": "sm_61", - "Quadro P5000": "sm_61", - "Quadro P4000": "sm_61", - "Quadro P2200": "sm_61", - "Quadro P2000": "sm_61", - "Quadro P1000": "sm_61", - "Quadro P620": "sm_61", - "Quadro P600": "sm_61", - "Quadro P400": "sm_61", - "RTX A3000": "sm_86", - "RTX A2000": "sm_86", - "RTX 5000": "sm_75", - "RTX 
4000": "sm_75", - "RTX 3000": "sm_75", - "T2000": "sm_75", - "T1200": "sm_75", - "T500": "sm_75", - "P620": "sm_61", - "P520": "sm_61", - "Quadro P5200": "sm_61", - "Quadro P4200": "sm_61", - "Quadro P3200": "sm_61", - "Quadro P3000": "sm_61", - "Quadro P500": "sm_61", - # GeForce and TITAN Products - "GeForce RTX 3060 Ti": "sm_86", - "GeForce RTX 3060": "sm_86", - "GeForce RTX 3090": "sm_86", - "GeForce RTX 3080": "sm_86", - "GeForce RTX 3070": "sm_86", - "GeForce GTX 1650 Ti": "sm_75", - "NVIDIA TITAN RTX": "sm_75", - "GeForce RTX 2080 Ti": "sm_75", - "GeForce RTX 2080": "sm_75", - "GeForce RTX 2070": "sm_75", - "GeForce RTX 2060": "sm_75", - "NVIDIA TITAN V": "sm_70", - "NVIDIA TITAN Xp": "sm_61", - "NVIDIA TITAN X": "sm_61", - "GeForce GTX 1080 Ti": "sm_61", - "GeForce GTX 1080": "sm_61", - "GeForce GTX 1070 Ti": "sm_61", - "GeForce GTX 1070": "sm_61", - "GeForce GTX 1060": "sm_61", - "GeForce GTX 1050": "sm_61", - "GeForce RTX 3050 Ti": "sm_86", - "GeForce RTX 3050": "sm_86", - # Jetson Products - "Jetson AGX Xavier": "sm_72", -} - -# mapping of AMD architectures given the GPU name -# https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst#id145 -amd_arch_mapping = { - "Radeon Pro VII": "gfx906", - "Radeon VII": "gfx906", - "Radeon Instinct MI50": "gfx906", - "Radeon Instinct MI6": "gfx906", - "Ryzen 3 2200G": "gfx902", - "Ryzen 5 2400G": "gfx902", - "Radeon Vega Frontier Edition": "gfx900", - "Radeon RX Vega 56": "gfx900", - "Radeon RX Vega 64": "gfx900", - "Radeon RX Vega 64 Liquid Cooled": "gfx900", - "Radeon Instinct MI25": "gfx900", - "Radeon RX 460": "gfx803", - "Radeon RX 470": "gfx803", - "Radeon RX 480": "gfx803", - "Radeon R9 Nano": "gfx803", - "Radeon R9 Fury": "gfx803", - "Radeon R9 FuryX": "gfx803", - "Radeon Pro Duo FirePro S9300x2": "gfx803", - "Radeon Instinct MI8": "gfx803", -} - -# output mapped name -for name in gpu_names: - found_name = False - for key in nvidia_compute_capability_mapping: - if re.search(key, name, re.IGNORECASE): - print(" {}: {}".format(name, nvidia_compute_capability_mapping[key])) - found_name = True - break - - for key in amd_arch_mapping: - name_cleaned = name.replace("AMD", "").strip() - name_cleaned = name_cleaned.replace("(TM) ", "").strip() - if re.search(key, name_cleaned, re.IGNORECASE): - print(" {}: {}".format(name_cleaned, amd_arch_mapping[key])) - found_name = True - break - - if not found_name: - raise RuntimeError("Unrecognized GPU name '{}'".format(name)) diff --git a/utility_scripts/plssvm_target_platforms.py b/utility_scripts/plssvm_target_platforms.py new file mode 100644 index 000000000..3047513ce --- /dev/null +++ b/utility_scripts/plssvm_target_platforms.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +######################################################################################################################## +# Authors: Alexander Van Craen, Marcel Breyer # +# Copyright (C): 2018-today The PLSSVM project - All Rights Reserved # +# License: This file is part of the PLSSVM project which is released under the MIT license. # +# See the LICENSE.md file in the project root for full license information. 
# +######################################################################################################################## + +import argparse +import re + +import cpuinfo # get CPU SIMD information +import GPUtil # get NVIDIA GPU information +import pyamdgpuinfo # get AMD GPU information +import pylspci # get Intel GPU information + +# parse command line arguments +parser = argparse.ArgumentParser() +parser.add_argument("--quiet", help="only output the final PLSSVM_TARGET_PLATFORMS string", action="store_true") +args = parser.parse_args() + + +def cond_print(msg=""): + if not args.quiet: + print(msg) + + +# mapping of NVIDIA compute capabilities given the GPU name +# only GPUs with compute capability greater or equal than 6.0 are support +# https://developer.nvidia.com/cuda-gpus +nvidia_compute_capability_mapping = { + # Datacenter Products + "NVIDIA A100": "sm_80", + "NVIDIA A40": "sm_86", + "NVIDIA A30": "sm_80", + "NVIDIA A10": "sm_86", + "NVIDIA A16": "sm_86", + "NVIDIA T4": "sm_75", + "NVIDIA V100": "sm_70", + "Tesla P100": "sm_60", + "Tesla P40": "sm_61", + "Tesla P4": "sm_61", + # NVIDIA Quadro and NVIDIA RTX + "RTX A6000": "sm_86", + "RTX A5000": "sm_86", + "RTX A4000": "sm_86", + "T1000": "sm_75", + "T600": "sm_75", + "T400": "sm_75", + "Quadro RTX 8000": "sm_75", + "Quadro RTX 6000": "sm_75", + "Quadro RTX 5000": "sm_75", + "Quadro RTX 4000": "sm_75", + "Quadro GV100": "sm_70", + "Quadro GP100": "sm_60", + "Quadro P6000": "sm_61", + "Quadro P5000": "sm_61", + "Quadro P4000": "sm_61", + "Quadro P2200": "sm_61", + "Quadro P2000": "sm_61", + "Quadro P1000": "sm_61", + "Quadro P620": "sm_61", + "Quadro P600": "sm_61", + "Quadro P400": "sm_61", + "RTX A3000": "sm_86", + "RTX A2000": "sm_86", + "RTX 5000": "sm_75", + "RTX 4000": "sm_75", + "RTX 3000": "sm_75", + "T2000": "sm_75", + "T1200": "sm_75", + "T500": "sm_75", + "P620": "sm_61", + "P520": "sm_61", + "Quadro P5200": "sm_61", + "Quadro P4200": "sm_61", + "Quadro P3200": "sm_61", + "Quadro P3000": "sm_61", + "Quadro P500": "sm_61", + # GeForce and TITAN Products + "GeForce RTX 3060 Ti": "sm_86", + "GeForce RTX 3060": "sm_86", + "GeForce RTX 3090": "sm_86", + "GeForce RTX 3080": "sm_86", + "GeForce RTX 3070": "sm_86", + "GeForce GTX 1650 Ti": "sm_75", + "NVIDIA TITAN RTX": "sm_75", + "GeForce RTX 2080 Ti": "sm_75", + "GeForce RTX 2080": "sm_75", + "GeForce RTX 2070": "sm_75", + "GeForce RTX 2060": "sm_75", + "NVIDIA TITAN V": "sm_70", + "NVIDIA TITAN Xp": "sm_61", + "NVIDIA TITAN X": "sm_61", + "GeForce GTX 1080 Ti": "sm_61", + "GeForce GTX 1080": "sm_61", + "GeForce GTX 1070 Ti": "sm_61", + "GeForce GTX 1070": "sm_61", + "GeForce GTX 1060": "sm_61", + "GeForce GTX 1050": "sm_61", + "GeForce RTX 3050 Ti": "sm_86", + "GeForce RTX 3050": "sm_86", + # Jetson Products + "Jetson AGX Xavier": "sm_72", +} + +# mapping of AMD architectures given the GPU name +# https://llvm.org/docs/AMDGPUUsage.html +amd_arch_mapping = { + # AMD Radeon GPUs + "Radeon RX 6700 XT": "gfx1031", + "Radeon RX 6800": "gfx1030", + "Radeon RX 6800 XT": "gfx1030", + "Radeon RX 6900 XT": "gfx1030", + "Radeon RX 5500": "gfx1012", + "Radeon RX 5500 XT": "gfx1012", + "Radeon Pro V520": "gfx1011", + "Radeon RX 5700": "gfx1010", + "Radeon RX 5700 XT": "gfx1010", + "Radeon Pro 5600 XT": "gfx1010", + "Radeon Pro 5600M": "gfx1010", + "Radeon Instinct MI100 Accelerator": "gfx908", + "Radeon Pro VII": "gfx906", + "Radeon VII": "gfx906", + "Radeon Instinct MI50": "gfx906", + "Radeon Instinct MI60": "gfx906", + "Radeon Vega Frontier Edition": "gfx900", + "Radeon RX Vega 56": 
"gfx900", + "Radeon RX Vega 64": "gfx900", + "Radeon RX Vega 64 Liquid": "gfx900", + "Radeon Instinct MI25": "gfx900", + "Radeon RX 460": "gfx803", + "Radeon Instinct MI6": "gfx803", + "Radeon RX 470": "gfx803", + "Radeon RX 480": "gfx803", + "Radeon Instinct MI8": "gfx803", + "Radeon R9 Nano": "gfx803", + "Radeon R9 Fury": "gfx803", + "Radeon R9 FuryX": "gfx803", + "Radeon Pro Duo": "gfx803", + "Radeon R9 285": "gfx802", + "Radeon R9 380": "gfx802", + "Radeon R9 385": "gfx802", + # AMD Ryzen iGPUs + "Ryzen 7 4700G": "gfx90c", + "Ryzen 7 4700GE": "gfx90c", + "Ryzen 5 4600G": "gfx90c", + "Ryzen 5 4600GE": "gfx90c", + "Ryzen 3 4300G": "gfx90c", + "Ryzen 3 4300GE": "gfx90c", + "Ryzen Pro 4000G": "gfx90c", + "Ryzen 7 Pro 4700G": "gfx90c", + "Ryzen 7 Pro 4750GE": "gfx90c", + "Ryzen 5 Pro 4650G": "gfx90c", + "Ryzen 5 Pro 4650GE": "gfx90c", + "Ryzen 3 Pro 4350G": "gfx90c", + "Ryzen 3 Pro 4350GE": "gfx90c", + "Ryzen 3 2200G": "gfx902", + "Ryzen 5 2400G": "gfx902", + # other AMD targets + "FirePro S7150": "gfx805", + "FirePro S7100": "gfx805", + "FirePro W7100": "gfx805", + "Mobile FirePro M7170": "gfx805", + "FirePro S9300x2": "gfx803", + "A6-8500P": "gfx801", + "Pro A6-8500B": "gfx801", + "A8-8600P": "gfx801", + "Pro A8-8600B": "gfx801", + "FX-8800P": "gfx801", + "Pro A12-8800B": "gfx801", + "A10-8700P": "gfx801", + "Pro A10-8700B": "gfx801", + "A10-8780P": "gfx801", + "A10-9600P": "gfx801", + "A12-9700P": "gfx801", + "A12-9730P": "gfx801", + "FX-9800P": "gfx801", + "FX-9830P": "gfx801", + "E2-9010": "gfx801", + "A6-9210": "gfx801", + "A9-9410": "gfx801", +} + +# mapping of Intel architecture names +# https://dgpu-docs.intel.com/devices/hardware-table.html +# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html +intel_arch_mapping = { + # Skylake + "skl": ["192A", "1932", "193B", "193A", "193D", "1923", "1926", "1927", "192B", "192D", "1912", "191B", "1913", + "1915", "1917", "191A", "1916", "1921", "191D", "191E", "1902", "1906", "190B", "190A", "190E"], + # Gemini Lake + "glk": ["3185", "3184"], + # Apollo Lake + "Gen9": ["1A85", "5A85", "0A84", "1A84", "5A84"], + # Kaby Lake + "kbl": ["593B", "5923", "5926", "5927", "5917", "5912", "591B", "5916", "5921", "591A", "591D", "591E", "591C", + "87C0", "5913", "5915", "5902", "5906", "590B", "590A", "5908", "590E"], + # Coffee Lake + "cfl": ["3EA5", "3EA8", "3EA6", "3EA7", "3EA2", "3E90", "3E93", "3E99", "3E9C", "3EA1", "9BA5", "9BA8", "3EA4", + "9B21", "9BA0", "9BA2", "9BA4", "9BAA", "9BAB", "9BAC", "87CA", "3EA3", "9B41", "9BC0", "9BC2", "9BC4", + "9BCA", "9BCB", "9BCC", "3E91", "3E92", "3E98", "3E9B", "9BC5", "9BC8", "3E96", "3E9A", "3E94", "9BC6", + "9BE6", "9BF6", "3EA9", "3EA0"], + # Ice Lake + "icllp": ["8A71", "8A56", "8A58", "8A5B", "8A5D", "8A54", "8A5A", "8A5C", "8A57", "8A59", "8A50", "8A51", "8A52", + "8A53"], + # Tiger Lake + "tgllp": ["9A60", "9A68", "9A70", "9A40", "9A49", "9A78", "9AC0", "9AC9", "9AD9", "9AF8"], + # Xe MAX + "dg1": ["4905"], +} +intel_arch_to_name_mapping = { + "skl": "Skylake with Intel Processor Graphics Gen9", + "kbl": "Kaby Lake with Intel Processor Graphics Gen9", + "cfl": "Coffee Lake with Intel Processor Graphics Gen9", + "glk": "Gemini Lake with Intel Processor Graphics Gen9", + "icllp": "Ice Lake with Intel Processor Graphics Gen11", + "tgllp": "Tiger Lake with Intel Processor Graphics Gen12", + "dg1": "Intel Iris Xe MAX graphics", + "Gen9": "Intel Processor Graphics Gen9", + "Gen11": "Intel Processor 
Graphics Gen11", + "Gen12LP": "Intel Processor Graphics Gen12 (Lower Power)", +} + + +# construct PLSSVM_TARGET_PLATFORMS string +plssvm_target_platforms = "" + +# CPU SIMD information for cpu target +simd_version_support = { + "avx512": False, + "avx2": False, + "avx": False, + "sse4_2": False, +} + +cpu_info = cpuinfo.get_cpu_info() + +for flag in cpu_info["flags"]: + for key in simd_version_support: + if flag == key: + simd_version_support[key] = True + if flag.startswith("avx512"): + simd_version_support["avx512"] = True + +cond_print("{}: {}\n".format(cpu_info["brand_raw"], simd_version_support)) + +newest_simd_version = "" +for key in simd_version_support: + if simd_version_support[key]: + newest_simd_version = key + break +plssvm_target_platforms += "cpu" + ("" if "".__eq__(newest_simd_version) else ":") + newest_simd_version + + +# NVIDIA GPU information +nvidia_gpu_names = [gpu.name for gpu in GPUtil.getGPUs()] +nvidia_num_gpus = len(nvidia_gpu_names) + +if nvidia_num_gpus > 0: + nvidia_gpus = {x: nvidia_gpu_names.count(x) for x in nvidia_gpu_names} + nvidia_gpu_sm = {} + # get NVIDIA SM from GPU name + for name in nvidia_gpus: + found_name = False + for key in nvidia_compute_capability_mapping: + if re.search(key, name, re.IGNORECASE): + nvidia_gpu_sm[name] = nvidia_compute_capability_mapping[key] + found_name = True + break + + if not found_name: + raise RuntimeError("Unrecognized GPU name '{}'".format(name)) + + cond_print("Found {} NVIDIA GPU(s):".format(nvidia_num_gpus)) + for name in nvidia_gpus: + cond_print(" {}x {}: {}".format(nvidia_gpus[name], name, nvidia_gpu_sm[name])) + cond_print() + + plssvm_target_platforms += ";nvidia:" + ",".join({str(sm) for sm in nvidia_gpu_sm.values()}) + + +# AMD GPU information +amd_gpu_names = [pyamdgpuinfo.get_gpu(gpu_id).name for gpu_id in range(pyamdgpuinfo.detect_gpus())] +amd_num_gpus = len(amd_gpu_names) + +if amd_num_gpus > 0: + amd_gpus = {x: amd_gpu_names.count(x) for x in amd_gpu_names} + amd_gpu_arch = {} + # get AMD gfx from GPU name + for name in amd_gpus: + found_name = False + for key in amd_arch_mapping: + name_cleaned = name.replace("AMD", "").strip() + name_cleaned = name_cleaned.replace("(TM) ", "").strip() + if re.search(key, name_cleaned, re.IGNORECASE): + amd_gpu_arch[name] = amd_arch_mapping[key] + found_name = True + break + + if not found_name: + raise RuntimeError("Unrecognized GPU name '{}'".format(name)) + + cond_print("Found {} AMD GPU(s):".format(amd_num_gpus)) + for name in amd_gpus: + cond_print(" {}x {}: {}".format(amd_gpus[name], name, amd_gpu_arch[name])) + cond_print() + + plssvm_target_platforms += ";amd:" + ",".join({str(sm) for sm in amd_gpu_arch.values()}) + + +# Intel GPU information +intel_gpu_names = [] +for device in pylspci.parsers.SimpleParser().run(): + if re.search("VGA", str(device.cls), re.IGNORECASE): + for key in intel_arch_mapping: + if any(re.search(arch, str(device.device), re.IGNORECASE) for arch in intel_arch_mapping[key]): + intel_gpu_names.append(str(device.device)) + break +intel_num_gpus = len(intel_gpu_names) + +if intel_num_gpus > 0: + intel_gpus = {x: intel_gpu_names.count(x) for x in intel_gpu_names} + intel_gpu_arch = {} + for name in intel_gpus: + for key in intel_arch_mapping: + if any(re.search(arch, name, re.IGNORECASE) for arch in intel_arch_mapping[key]): + intel_gpu_arch[name] = key + break + + cond_print("Found {} Intel (i)GPU(s):".format(intel_num_gpus)) + for name in intel_gpus: + cond_print(" {}x {} ({}): {}".format(intel_gpus[name], name, + 
intel_arch_to_name_mapping[intel_gpu_arch[name]],
+                                            intel_gpu_arch[name]))
+    cond_print()
+
+    plssvm_target_platforms += ";intel:" + ",".join({str(sm) for sm in intel_gpu_arch.values()})
+
+
+cond_print("Possible -DPLSSVM_TARGET_PLATFORMS entries:")
+print("\"{}\"".format(plssvm_target_platforms))

From 05086fffc5f3c97426310682a23aff0861fc5161 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Wed, 2 Mar 2022 10:29:59 +0100
Subject: [PATCH 22/56] Update README to reflect changes in SYCL DPC++ target
 platforms handling and in the plssvm_target_platforms.py script.

---
 README.md | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 3f963626d..731844cfa 100644
--- a/README.md
+++ b/README.md
@@ -54,40 +54,50 @@ Building the library can be done using the normal CMake approach:
 The **required** CMake option `PLSSVM_TARGET_PLATFORMS` is used to determine for which targets the backends should be compiled.
 Valid targets are:
- - `cpu`: compile for the CPU; **no** architectural specifications is allowed
- - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g. `nvidia:sm_86,sm_70`
- - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g. `amd:gfx906`
- - `intel`: compile for Intel GPUs; **no** architectural specification is allowed
+ - `cpu`: compile for the CPU; an **optional** architectural specification is allowed but only used when compiling with DPC++, e.g., `cpu:avx2`
+ - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g., `nvidia:sm_86,sm_70`
+ - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g., `amd:gfx906`
+ - `intel`: compile for Intel GPUs; **at least one** architectural specification is necessary, e.g., `intel:skl`

 At least one of the above targets must be present.
-To retrieve the architectural specification, given an NVIDIA or AMD GPU name, a simple Python3 script `utility/gpu_name_to_arch.py` is provided
-(requiring Python3 [`argparse`](https://docs.python.org/3/library/argparse.html) as dependency):
+Note that when using DPC++ only a single architectural specification for `cpu` or `amd` is allowed.
+
+To retrieve the architectural specifications of the current system, a simple Python3 script `utility/plssvm_target_platforms.py` is provided
+(required Python3 dependencies:
+[`argparse`](https://docs.python.org/3/library/argparse.html), [`py-cpuinfo`](https://pypi.org/project/py-cpuinfo/),
+[`GPUtil`](https://pypi.org/project/GPUtil/), [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/), and
+[`pylspci`](https://pypi.org/project/pylspci/))

 ```bash
-> python3 utility/gpu_name_to_arch.py --help
-usage: gpu_name_to_arch.py [-h] [--name NAME]
+> python3 utility/plssvm_target_platforms.py --help
+usage: plssvm_target_platforms.py [-h] [--quiet]

 optional arguments:
-  -h, --help   show this help message and exit
-  --name NAME  the full name of the GPU (e.g.
GeForce RTX 3080) + -h, --help show this help message and exit + --quiet only output the final PLSSVM_TARGET_PLATFORMS string ``` Example invocations: ```bash -> python3 utility_scripts/gpu_name_to_arch.py --name "GeForce RTX 3080" -sm_86 -> python3 utility_scripts/gpu_name_to_arch.py --name "Radeon VII" -gfx906 -``` +> python3 utility_scripts/plssvm_target_platforms.py +Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz: {'avx512': True, 'avx2': True, 'avx': True, 'sse4_2': True} + +Found 1 NVIDIA GPU(s): + 1x NVIDIA GeForce RTX 3080: sm_86 -If no GPU name is provided, the script tries to automatically detect any NVIDIA or AMD GPU -(requires the Python3 dependencies [`GPUtil`](https://pypi.org/project/GPUtil/) and [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/)). +Possible -DPLSSVM_TARGET_PLATFORMS entries: +cpu:avx512;nvidia:sm_86 + +> python3 utility_scripts/plssvm_target_platforms.py --quiet +cpu:avx512;intel:dg1 +``` If the architectural information for the requested GPU could not be retrieved, one option would be to have a look at: - for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) - - for AMD GPUs: [ROCm Documentation](https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst) + - for AMD GPUs: [clang AMDGPU backend usage](https://llvm.org/docs/AMDGPUUsage.html) + - for Intel GPUs and CPUs: [Ahead of Time Compilation](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) and [Intel graphics processor table](https://dgpu-docs.intel.com/devices/hardware-table.html) #### Optional CMake Options From f240e25b4df18d50919d7ddc9ecd9e90e17d4d37 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 10:50:32 +0100 Subject: [PATCH 23/56] Update README: add DPC++ and hipSYCL to SYCL in the beginning, add Python requirements file, fix bug in code sample. --- README.md | 5 +++-- install/python_requirements.txt | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 install/python_requirements.txt diff --git a/README.md b/README.md index 731844cfa..8e9a1c184 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The currently available backends are: - [OpenMP](https://www.openmp.org/) - [CUDA](https://developer.nvidia.com/cuda-zone) - [OpenCL](https://www.khronos.org/opencl/) - - [SYCL](https://www.khronos.org/sycl/) + - [SYCL](https://www.khronos.org/sycl/) (tested implementations are [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL)) ## Getting Started @@ -20,6 +20,7 @@ General dependencies: - [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically build during the CMake configuration if `find_package(GTest)` wasn't successful) - [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed-up file parsing + - multiple Python3 modules used in the utility scripts;
to install all modules use `pip install --user -r install/python_requirements.txt` Additional dependencies for the OpenMP backend: - compiler with OpenMP support @@ -300,7 +301,7 @@ A simple C++ program (`main.cpp`) using this library could look like: #include #include -int main(i) { +int main() { try { // parse SVM parameter from command line plssvm::parameter params; diff --git a/install/python_requirements.txt b/install/python_requirements.txt new file mode 100644 index 000000000..4e86a4e9a --- /dev/null +++ b/install/python_requirements.txt @@ -0,0 +1,11 @@ +### optional and required python packages +argparse +sklearn +py-cpuinfo +GPUtil +pyamdgpuinfo +pylspci +numpy +pandas +arff +matplotlib \ No newline at end of file From 489915cc6b7a46156900a1fd4bd874509682a714 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Mar 2022 11:05:08 +0100 Subject: [PATCH 24/56] Remove leading > in code blocks. --- README.md | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8e9a1c184..d4ed4d1f2 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ Additional dependencies if `PLSSVM_ENABLE_TESTING` and `PLSSVM_GENERATE_TEST_FIL Building the library can be done using the normal CMake approach: ```bash -> git clone git@gitlab-sim.informatik.uni-stuttgart.de:vancraar/Bachelor-Code.git SVM -> cd SVM/SVM -> mkdir build && cd build -> cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] .. -> cmake --build . +git clone git@github.com:SC-SGS/PLSSVM.git +cd PLSSVM +mkdir build && cd build +cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] .. +cmake --build . ``` #### Target Platform Selection @@ -71,7 +71,7 @@ To retrieve the architectural specifications of the current system, a simple Pyt [`pylspci`](https://pypi.org/project/pylspci/)) ```bash -> python3 utility/plssvm_target_platforms.py --help +python3 utility/plssvm_target_platforms.py --help usage: plssvm_target_platforms.py [-h] [--quiet] optional arguments: @@ -82,7 +82,7 @@ optional arguments: Example invocations: ```bash -> python3 utility_scripts/plssvm_target_platforms.py +python3 utility_scripts/plssvm_target_platforms.py Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz: {'avx512': True, 'avx2': True, 'avx': True, 'sse4_2': True} Found 1 NVIDIA GPU(s): @@ -91,7 +91,7 @@ Found 1 NVIDIA GPU(s): Possible -DPLSSVM_TARGET_PLATFORMS entries: cpu:avx512;nvidia:sm_86 -> python3 utility_scripts/plssvm_target_platforms.py --quiet +python3 utility_scripts/plssvm_target_platforms.py --quiet cpu:avx512;intel:dg1 ``` @@ -146,7 +146,7 @@ To use DPC++ as compiler simply set the `CMAKE_CXX_COMPILER` to the respective D To run the tests after building the library (with `PLSSVM_ENABLE_TESTING` set to `ON`) use: ```bash -> ctest +ctest ``` ### Generating test coverage results @@ -155,10 +155,10 @@ To enable the generation of test coverage reports using `locv` the library must Additionally, it's advisable to use smaller test files to shorten the `ctest` step. ```bash -> cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \ - -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \ - -DPLSSVM_TEST_FILE_NUM_FEATURES=50 .. -> cmake --build . -- coverage +cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \ + -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \ + -DPLSSVM_TEST_FILE_NUM_FEATURES=50 .. +cmake --build . -- coverage ``` The resulting `html` coverage report is located in the `coverage` folder in the build directory. 
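For example, the report can then be opened directly in a browser (illustrative only; this assumes `lcov`'s usual `index.html` entry page and a Linux desktop providing `xdg-open`, neither of which is stated in the text above):

```bash
# from the build directory: open the top-level page of the HTML coverage report
xdg-open coverage/index.html
```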
@@ -167,7 +167,7 @@ The resulting `html` coverage report is located in the
 If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON` the documentation can be built using

 ```bash
-> make doc
+make doc
 ```

 The documentation of the current state of the main branch can be found [here](https://sc-sgs.github.io/PLSSVM/).
@@ -176,7 +176,7 @@ The documentation of the current state of the main branch can be found [here](ht
 The library supports the `install` target:

 ```bash
-> cmake --build . -- install
+cmake --build . -- install
 ```

 ## Usage
@@ -193,7 +193,7 @@ In order to use all functionality, the following Python3 modules must be install
 [`mpl_toolkits`](https://pypi.org/project/matplotlib/)

 ```bash
-> python3 utility_scripts/generate_data**.py --help
+python3 utility_scripts/generate_data.py --help
 usage: generate_data.py [-h] --output OUTPUT --format FORMAT [--problem PROBLEM] --samples SAMPLES [--test_samples TEST_SAMPLES] --features FEATURES [--plot]

 optional arguments:
@@ -211,13 +211,13 @@ optional arguments:
 An example invocation generating a data set consisting of blobs with 1000 data points with 200 features each could look like:

 ```bash
-> python3 generate_data.py --ouput data_file --format libsvm --problem blobs --samples 1000 --features 200
+python3 generate_data.py --output data_file --format libsvm --problem blobs --samples 1000 --features 200
 ```

 ### Training

 ```bash
-> ./svm-train --help
+./svm-train --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-train [OPTION...] training_set_file [model_file]

@@ -243,13 +243,13 @@ Usage:
 An example invocation using the CUDA backend could look like:

 ```bash
-> ./svm-train --backend cuda --input /path/to/data_file
+./svm-train --backend cuda --input /path/to/data_file
 ```

 Another example targeting NVIDIA GPUs using the SYCL backend looks like:

 ```bash
-> ./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
+./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
 ```

 The `--target_platform=automatic` flag works for the different backends as follows:
@@ -262,7 +262,7 @@ The `--target_platform=automatic` flags works for the different backends as foll
 ### Predicting

 ```bash
-> ./svm-predict --help
+./svm-predict --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-predict [OPTION...] test_file model_file [output_file]

@@ -279,13 +279,13 @@ Usage:
 An example invocation could look like:

 ```bash
-> ./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
 ```

 Another example targeting NVIDIA GPUs using the SYCL backend looks like:

 ```bash
-> ./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
 ```

 The `--target_platform=automatic` flag works like in the training (`./svm-train`) case.

From 97f28deef4b52a731fe395e225c1e18eb6a15396 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 09:44:27 +0100
Subject: [PATCH 25/56] Add SYCL kernel invocation type to allow switching from
 the nd_range formulation to the hierarchical one via a command line switch
 (SYCL only).
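A minimal sketch of the difference between the two invocation types (illustrative only, not code from this patch; `q` is a `sycl::queue`, `data` is a device-accessible pointer, and the range sizes are placeholders):

```cpp
#include "sycl/sycl.hpp"

// nd_range formulation: the global and local iteration ranges are given explicitly
void add_one_nd_range(::sycl::queue &q, double *data) {
    q.parallel_for(::sycl::nd_range<1>{ ::sycl::range<1>{ 1024 }, ::sycl::range<1>{ 128 } },
                   [=](::sycl::nd_item<1> idx) { data[idx.get_global_id(0)] += 1.0; });
}

// hierarchical formulation: an explicit work-group scope with a nested work-item scope
void add_one_hierarchical(::sycl::queue &q, double *data) {
    q.submit([&](::sycl::handler &cgh) {
        // 8 work-groups of 128 work-items each, i.e., the same 1024 global iterations
        cgh.parallel_for_work_group(::sycl::range<1>{ 8 }, ::sycl::range<1>{ 128 }, [=](::sycl::group<1> g) {
            g.parallel_for_work_item([&](::sycl::h_item<1> item) {
                data[item.get_global_id(0)] += 1.0;
            });
        });
    });
}
```

The hierarchical form does not rely on nd_range work-group barriers, which is why the automatic setting introduced in this patch prefers it for hipSYCL on the CPU.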
--- include/plssvm/backends/SYCL/csvm.hpp | 9 +- .../backends/SYCL/kernel_invocation_type.hpp | 46 +++ ...kernel.hpp => svm_kernel_hierarchical.hpp} | 12 +- .../backends/SYCL/svm_kernel_nd_range.hpp | 378 ++++++++++++++++++ include/plssvm/core.hpp | 2 + include/plssvm/parameter.hpp | 10 +- include/plssvm/parameter_train.hpp | 1 + src/main_predict.cpp | 10 +- src/main_train.cpp | 11 +- ...kernel.cpp => svm_kernel_hierarchical.cpp} | 0 src/plssvm/backends/SYCL/CMakeLists.txt | 1 + src/plssvm/backends/SYCL/csvm.cpp | 100 +++-- .../backends/SYCL/kernel_invocation_type.cpp | 49 +++ src/plssvm/parameter.cpp | 30 +- src/plssvm/parameter_train.cpp | 4 + 15 files changed, 609 insertions(+), 54 deletions(-) create mode 100644 include/plssvm/backends/SYCL/kernel_invocation_type.hpp rename include/plssvm/backends/SYCL/{svm_kernel.hpp => svm_kernel_hierarchical.hpp} (95%) create mode 100644 include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp rename src/plssvm/backends/OpenMP/{svm_kernel.cpp => svm_kernel_hierarchical.cpp} (100%) create mode 100644 src/plssvm/backends/SYCL/kernel_invocation_type.cpp diff --git a/include/plssvm/backends/SYCL/csvm.hpp b/include/plssvm/backends/SYCL/csvm.hpp index 1ecda3705..4d2018fff 100644 --- a/include/plssvm/backends/SYCL/csvm.hpp +++ b/include/plssvm/backends/SYCL/csvm.hpp @@ -11,8 +11,9 @@ #pragma once -#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr -#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm #include "sycl/sycl.hpp" // sycl::queue @@ -105,6 +106,10 @@ class csvm : public ::plssvm::detail::gpu_csvm; diff --git a/include/plssvm/backends/SYCL/kernel_invocation_type.hpp b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp new file mode 100644 index 000000000..1bd291d04 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp @@ -0,0 +1,46 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines all available kernel invoke types when using SYCL. + */ + +#pragma once + +#include // forward declare std::ostream and std::istream + +namespace plssvm::sycl { + +/** + * @brief Enum class for all possible SYCL kernel invocation types. + */ +enum class kernel_invocation_type { + /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ + automatic, + /** Use the [*nd_range* invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */ + nd_range, + /** Use the SYCL specific [hierarchical invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_hierarchical_invoke). */ + hierarchical +}; + +/** + * @brief Output the @p invocation type to the given output-stream @p out. 
+ * @param[in,out] out the output-stream to write the kernel invocation type to
+ * @param[in] invocation the SYCL kernel invocation type
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation);
+
+/**
+ * @brief Use the input-stream @p in to initialize the @p invocation type.
+ * @param[in,out] in input-stream to extract the kernel invocation type from
+ * @param[out] invocation the SYCL kernel invocation type
+ * @return the input-stream
+ */
+std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation);
+
+} // namespace plssvm::sycl
diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
similarity index 95%
rename from include/plssvm/backends/SYCL/svm_kernel.hpp
rename to include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
index 1ac1df6b6..bb6c0a75a 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
@@ -28,7 +28,7 @@ namespace plssvm::sycl {
  * @tparam T the type of the data
  */
 template <typename T>
-class device_kernel_linear {
+class hierarchical_device_kernel_linear {
   public:
     /// The type of the data.
     using real_type = T;
@@ -48,7 +48,7 @@ class device_kernel_linear {
      * @param[in] add denotes whether the values are added or subtracted from the result vector
      * @param[in] id the id of the device
      */
-    device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+    hierarchical_device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
         queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}

     /**
@@ -202,7 +202,7 @@ class device_kernel_linear {
  * @tparam T the type of the data
  */
 template <typename T>
-class device_kernel_poly {
+class hierarchical_device_kernel_poly {
   public:
     /// The type of the data.
using real_type = T; @@ -224,7 +224,7 @@ class device_kernel_poly { * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + hierarchical_device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** @@ -373,7 +373,7 @@ class device_kernel_poly { * @tparam T the type of the data */ template -class device_kernel_radial { +class hierarchical_device_kernel_radial { public: /// The type of the data. using real_type = T; @@ -393,7 +393,7 @@ class device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + hierarchical_device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** diff --git a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp new file mode 100644 index 000000000..7d39c9867 --- /dev/null +++ b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp @@ -0,0 +1,378 @@ +/** +* @file +* @author Alexander Van Craen +* @author Marcel Breyer +* @copyright 2018-today The PLSSVM project - All Rights Reserved +* @license This file is part of the PLSSVM project which is released under the MIT license. +* See the LICENSE.md file in the project root for full license information. +* +* @brief Defines the kernel functions for the C-SVM using the SYCL backend. 
+*/
+
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+#include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+
+#include "sycl/sycl.hpp"  // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow,
+                          // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space
+
+#include <cstddef>  // std::size_t
+
+namespace plssvm::sycl {
+
+// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations
+/**
+* @brief Shortcut alias for a SYCL local accessor.
+* @tparam T the type of the accessed values
+*/
+template <typename T>
+using local_accessor = ::sycl::accessor<T, 2, ::sycl::access::mode::read_write, ::sycl::access::target::local>;
+
+/**
+* @brief Calculates the C-SVM kernel using the linear kernel function.
+* @details Supports multi-GPU execution.
+* @tparam T the type of the data
+*/
+template <typename T>
+class nd_range_device_kernel_linear {
+  public:
+    /// The type of the data.
+    using real_type = T;
+
+    /**
+     * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel.
+     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimensional data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] feature_range number of features used for the calculation on the device @p id
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] id the id of the device
+     */
+    nd_range_device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+        data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+ * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device_ == 0) { + temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + } else { + temp = matr[x][y] * add_; + } + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + if (device_ == 0) { + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } else { + ret_jx += temp * d_[i + y]; + } + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type feature_range_; + const real_type add_; + const kernel_index_type device_; +}; + +/** +* @brief Calculates the C-SVM kernel using the polynomial kernel function. +* @details Currently only single GPU execution is supported. +* @tparam T the type of the data +*/ +template +class nd_range_device_kernel_poly { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. 
+ * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimension data matrix + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ + nd_range_device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const 
real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += data_i * data_j[k]; + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; + const real_type add_; + const int degree_; + const real_type gamma_; + const real_type coef0_; +}; + +/** +* @brief Calculates the C-SVM kernel using the radial basis functions kernel function. +* @details Currently only single GPU execution is supported. +* @tparam T the type of the data +*/ +template +class nd_range_device_kernel_radial { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. + * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimension data matrix + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ + nd_range_device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + + real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; + real_type data_j[INTERNAL_BLOCK_SIZE]; + + if (i >= j) { + i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + + // cache data + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + ::sycl::group_barrier(nd_idx.get_group()); + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(1) == idx) { + data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; + } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (nd_idx.get_local_id(0) == idx_2) { + data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; + } + } + ::sycl::group_barrier(nd_idx.get_group()); + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); + } + } + } + + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; + if (i + x > j + y) { + // upper triangular matrix + atomic_op{ ret_[i + y] } += temp * d_[j + x]; + ret_jx += temp * d_[i + y]; + } else if (i + x == j + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[i + y]; + } + } + atomic_op{ ret_[j + x] } += ret_jx; + } + } + } + + private: + local_accessor data_intern_i_; + local_accessor data_intern_j_; + + const real_type *q_; + real_type *ret_; + const real_type *d_; + const real_type *data_d_; + const real_type QA_cost_; + const real_type cost_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; + const real_type add_; + const real_type gamma_; +}; + +} // namespace plssvm::sycl diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 28e2c5d77..62336def6 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -25,6 +25,8 @@ #include "plssvm/exceptions/exceptions.hpp" #include "plssvm/version/version.hpp" +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" + /// The main namespace containing all public API functions. 
namespace plssvm {} diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index ccc102124..47d9ee323 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -11,9 +11,10 @@ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // forward declare std::ostream #include // std::shared_ptr @@ -191,6 +192,9 @@ class parameter { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD or Intel. target_platform target = target_platform::automatic; + /// The kernel invocation type when using SYCL as backend. + sycl::kernel_invocation_type sycl_kernel_invocation_type = sycl::kernel_invocation_type::automatic; + /// The name of the data/test file to parse. std::string input_filename{}; /// The name of the model file to write the learned support vectors to/to parse the saved model from. diff --git a/include/plssvm/parameter_train.hpp b/include/plssvm/parameter_train.hpp index 223b9c9d6..9bb4903dc 100644 --- a/include/plssvm/parameter_train.hpp +++ b/include/plssvm/parameter_train.hpp @@ -36,6 +36,7 @@ class parameter_train : public parameter { using base_type::kernel; using base_type::print_info; using base_type::target; + using base_type::sycl_kernel_invocation_type; using base_type::input_filename; using base_type::model_filename; diff --git a/src/main_predict.cpp b/src/main_predict.cpp index e83442ad9..ab2a42856 100644 --- a/src/main_predict.cpp +++ b/src/main_predict.cpp @@ -17,7 +17,7 @@ #include // std::chrono #include // std::exception #include // std::ofstream -#include // std::cerr, std::endl +#include // std::cerr, std::clog, std::endl #include // std::vector // perform calculations in single precision if requested @@ -32,6 +32,14 @@ int main(int argc, char *argv[]) { // parse SVM parameter from command line plssvm::parameter_predict params{ argc, argv }; + // warn if kernel invocation type nd_range or hierarchical are explicitly set but SYCL isn't the current backend + if (params.backend != plssvm::backend_type::sycl && params.sycl_kernel_invocation_type != plssvm::sycl::kernel_invocation_type::automatic) { + std::clog << fmt::format( + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}", + params.sycl_kernel_invocation_type) + << std::endl; + } + // output used parameter if (params.print_info) { fmt::print("\n"); diff --git a/src/main_train.cpp b/src/main_train.cpp index a14ae55a1..62e1e2ddf 100644 --- a/src/main_train.cpp +++ b/src/main_train.cpp @@ -10,10 +10,11 @@ #include "plssvm/core.hpp" +#include "fmt/core.h" // std::format #include "fmt/ostream.h" // use operator<< to output enum class #include // std::exception -#include // std::cerr, std::endl +#include // std::cerr, std::clog, std::endl // perform calculations in single precision if requested #ifdef PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION @@ -27,6 +28,14 @@ int main(int argc, char *argv[]) { // parse SVM parameter from command line plssvm::parameter_train params{ argc, argv }; + // warn if kernel invocation type nd_range or 
hierarchical are explicitly set but SYCL isn't the current backend + if (params.backend != plssvm::backend_type::sycl && params.sycl_kernel_invocation_type != plssvm::sycl::kernel_invocation_type::automatic) { + std::clog << fmt::format( + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}", + params.sycl_kernel_invocation_type) + << std::endl; + } + // output used parameter if (params.print_info) { fmt::print("\n"); diff --git a/src/plssvm/backends/OpenMP/svm_kernel.cpp b/src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp similarity index 100% rename from src/plssvm/backends/OpenMP/svm_kernel.cpp rename to src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index cef2b4073..b38db8494 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -45,6 +45,7 @@ set(PLSSVM_SYCL_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp + ${CMAKE_CURRENT_LIST_DIR}/kernel_invocation_type.cpp ${CMAKE_CURRENT_LIST_DIR}/../gpu_csvm.cpp ) diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index eddfcf9b9..5f6dc5890 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -8,20 +8,21 @@ #include "plssvm/backends/SYCL/csvm.hpp" -#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::detail::sycl::device_ptr -#include "plssvm/backends/SYCL/detail/utility.hpp" // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception -#include "plssvm/backends/SYCL/predict_kernel.hpp" // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf -#include "plssvm/backends/SYCL/q_kernel.hpp" // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial -#include "plssvm/backends/SYCL/svm_kernel.hpp" // plssvm::sycl::device_kernel_linear, plssvm::sycl::device_kernel_poly, plssvm::sycl::device_kernel_radial -#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm -#include "plssvm/constants.hpp" // plssvm::kernel_index_type -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::detail::sycl::device_ptr +#include "plssvm/backends/SYCL/detail/utility.hpp" // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception +#include "plssvm/backends/SYCL/predict_kernel.hpp" // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf +#include "plssvm/backends/SYCL/q_kernel.hpp" // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial +#include "plssvm/backends/SYCL/svm_kernel_hierarchical.hpp" // plssvm::sycl::hierarchical_device_kernel_linear, plssvm::sycl::hierarchical_device_kernel_poly, 
plssvm::sycl::hierarchical_device_kernel_radial +#include "plssvm/backends/SYCL/svm_kernel_nd_range.hpp" // plssvm::sycl::nd_range_device_kernel_linear, plssvm::sycl::nd_range_device_kernel_poly, plssvm::sycl::nd_range_device_kernel_radial +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/constants.hpp" // plssvm::kernel_index_type +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/execution_range.hpp" // plssvm::detail::execution_range +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/core.h" // fmt::print, fmt::format #include "fmt/ostream.h" // can use fmt using operator<< overloads @@ -35,7 +36,7 @@ namespace plssvm::sycl { template csvm::csvm(const parameter ¶ms) : - base_type{ params } { + base_type{ params }, invocation_type_{ params.sycl_kernel_invocation_type } { // check whether the requested target platform has been enabled switch (target_) { case target_platform::automatic: @@ -62,13 +63,25 @@ csvm::csvm(const parameter ¶ms) : break; } + // set correct kernel invocation type if "automatic" has been provided + if (invocation_type_ == kernel_invocation_type::automatic) { + // always use nd_range except for hipSYCL on the CPU + // TODO: automatic target_platform + if (target_ == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { + invocation_type_ = kernel_invocation_type::hierarchical; + } else { + invocation_type_ = kernel_invocation_type::nd_range; + } + } + if (print_info_) { #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL - fmt::print("Using SYCL (hipSYCL) as backend.\n"); + constexpr std::string_view sycl_implementation_name = "hipSYCL"; #endif #if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP - fmt::print("Using SYCL (DPC++) as backend.\n"); + constexpr std::string_view sycl_implementation_name = "DPC++"; #endif + fmt::print("Using SYCL ({}) as backend with the kernel invocation type \"{}\".\n", sycl_implementation_name, invocation_type_); } // get all available devices wrt the requested target platform @@ -118,17 +131,31 @@ void csvm::device_synchronize(queue_type &queue) { } template -::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_range &range) { +::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_range &range, const kernel_invocation_type invocation_type) { + PLSSVM_ASSERT(invocation_type != kernel_invocation_type::automatic, "The SYCL kernel invocation type may not be automatic anymore at this point!"); + + // set grid value based on used kernel invocation type + const auto fill_grid = [&](const std::size_t i) { + switch (invocation_type) { + case kernel_invocation_type::nd_range: + return range.grid[i] * range.block[i]; + case kernel_invocation_type::hierarchical: + return range.grid[i]; + case kernel_invocation_type::automatic: + throw backend_exception{ "Can't create native execution range from kernel invocation type automatic!" 
}; + } + }; + if constexpr (I == 1) { - ::sycl::range<1> grid{ range.grid[0] * range.block[0] }; + ::sycl::range<1> grid{ fill_grid(0) }; ::sycl::range<1> block{ range.block[0] }; return ::sycl::nd_range<1>{ grid, block }; } else if constexpr (I == 2) { - ::sycl::range<2> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1] }; + ::sycl::range<2> grid{ fill_grid(0), fill_grid(1) }; ::sycl::range<2> block{ range.block[0], range.block[1] }; return ::sycl::nd_range<2>{ grid, block }; } else if constexpr (I == 3) { - ::sycl::range<3> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1], range.grid[2] * range.block[2] }; + ::sycl::range<3> grid{ fill_grid(0), fill_grid(1), fill_grid(2) }; ::sycl::range<3> block{ range.block[0], range.block[1], range.block[2] }; return ::sycl::nd_range<3>{ grid, block }; } else { @@ -138,7 +165,7 @@ ::sycl::nd_range execution_range_to_native(const ::plssvm::detail::execution_ template void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<1>(range); + const ::sycl::nd_range execution_range = execution_range_to_native<1>(range, kernel_invocation_type::nd_range); switch (kernel_) { case kernel_type::linear: devices_[device].parallel_for(execution_range, device_kernel_q_linear(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_features)); @@ -156,30 +183,49 @@ void csvm::run_q_kernel(const std::size_t device, const ::plssvm::detail::exe template void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) { + const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, invocation_type_); switch (kernel_) { case kernel_type::linear: - device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); + }); + } else { + hierarchical_device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); + } break; case kernel_type::polynomial: PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + }); + } else { + hierarchical_device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); + } break; case kernel_type::rbf: 
PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); + if (invocation_type_ == kernel_invocation_type::nd_range) { + devices_[device].submit([&](::sycl::handler &cgh) { + cgh.parallel_for(execution_range, nd_range_device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); + }); + } else { + hierarchical_device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); + } break; } } template void csvm::run_w_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, const std::size_t num_features) { - const ::sycl::nd_range execution_range = execution_range_to_native<1>(range); + const ::sycl::nd_range execution_range = execution_range_to_native<1>(range, kernel_invocation_type::nd_range); devices_[device].parallel_for(execution_range, device_kernel_w_linear(w_d.get(), data_d_[device].get(), data_last_d_[device].get(), alpha_d.get(), num_data_points_, num_features)); } template void csvm::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) { - [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range); + [[maybe_unused]] const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, kernel_invocation_type::nd_range); switch (kernel_) { case kernel_type::linear: diff --git a/src/plssvm/backends/SYCL/kernel_invocation_type.cpp b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp new file mode 100644 index 000000000..89a8f2348 --- /dev/null +++ b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp @@ -0,0 +1,49 @@ +/** +* @author Alexander Van Craen +* @author Marcel Breyer +* @copyright 2018-today The PLSSVM project - All Rights Reserved +* @license This file is part of the PLSSVM project which is released under the MIT license. +* See the LICENSE.md file in the project root for full license information. 
+*/
+
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"
+
+#include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case
+
+#include <ios> // std::ios::failbit
+#include <istream> // std::istream
+#include <ostream> // std::ostream
+#include <string> // std::string
+
+namespace plssvm::sycl {
+
+std::ostream &operator<<(std::ostream &out, const kernel_invocation_type target) {
+ switch (target) {
+ case kernel_invocation_type::automatic:
+ return out << "automatic";
+ case kernel_invocation_type::nd_range:
+ return out << "nd_range";
+ case kernel_invocation_type::hierarchical:
+ return out << "hierarchical";
+ }
+ return out << "unknown";
+}
+
+std::istream &operator>>(std::istream &in, kernel_invocation_type &target) {
+ std::string str;
+ in >> str;
+ detail::to_lower_case(str);
+
+ if (str == "automatic") {
+ target = kernel_invocation_type::automatic;
+ } else if (str == "nd_range") {
+ target = kernel_invocation_type::nd_range;
+ } else if (str == "hierarchical") {
+ target = kernel_invocation_type::hierarchical;
+ } else {
+ in.setstate(std::ios::failbit);
+ }
+ return in;
+}
+
+} // namespace plssvm::sycl
\ No newline at end of file
diff --git a/src/plssvm/parameter.cpp b/src/plssvm/parameter.cpp
index aa2d5a760..bc1adf4d4 100644
--- a/src/plssvm/parameter.cpp
+++ b/src/plssvm/parameter.cpp
@@ -535,20 +535,21 @@ void parameter::parse_test_file(const std::string &filename) {
template
std::ostream &operator<<(std::ostream &out, const parameter &params) {
return out << fmt::format(
- "kernel_type {}\n"
- "degree {}\n"
- "gamma {}\n"
- "coef0 {}\n"
- "cost {}\n"
- "epsilon {}\n"
- "print_info {}\n"
- "backend {}\n"
- "target platform {}\n"
- "input_filename '{}'\n"
- "model_filename '{}'\n"
- "predict_filename '{}'\n"
- "rho {}\n"
- "real_type {}\n",
+ "kernel_type {}\n"
+ "degree {}\n"
+ "gamma {}\n"
+ "coef0 {}\n"
+ "cost {}\n"
+ "epsilon {}\n"
+ "print_info {}\n"
+ "backend {}\n"
+ "target platform {}\n"
+ "SYCL kernel invocation type {}\n"
+ "input_filename '{}'\n"
+ "model_filename '{}'\n"
+ "predict_filename '{}'\n"
+ "rho {}\n"
+ "real_type {}\n",
params.kernel,
params.degree,
params.gamma,
@@ -558,6 +559,7 @@ std::ostream &operator<<(std::ostream &out, const parameter &params) {
params.print_info,
params.backend,
params.target,
+ params.sycl_kernel_invocation_type,
params.input_filename,
params.model_filename,
params.predict_filename,
diff --git a/src/plssvm/parameter_train.cpp b/src/plssvm/parameter_train.cpp
index 29e716f2c..b4cf3c55c 100644
--- a/src/plssvm/parameter_train.cpp
+++ b/src/plssvm/parameter_train.cpp
@@ -50,6 +50,7 @@ parameter_train::parameter_train(int argc, char **argv) {
("e,epsilon", "set the tolerance of termination criterion", cxxopts::value()->default_value(fmt::format("{}", epsilon)))
("b,backend", "choose the backend: openmp|cuda|opencl|sycl", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", backend))))
("p,target_platform", "choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", target))))
+ ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range|hierarchical", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", sycl_kernel_invocation_type))))
("q,quiet", "quiet mode (no outputs)", cxxopts::value(print_info)->default_value(fmt::format("{}", !print_info)))
("h,help", "print this help message", cxxopts::value())
("input", "", cxxopts::value(), "training_set_file")
@@ -105,6 +106,9 
@@ parameter_train::parameter_train(int argc, char **argv) {
// parse target_platform and cast the value to the respective enum
target = result["target_platform"].as();
+ // parse kernel invocation type when using SYCL as backend
+ sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as();
+
// parse print info
print_info = !print_info;

From 461919a22925e14d89a3782c80e1290bbee47e5b Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 10:13:13 +0100
Subject: [PATCH 26/56] SYCL kernel invocation type now also works with
 automatic target platforms. For OpenCL and SYCL, the actually used target
 platform is now output if only automatic was provided.

---
.../plssvm/backends/OpenCL/detail/utility.hpp | 7 +++---
.../plssvm/backends/SYCL/detail/utility.hpp | 10 ++++----
src/plssvm/backends/OpenCL/csvm.cpp | 13 +++++++----
src/plssvm/backends/OpenCL/detail/utility.cpp | 20 +++++++++-------
src/plssvm/backends/SYCL/csvm.cpp | 16 ++++++++-----
src/plssvm/backends/SYCL/detail/utility.cpp | 23 +++++++++++--------
6 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp
index fad3153dc..fcbdee701 100644
--- a/include/plssvm/backends/OpenCL/detail/utility.hpp
+++ b/include/plssvm/backends/OpenCL/detail/utility.hpp
@@ -46,7 +46,8 @@ namespace plssvm::opencl::detail {
void device_assert(error_code code, std::string_view msg = "");

/**
- * @brief Returns the list devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ * (only interesting if the provided @p target was automatic).
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
* 1. NVIDIA GPUs
* 2. AMD GPUs
* 3. Intel GPUs
* 4. CPUs
*
* @param[in] target the target platform for which the devices must match
- * @return the command queues (`[[nodiscard]]`)
+ * @return the command queues and used target platform (`[[nodiscard]]`)
*/
-[[nodiscard]] std::vector get_command_queues(target_platform target);
+[[nodiscard]] std::pair, target_platform> get_command_queues(target_platform target);

/**
* @brief Wait for the compute device associated with @p queue to finish.
diff --git a/include/plssvm/backends/SYCL/detail/utility.hpp b/include/plssvm/backends/SYCL/detail/utility.hpp
index d6afe8bb1..17f0b60e3 100644
--- a/include/plssvm/backends/SYCL/detail/utility.hpp
+++ b/include/plssvm/backends/SYCL/detail/utility.hpp
@@ -15,12 +15,14 @@

#include "sycl/sycl.hpp" // sycl::queue

-#include <vector> // std::vector
+#include <utility> // std::pair
+#include <vector> // std::vector

namespace plssvm::sycl::detail {

/**
- * @brief Returns the list devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ * (only interesting if the provided @p target was automatic).
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
* 1. NVIDIA GPUs
* 2. AMD GPUs
* 3. Intel GPUs
* 4. 
CPUs * * @param[in] target the target platform for which the devices must match - * @return the devices (`[[nodiscard]]`) + * @return the devices and used target platform (`[[nodiscard]]`) */ -[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target); +[[nodiscard]] std::pair, target_platform> get_device_list(target_platform target); /** * @brief Wait for the compute device associated with @p queue to finish. * @param[in] queue the SYCL queue to synchronize diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 71381a86e..03964ca7f 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -26,6 +26,7 @@ #include // std::terminate #include // std::string +#include // std::tie #include // std::pair, std::make_pair, std::move #include // std::vector @@ -60,14 +61,18 @@ csvm::csvm(const parameter ¶ms) : break; } + // get all available devices wrt the requested target platform + target_platform used_target; + std::tie(devices_, used_target) = detail::get_command_queues(target_); + devices_.resize(std::min(devices_.size(), num_features_)); + if (print_info_) { fmt::print("Using OpenCL as backend.\n"); + if (target_ == target_platform::automatic) { + fmt::print("Using {} as automatic target platform.\n", used_target); + } } - // get all available devices wrt the requested target platform - devices_ = detail::get_command_queues(target_); - devices_.resize(std::min(devices_.size(), num_features_)); - // throw exception if no devices for the requested target could be found if (devices_.empty()) { throw backend_exception{ fmt::format("OpenCL backend selected but no devices for the target {} were found!", target_) }; diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 1ba75d35e..afb4a7d0f 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -45,7 +45,7 @@ void device_assert(const error_code ec, const std::string_view msg) { } } -std::vector get_command_queues_impl(const target_platform target) { +[[nodiscard]] std::vector get_command_queues_impl(const target_platform target) { std::map> platform_devices; // get number of platforms @@ -124,21 +124,25 @@ std::vector get_command_queues_impl(const target_platform target) return command_queues; } -std::vector get_command_queues(const target_platform target) { +std::pair, target_platform> get_command_queues(const target_platform target) { if (target != target_platform::automatic) { - return get_command_queues_impl(target); + return std::make_pair(get_command_queues_impl(target), target); } else { - std::vector target_devices = get_command_queues_impl(target_platform::gpu_nvidia); + target_platform used_target = target_platform::gpu_nvidia; + std::vector target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::gpu_amd); + used_target = target_platform::gpu_amd; + target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::gpu_intel); + used_target = target_platform::gpu_intel; + target_devices = get_command_queues_impl(used_target); if (target_devices.empty()) { - target_devices = get_command_queues_impl(target_platform::cpu); + used_target = target_platform::cpu; + target_devices = get_command_queues_impl(used_target); } } } - return target_devices; + return 
std::make_pair(std::move(target_devices), used_target); } } diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 5f6dc5890..669b3508a 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -30,6 +30,7 @@ #include // std::size_t #include // std::terminate +#include // std::tie #include // std::vector namespace plssvm::sycl { @@ -63,11 +64,15 @@ csvm::csvm(const parameter ¶ms) : break; } + // get all available devices wrt the requested target platform + target_platform used_target; + std::tie(devices_, used_target) = detail::get_device_list(target_); + devices_.resize(std::min(devices_.size(), num_features_)); + // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == kernel_invocation_type::automatic) { // always use nd_range except for hipSYCL on the CPU - // TODO: automatic target_platform - if (target_ == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { + if (used_target == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) { invocation_type_ = kernel_invocation_type::hierarchical; } else { invocation_type_ = kernel_invocation_type::nd_range; @@ -82,12 +87,11 @@ csvm::csvm(const parameter ¶ms) : constexpr std::string_view sycl_implementation_name = "DPC++"; #endif fmt::print("Using SYCL ({}) as backend with the kernel invocation type \"{}\".\n", sycl_implementation_name, invocation_type_); + if (target_ == target_platform::automatic) { + fmt::print("Using {} as automatic target platform.\n", used_target); + } } - // get all available devices wrt the requested target platform - devices_ = detail::get_device_list(target_); - devices_.resize(std::min(devices_.size(), num_features_)); - // throw exception if no devices for the requested target could be found if (devices_.empty()) { throw backend_exception{ fmt::format("SYCL backend selected but no devices for the target {} were found!", target_) }; diff --git a/src/plssvm/backends/SYCL/detail/utility.cpp b/src/plssvm/backends/SYCL/detail/utility.cpp index cac051ef0..d47550582 100644 --- a/src/plssvm/backends/SYCL/detail/utility.cpp +++ b/src/plssvm/backends/SYCL/detail/utility.cpp @@ -14,8 +14,9 @@ #include "sycl/sycl.hpp" // sycl::queue, sycl::platform, sycl::device, sycl::property::queue, sycl::info, sycl::gpu_selector -#include // std::string -#include // std::vector +#include // std::string +#include // std::pair, std::make_pair +#include // std::vector namespace plssvm::sycl::detail { @@ -70,21 +71,25 @@ namespace plssvm::sycl::detail { return target_devices; } -[[nodiscard]] std::vector<::sycl::queue> get_device_list(const target_platform target) { +std::pair, ::plssvm::target_platform> get_device_list(const target_platform target) { if (target != target_platform::automatic) { - return get_device_list_impl(target); + return std::make_pair(get_device_list_impl(target), target); } else { - std::vector<::sycl::queue> target_devices = get_device_list_impl(target_platform::gpu_nvidia); + target_platform used_target = target_platform::gpu_nvidia; + std::vector<::sycl::queue> target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::gpu_amd); + used_target = target_platform::gpu_amd; + target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::gpu_intel); + used_target = 
target_platform::gpu_intel; + target_devices = get_device_list_impl(used_target); if (target_devices.empty()) { - target_devices = get_device_list_impl(target_platform::cpu); + used_target = target_platform::cpu; + target_devices = get_device_list_impl(used_target); } } } - return target_devices; + return std::make_pair(std::move(target_devices), used_target); } } From 5b84ea9c1e4efef6a215db455dfe4edc998c1f19 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:31:33 +0100 Subject: [PATCH 27/56] Fix wrong OpenMP svm kernel file name. --- .../OpenMP/{svm_kernel_hierarchical.cpp => svm_kernel.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/plssvm/backends/OpenMP/{svm_kernel_hierarchical.cpp => svm_kernel.cpp} (100%) diff --git a/src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp b/src/plssvm/backends/OpenMP/svm_kernel.cpp similarity index 100% rename from src/plssvm/backends/OpenMP/svm_kernel_hierarchical.cpp rename to src/plssvm/backends/OpenMP/svm_kernel.cpp From dd704cd81b889d47765ab0e02d90b20fecdd097d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:33:32 +0100 Subject: [PATCH 28/56] Update documentation. --- .../plssvm/backends/SYCL/svm_kernel_nd_range.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp index 7d39c9867..634e1b9bb 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp @@ -6,11 +6,12 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * -* @brief Defines the kernel functions for the C-SVM using the SYCL backend. +* @brief Defines the kernel functions for the C-SVM in the nd_range formulation using the SYCL backend. */ #pragma once +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::atomic_op #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE @@ -30,7 +31,7 @@ template using local_accessor = ::sycl::accessor; /** -* @brief Calculates the C-SVM kernel using the linear kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the linear kernel function. * @details Supports multi-GPU execution. * @tparam T the type of the data */ @@ -41,7 +42,7 @@ class nd_range_device_kernel_linear { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector @@ -150,7 +151,7 @@ class nd_range_device_kernel_linear { }; /** -* @brief Calculates the C-SVM kernel using the polynomial kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the polynomial kernel function. * @details Currently only single GPU execution is supported. 
* @tparam T the type of the data */ @@ -161,7 +162,7 @@ class nd_range_device_kernel_poly { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector @@ -265,7 +266,7 @@ class nd_range_device_kernel_poly { }; /** -* @brief Calculates the C-SVM kernel using the radial basis functions kernel function. +* @brief Calculates the C-SVM kernel using the nd_range formulation and the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ @@ -276,7 +277,7 @@ class nd_range_device_kernel_radial { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. + * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector * @param[out] ret the result vector From 64c0a99a141aacffb5b9333e0e998c493fa60730 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 10:34:58 +0100 Subject: [PATCH 29/56] Update documentation. --- include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp index bb6c0a75a..3202e852c 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp @@ -23,7 +23,7 @@ namespace plssvm::sycl { /** - * @brief Calculates the C-SVM kernel using the linear kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the linear kernel function. * @details Supports multi-GPU execution. * @tparam T the type of the data */ @@ -197,7 +197,7 @@ class hierarchical_device_kernel_linear { }; /** - * @brief Calculates the C-SVM kernel using the polynomial kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the polynomial kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ @@ -368,7 +368,7 @@ class hierarchical_device_kernel_poly { }; /** - * @brief Calculates the C-SVM kernel using the radial basis functions kernel function. + * @brief Calculates the C-SVM kernel using the hierarchical formulation and the radial basis functions kernel function. * @details Currently only single GPU execution is supported. * @tparam T the type of the data */ From 7c16ef01f76973d76c736d27e1d55a115fb8a7fa Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:02:59 +0100 Subject: [PATCH 30/56] Update hierarchical svm kernel invocation to be more in line with the nd_range version. 
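

Both formulations are now launched through the same queue::submit() call: the
nd_range kernels via handler::parallel_for and the hierarchical kernels via
handler::parallel_for_work_group, whose first range argument is the number of
work-groups rather than the global size. The following is a minimal,
self-contained sketch of the two launch paths; the trivial kernel functors and
the launch() helper are illustrative placeholders only (the real functors
carry the SVM buffers, see the diff below):

    #include "sycl/sycl.hpp"

    struct nd_range_kernel {
        // called once per work-item
        void operator()(::sycl::nd_item<2>) const { /* per-work-item code */ }
    };

    struct hierarchical_kernel {
        // called once per work-group; work-item code runs in the nested scope
        void operator()(::sycl::group<2> group) const {
            group.parallel_for_work_item([&](::sycl::h_item<2>) { /* per-work-item code */ });
        }
    };

    void launch(::sycl::queue &queue, ::sycl::range<2> grid, ::sycl::range<2> block, bool use_nd_range) {
        queue.submit([&](::sycl::handler &cgh) {
            if (use_nd_range) {
                // nd_range: global size = #work-groups * work-group size
                cgh.parallel_for(::sycl::nd_range<2>{ grid * block, block }, nd_range_kernel{});
            } else {
                // hierarchical: first argument is the number of work-groups
                cgh.parallel_for_work_group(grid, block, hierarchical_kernel{});
            }
        });
    }

Since the hierarchical functors no longer submit to a queue themselves, both
formulations share a single submit path and the functors stay plain callables.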
--- .../backends/SYCL/svm_kernel_hierarchical.hpp | 630 ++++++++---------- src/plssvm/backends/SYCL/csvm.cpp | 36 +- 2 files changed, 305 insertions(+), 361 deletions(-) diff --git a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp index 3202e852c..f367f45a8 100644 --- a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp @@ -34,9 +34,7 @@ class hierarchical_device_kernel_linear { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. - * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel. * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -48,142 +46,126 @@ class hierarchical_device_kernel_linear { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] id the id of the device */ - hierarchical_device_kernel_linear(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : - queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} + hierarchical_device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : + q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} /** * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
 */
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type feature_range = feature_range_;
- const real_type add = add_;
- const kernel_index_type device = device_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+ // allocate memory for work-item local variables
+ // -> accessible across different 'parallel_for_work_item' invocations
+ ::sycl::private_memory private_matr{ group };
+ ::sycl::private_memory private_data_j{ group };
+ ::sycl::private_memory private_i{ group };
+ ::sycl::private_memory private_j{ group };
+ ::sycl::private_memory private_cond{ group };
+
+ // initialize private variables
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ // indices and diagonal condition
+ private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+ private_cond(idx) = private_i(idx) >= private_j(idx);
+ if (private_cond(idx)) {
+ private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+ }
+
+ // matrix
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ private_matr(idx)[i][j] = real_type{ 0.0 };
+ }
+ }
+ });

- // matrix
- for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
- private_matr(idx)[i][j] = real_type{ 0.0 };
- }
- }
- });

- // implicit group barrier

- // load data from global in shared memory
- for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- if (private_cond(idx)) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
- const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(1) == idx_1) {
- data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
- }
- const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(0) == idx_2) {
- data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
- }
- }
- }
- });

+ // implicit group barrier
+
+ // load data from global in shared memory
+ for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) 
{ + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - real_type temp; - if (device == 0) { - temp = (private_matr(idx)[x][y] + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - } else { - temp = private_matr(idx)[x][y] * add; - } - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + 
x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - if (device == 0) { - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } else { - ret_jx += temp * d[private_i(idx) + y]; - } - } + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + real_type temp; + if (device_ == 0) { + temp = (private_matr(idx)[x][y] + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + } else { + temp = private_matr(idx)[x][y] * add_; + } + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + if (device_ == 0) { + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; + } else { + ret_jx += temp * d_[private_i(idx) + y]; } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; @@ -208,9 +190,7 @@ class hierarchical_device_kernel_poly { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. - * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel. 
* @param[in] q the `q` vector
* @param[out] ret the result vector
* @param[in] d the right-hand side of the equation
@@ -224,135 +204,117 @@ class hierarchical_device_kernel_poly {
* @param[in] gamma the gamma parameter used in the polynomial kernel function
* @param[in] coef0 the coef0 parameter used in the polynomial kernel function
*/
- hierarchical_device_kernel_poly(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
- queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
+ hierarchical_device_kernel_poly(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
+ q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}

/**
* @brief Function call operator overload performing the actual calculation.
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
*/
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type num_cols = num_cols_;
- const real_type add = add_;
- const int degree = degree_;
- const real_type gamma = gamma_;
- const real_type coef0 = coef0_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type 
data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + } + + // matrix + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } + } + }); - // matrix - for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { - private_matr(idx)[i][j] = real_type{ 0.0 }; + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform 
scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += data_i * private_data_j(idx)[k]; } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::pow(gamma * private_matr(idx)[x][y] + coef0, static_cast(degree)) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * d[private_j(idx) + x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } - } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::pow(gamma_ * private_matr(idx)[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; @@ -379,9 +341,7 @@ class hierarchical_device_kernel_radial { using real_type = T; /** - * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. 
- * @param[in] queue [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to which the kernel will be enqueued - * @param[in] range the execution range of the kernel + * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions kernel function. * @param[in] q the `q` vector * @param[out] ret the result vector * @param[in] d the right-hand side of the equation @@ -393,133 +353,117 @@ class hierarchical_device_kernel_radial { * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function */ - hierarchical_device_kernel_radial(::sycl::queue &queue, const ::plssvm::detail::execution_range &range, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : - queue_{ queue }, global_range_{ range.grid[0], range.grid[1] }, local_range_{ range.block[0], range.block[1] }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} + hierarchical_device_kernel_radial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : + q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+ * identifying an instance of the currently executing work-group
 */
- void operator()() const {
- queue_.submit([&](::sycl::handler &cgh) {
- const real_type *q = q_;
- real_type *ret = ret_;
- const real_type *d = d_;
- const real_type *data_d = data_d_;
- const real_type QA_cost = QA_cost_;
- const real_type cost = cost_;
- const kernel_index_type num_rows = num_rows_;
- const kernel_index_type num_cols = num_cols_;
- const real_type add = add_;
- const real_type gamma = gamma_;
-
- cgh.parallel_for_work_group(global_range_, local_range_, [=](::sycl::group<2> group) {
- // allocate shared memory
- real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
- real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-
- // allocate memory for work-item local variables
- // -> accessible across different 'parallel_for_work_item' invocations
- ::sycl::private_memory private_matr{ group };
- ::sycl::private_memory private_data_j{ group };
- ::sycl::private_memory private_i{ group };
- ::sycl::private_memory private_j{ group };
- ::sycl::private_memory private_cond{ group };
-
- // initialize private variables
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- // indices and diagonal condition
- private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
- private_cond(idx) = private_i(idx) >= private_j(idx);
- if (private_cond(idx)) {
- private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
- private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
- }
+ void operator()(::sycl::group<2> group) const {
+ // allocate shared memory
+ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+ // allocate memory for work-item local variables
+ // -> accessible across different 'parallel_for_work_item' invocations
+ ::sycl::private_memory private_matr{ group };
+ ::sycl::private_memory private_data_j{ group };
+ ::sycl::private_memory private_i{ group };
+ ::sycl::private_memory private_j{ group };
+ ::sycl::private_memory private_cond{ group };
+
+ // initialize private variables
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ // indices and diagonal condition
+ private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+ private_cond(idx) = private_i(idx) >= private_j(idx);
+ if (private_cond(idx)) {
+ private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+ private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+ }
+
+ // matrix
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+ #pragma unroll INTERNAL_BLOCK_SIZE
+ for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ private_matr(idx)[i][j] = real_type{ 0.0 };
+ }
+ }
+ });

- // matrix
- for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
- private_matr(idx)[i][j] = real_type{ 0.0 };
- }
- }
- });

- // implicit group barrier

- // load data from global in shared memory
- for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
- group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
- if (private_cond(idx)) {
- #pragma unroll INTERNAL_BLOCK_SIZE
- for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
- const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(1) == idx_1) {
- data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)];
- }
- const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
- if (idx.get_local_id(0) == idx_2) {
- data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)];
- }
- }
- }
- });

+ // implicit group barrier
+
+ // load data from global in shared memory
+ for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
+ group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+ if 
(private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; } - } - }); - - // implicit group barrier - - // load data from global in shared memory - for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(1) == idx_1) { - data_intern_i[idx.get_local_id(0)][block_id] = data_d[block_id + vec_index + private_i(idx)]; - } - const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; - if (idx.get_local_id(0) == idx_2) { - data_intern_j[idx.get_local_id(1)][block_id] = data_d[block_id + vec_index + private_j(idx)]; - } - } + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(0) == idx_2) { + data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)]; } - }); + } + } + }); - // implicit group barrier + // implicit group barrier - // load data from shared in private memory and perform scalar product - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { - private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; - } + // load data from shared in private memory and perform scalar product + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index]; + } - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { - const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { - private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); - } - } + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + const real_type data_i = data_intern_i[idx.get_local_id(0)][l]; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]); } - }); - - // implicit group barrier + } } + }); - // kernel function - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - if (private_cond(idx)) { - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { - real_type ret_jx = 0.0; - #pragma unroll INTERNAL_BLOCK_SIZE - for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { - const real_type temp = (::sycl::exp(-gamma * private_matr(idx)[x][y]) + QA_cost - q[private_i(idx) + y] - q[private_j(idx) + x]) * add; - if (private_i(idx) + x > private_j(idx) + y) { - // upper triangular matrix - atomic_op{ ret[private_i(idx) + y] } += temp * 
d[private_j(idx) + x]; - ret_jx += temp * d[private_i(idx) + y]; - } else if (private_i(idx) + x == private_j(idx) + y) { - // diagonal - ret_jx += (temp + cost * add) * d[private_i(idx) + y]; - } - } - atomic_op{ ret[private_j(idx) + x] } += ret_jx; + // implicit group barrier + } + + // kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + real_type ret_jx = 0.0; + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + const real_type temp = (::sycl::exp(-gamma_ * private_matr(idx)[x][y]) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_; + if (private_i(idx) + x > private_j(idx) + y) { + // upper triangular matrix + atomic_op{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x]; + ret_jx += temp * d_[private_i(idx) + y]; + } else if (private_i(idx) + x == private_j(idx) + y) { + // diagonal + ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y]; } } - }); - }); + atomic_op{ ret_[private_j(idx) + x] } += ret_jx; + } + } }); } private: - ::sycl::queue &queue_; - ::sycl::range<2> global_range_; - ::sycl::range<2> local_range_; - const real_type *q_; real_type *ret_; const real_type *d_; diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp index 669b3508a..e218fde52 100644 --- a/src/plssvm/backends/SYCL/csvm.cpp +++ b/src/plssvm/backends/SYCL/csvm.cpp @@ -190,33 +190,33 @@ void csvm::run_svm_kernel(const std::size_t device, const ::plssvm::detail::e const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, invocation_type_); switch (kernel_) { case kernel_type::linear: - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); - }); - } else { - hierarchical_device_kernel_linear(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)(); - } + } else if (invocation_type_ == kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_linear(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device)); + } + }); break; case kernel_type::polynomial: PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!"); - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); - }); - } else { - hierarchical_device_kernel_poly(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)(); - } + } else if (invocation_type_ == 
kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_poly(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_)); + } + }); break; case kernel_type::rbf: PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!"); - if (invocation_type_ == kernel_invocation_type::nd_range) { - devices_[device].submit([&](::sycl::handler &cgh) { + devices_[device].submit([&](::sycl::handler &cgh) { + if (invocation_type_ == kernel_invocation_type::nd_range) { cgh.parallel_for(execution_range, nd_range_device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); - }); - } else { - hierarchical_device_kernel_radial(devices_[device], range, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)(); - } + } else if (invocation_type_ == kernel_invocation_type::hierarchical) { + cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_radial(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_)); + } + }); break; } } From ec0ee3728968aed7fc0cc97d7a68097b0d73c341 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:34:57 +0100 Subject: [PATCH 31/56] Move source file from SYCL to base library. --- CMakeLists.txt | 1 + src/plssvm/backends/SYCL/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 967b5cfe5..44eca562b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_predict.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/plssvm/backends/SYCL/kernel_invocation_type.cpp ) ## create base library: linked against all backend libraries diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b38db8494..cef2b4073 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -45,7 +45,6 @@ set(PLSSVM_SYCL_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp - ${CMAKE_CURRENT_LIST_DIR}/kernel_invocation_type.cpp ${CMAKE_CURRENT_LIST_DIR}/../gpu_csvm.cpp ) From ff7432b45fb14d4b79bc96db42ff716e50d8048e Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 3 Mar 2022 11:56:57 +0100 Subject: [PATCH 32/56] Fix parameter output test after adding sycl_kernel_invocation_type. 
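

The expected output string now also has to contain the new
"SYCL kernel invocation type" line between the target platform and the input
filename. A condensed, hypothetical form of the check (the actual test compares
the full expected string, see the diff below):

    // sketch only: assumes a default-constructed plssvm::parameter object
    std::ostringstream out;  // requires <sstream>
    out << params;
    EXPECT_TRUE(out.str().find("SYCL kernel invocation type automatic\n") != std::string::npos);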
---
 tests/parameter_test.cpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tests/parameter_test.cpp b/tests/parameter_test.cpp
index e997ca9b9..362f3a328 100644
--- a/tests/parameter_test.cpp
+++ b/tests/parameter_test.cpp
@@ -639,20 +639,21 @@ TYPED_TEST(Parameter, output_operator) {
 
     // correct output string
     std::string correct_output =
-        fmt::format("kernel_type       linear\n"
-                    "degree            3\n"
-                    "gamma             0\n"
-                    "coef0             0\n"
-                    "cost              1\n"
-                    "epsilon           0.001\n"
-                    "print_info        true\n"
-                    "backend           openmp\n"
-                    "target platform   automatic\n"
-                    "input_filename    ''\n"
-                    "model_filename    ''\n"
-                    "predict_filename  ''\n"
-                    "rho               0\n"
-                    "real_type         {}\n",
+        fmt::format("kernel_type                  linear\n"
+                    "degree                       3\n"
+                    "gamma                        0\n"
+                    "coef0                        0\n"
+                    "cost                         1\n"
+                    "epsilon                      0.001\n"
+                    "print_info                   true\n"
+                    "backend                      openmp\n"
+                    "target platform              automatic\n"
+                    "SYCL kernel invocation type  automatic\n"
+                    "input_filename               ''\n"
+                    "model_filename               ''\n"
+                    "predict_filename             ''\n"
+                    "rho                          0\n"
+                    "real_type                    {}\n",
                     plssvm::detail::arithmetic_type_name<TypeParam>());
 
     // check for equality

From 2baa30e7b65322447a99732a90e1a7d17c7984e6 Mon Sep 17 00:00:00 2001
From: Marcel Breyer
Date: Thu, 3 Mar 2022 11:57:19 +0100
Subject: [PATCH 33/56] Add tests for SYCL nd_range AND hierarchical kernel formulations.

---
 tests/backends/SYCL/test.cpp     | 17 +++++++++++------
 tests/backends/generic_tests.hpp | 16 +++++++++-------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tests/backends/SYCL/test.cpp b/tests/backends/SYCL/test.cpp
index 89d8d9d00..727ff0e12 100644
--- a/tests/backends/SYCL/test.cpp
+++ b/tests/backends/SYCL/test.cpp
@@ -13,9 +13,10 @@
 #include "backends/generic_tests.hpp"  // generic::write_model_test, generic::generate_q_test, generic::device_kernel_test, generic::predict_test, generic::accuracy_test
 #include "utility.hpp"                 // util::google_test::parameter_definition, util::google_test::parameter_definition_to_name
 
-#include "plssvm/backends/SYCL/csvm.hpp"  // plssvm::sycl::csvm
-#include "plssvm/kernel_types.hpp"        // plssvm::kernel_type
-#include "plssvm/parameter.hpp"           // plssvm::parameter
+#include "plssvm/backends/SYCL/csvm.hpp"                    // plssvm::sycl::csvm
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
 
 #include "gtest/gtest.h"  // ::testing::StaticAssertTypeEq, ::testing::Test, ::testing::Types, TYPED_TEST_SUITE, TYPED_TEST
 
@@ -47,9 +48,13 @@ TYPED_TEST(SYCL_CSVM, generate_q) {
     generic::generate_q_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm>();
 }
 
-// check whether the device kernels are correct
-TYPED_TEST(SYCL_CSVM, device_kernel) {
-    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm>();
+// check whether the nd_range device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_nd_range) {
+    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm, plssvm::sycl::kernel_invocation_type::nd_range>();
+}
+// check whether the hierarchical device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_hierarchical) {
+    generic::device_kernel_test<typename TypeParam::real_type, TypeParam::kernel, plssvm::sycl::csvm, plssvm::sycl::kernel_invocation_type::hierarchical>();
 }
 
 // check whether the correct labels are predicted
diff --git a/tests/backends/generic_tests.hpp b/tests/backends/generic_tests.hpp
index 3fadd734d..1cdf24b33 100644
--- a/tests/backends/generic_tests.hpp
+++ b/tests/backends/generic_tests.hpp
@@ -15,12 +15,13 @@
 #include "mock_csvm.hpp"  // mock_csvm
 #include "utility.hpp"    // util::gtest_assert_floating_point_near, util::gtest_assert_floating_point_eq, util::gtest_expect_correct_csvm_factory, util::create_temp_file
 
-#include "plssvm/backend_types.hpp"             // plssvm::backend_type
-#include "plssvm/constants.hpp"                 // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
-#include "plssvm/detail/string_conversion.hpp"  // plssvm::detail::convert_to
-#include "plssvm/exceptions/exceptions.hpp"     // plssvm::exception
-#include "plssvm/kernel_types.hpp"              // plssvm::kernel_type
-#include "plssvm/parameter.hpp"                 // plssvm::parameter
+#include "plssvm/backend_types.hpp"                         // plssvm::backend_type
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/constants.hpp"                             // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/string_conversion.hpp"              // plssvm::detail::convert_to
+#include "plssvm/exceptions/exceptions.hpp"                 // plssvm::exception
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
 
 #include "fmt/format.h"   // fmt::format
 #include "fmt/ostream.h"  // can use fmt using operator<< overloads
 
@@ -124,12 +125,13 @@ inline void generate_q_test() {
     }
 }
 
-template