diff --git a/.jenkins/Jenkinsfile-multigpu-tests b/.jenkins/Jenkinsfile-multigpu-tests
index 7b3a6e3e6..a317e803a 100644
--- a/.jenkins/Jenkinsfile-multigpu-tests
+++ b/.jenkins/Jenkinsfile-multigpu-tests
@@ -148,7 +148,7 @@ pipeline {
 source /scratch/jenkins/spack/share/spack/setup-env.sh && spack load cmake@3.20.2 &&\
 module load cuda/11.2.2 &&\
 module use /home/breyerml/.modulefiles/ &&\
-module load pcsgs05/hipsycl_20_11_16 &&\
+module load plssvm/argon-gtx/hipsycl &&\
 mkdir -p build/Release_hip &&\
 cd build/Release_hip &&\
 rm -rf * &&\
@@ -166,7 +166,7 @@ pipeline {
 srun -w argon-gtx -N 1 -n 1 -t 01:00:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM --gres=gpu:2 bash -c "\
 module load cuda/11.2.2 &&\
 module use /home/breyerml/.modulefiles/ &&\
-module load pcsgs05/hipsycl_20_11_16 &&\
+module load plssvm/argon-gtx/hipsycl &&\
 cd build/Release_hip &&\
 ctest -j4 --no-compress-output -T Test --timeout 1200; \
 returncode=$? && \
@@ -189,11 +189,11 @@ pipeline {
 source /scratch/jenkins/spack/share/spack/setup-env.sh && spack load cmake@3.20.2 &&\
 module load cuda/11.2.2 &&\
 module use /home/breyerml/.modulefiles/ &&\
-module load pcsgs05/dpcpp_2022_01_18 &&\
+module load plssvm/argon-gtx/dpcpp &&\
 mkdir -p build/Release_dpcpp &&\
 cd build/Release_dpcpp &&\
 rm -rf * &&\
-cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS='cpu;nvidia:sm_61' -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON -DPLSSVM_ENABLE_LTO=OFF -S ../../ &&\
+cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS='nvidia:sm_61' -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON -DPLSSVM_ENABLE_LTO=OFF -S ../../ &&\
 make -j4 "
 '''
 }
@@ -207,7 +207,7 @@ pipeline {
 srun -w argon-gtx -N 1 -n 1 -t 01:00:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM --gres=gpu:2 bash -c "\
 module load cuda/11.2.2 &&\
 module use /home/breyerml/.modulefiles/ &&\
-module load pcsgs05/dpcpp_2022_01_18 &&\
+module load plssvm/argon-gtx/dpcpp &&\
 cd build/Release_dpcpp &&\
 ctest -j4 --no-compress-output -T Test --timeout 1200; \
 returncode=$? && \
diff --git a/.jenkins/Jenkinsfile-tests b/.jenkins/Jenkinsfile-tests
index edf238dfd..4664e8a06 100644
--- a/.jenkins/Jenkinsfile-tests
+++ b/.jenkins/Jenkinsfile-tests
@@ -112,9 +112,8 @@ pipeline {
 steps {
 dir('plssvm') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/hipsycl_20_11_16
+module load network_hipsycl_cuda
 mkdir -p build/Release_hip
 cd build/Release_hip
 rm -rf *
@@ -129,9 +128,8 @@ pipeline {
 dir('plssvm') {
 warnError('hipSYCL Release tests failed!') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/hipsycl_20_11_16
+module load network_hipsycl_cuda
 cd build/Release_hip
 ctest -j4 --no-compress-output -T Test
 '''
@@ -143,13 +141,12 @@ pipeline {
 steps {
 dir('plssvm') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/dpcpp_2022_01_18
+module load network_dpcpp_cuda
 mkdir -p build/Release_dpcpp
 cd build/Release_dpcpp
 rm -rf *
-/import/sgs.scratch/vancraar/spack/opt/spack/linux-ubuntu20.04-cascadelake/clang-12.0.0/cmake-3.20.2-z3urlvzqm5igtwxj25nnd5olciuq7ayb/bin/cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS="cpu;nvidia:sm_80" -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON ../../
+/import/sgs.scratch/vancraar/spack/opt/spack/linux-ubuntu20.04-cascadelake/clang-12.0.0/cmake-3.20.2-z3urlvzqm5igtwxj25nnd5olciuq7ayb/bin/cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS="nvidia:sm_80" -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON ../../
 make -j4
 '''
 }
@@ -160,9 +157,8 @@ pipeline {
 dir('plssvm') {
 warnError('DPC++ Release tests failed!') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/dpcpp_2022_01_18
+module load network_dpcpp_cuda
 cd build/Release_dpcpp
 ctest -j4 --no-compress-output -T Test
 '''
@@ -191,9 +187,8 @@ pipeline {
 steps {
 dir('plssvm') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/hipsycl_20_11_16
+module load network_hipsycl_cuda
 mkdir -p build/Debug_hip
 cd build/Debug_hip
 rm -rf *
@@ -203,22 +198,23 @@
 }
 }
 }
+/*
 stage('build plssvm DPC++ Debug') {
 steps {
 dir('plssvm') {
 sh '''
-module load cuda/11.2.2
 module use /home/breyerml/.modulefiles/
-module load pcsgs05/dpcpp_2022_01_18
+module load network_dpcpp_cuda
 mkdir -p build/Debug_dpcpp
 cd build/Debug_dpcpp
 rm -rf *
-/import/sgs.scratch/vancraar/spack/opt/spack/linux-ubuntu20.04-cascadelake/clang-12.0.0/cmake-3.20.2-z3urlvzqm5igtwxj25nnd5olciuq7ayb/bin/cmake -DCMAKE_BUILD_TYPE=Debug -DPLSSVM_TARGET_PLATFORMS="cpu;nvidia:sm_80" -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON ../../
+/import/sgs.scratch/vancraar/spack/opt/spack/linux-ubuntu20.04-cascadelake/clang-12.0.0/cmake-3.20.2-z3urlvzqm5igtwxj25nnd5olciuq7ayb/bin/cmake -DCMAKE_BUILD_TYPE=Debug -DPLSSVM_TARGET_PLATFORMS="nvidia:sm_80" -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON ../../
 make -j4
 '''
 }
 }
 }
+*/
 }
 post {
 always {
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 967b5cfe5..03f2a6e18 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,10 +6,10 @@ cmake_minimum_required(VERSION 3.18)
-project("PLSSVM - Parallel Least-Squares Support Vector Machine"
-        VERSION 1.0.1
+project("PLSSVM - Parallel Least Squares Support Vector Machine"
+        VERSION 1.1.0
         LANGUAGES CXX
-        DESCRIPTION "A Support Vector Machine implementation using different backends.")
+        DESCRIPTION "A Least Squares Support Vector Machine implementation using different backends.")
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/add_custom_build_type.cmake)
@@ -39,6 +39,7 @@ set(PLSSVM_BASE_SOURCES
 ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_predict.cpp
 ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp
 ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp
+${CMAKE_CURRENT_LIST_DIR}/src/plssvm/backends/SYCL/kernel_invocation_type.cpp
 )
 ## create base library: linked against all backend libraries
@@ -255,28 +256,29 @@ foreach(PLSSVM_PLATFORM ${PLSSVM_TARGET_PLATFORMS})
 if(PLSSVM_PLATFORM MATCHES "^cpu")
 # parse provided CPU architectures
 parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_CPU_TARGET_ARCHS PLSSVM_NUM_CPU_TARGET_ARCHS)
-if(NOT PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0)
-    message(FATAL_ERROR "Target platform \"cpu\" must not have any architecture specifications!")
+if(PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 1)
+    message(FATAL_ERROR "Target platform \"cpu\" must have at most one architecture specification!")
 endif()
 target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_CPU_TARGET)
 elseif(PLSSVM_PLATFORM MATCHES "^nvidia")
 # parse provided NVIDIA GPU architectures
 parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_NVIDIA_TARGET_ARCHS PLSSVM_NUM_NVIDIA_TARGET_ARCHS)
 if(PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 0)
-    message(FATAL_ERROR "Target platform \"nvidia\" must at least have one architecture specifications!")
+    message(FATAL_ERROR "Target platform \"nvidia\" must have at least one architecture specification!")
 endif()
 target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_NVIDIA_TARGET)
 elseif(PLSSVM_PLATFORM MATCHES "^amd")
 # parse provided AMD GPU architectures
 parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_AMD_TARGET_ARCHS PLSSVM_NUM_AMD_TARGET_ARCHS)
 if(PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 0)
-    message(FATAL_ERROR "Target platform \"amd\" must at least have one architecture specifications!")
+    message(FATAL_ERROR "Target platform \"amd\" must have at least one architecture specification!")
 endif()
 target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_AMD_TARGET)
 elseif(PLSSVM_PLATFORM MATCHES "^intel")
+# parse provided Intel GPU architectures
 parse_architecture_info(${PLSSVM_PLATFORM} PLSSVM_INTEL_TARGET_ARCHS PLSSVM_NUM_INTEL_TARGET_ARCHS)
-if(NOT PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0)
-    message(FATAL_ERROR "Target platform \"intel\" must not have any architecture specifications!")
+if(PLSSVM_NUM_INTEL_TARGET_ARCHS EQUAL 0)
+    message(FATAL_ERROR "Target platform \"intel\" must have at least one architecture specification!")
 endif()
 target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_HAS_INTEL_TARGET)
 else()
diff --git a/LICENSE.md b/LICENSE.md
index 800ee95c5..aa3947026 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,4 +1,4 @@
-MIT License
+# MIT License
 Copyright (c) 2021 Alexander Van Craen and Marcel Breyer @ University of Stuttgart
diff --git a/README.md b/README.md
index 3f963626d..619fdcace 100644
--- a/README.md
+++ b/README.md
@@ -1,130 +1,176 @@
-# Least Squares Support Vector Machine
+# PLSSVM - Parallel Least Squares Support Vector Machine
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e780a63075ce40c29c49d3df4f57c2af)](https://www.codacy.com/gh/SC-SGS/PLSSVM/dashboard?utm_source=github.com&utm_medium=referral&utm_content=SC-SGS/PLSSVM&utm_campaign=Badge_Grade)   [![Generate documentation](https://github.com/SC-SGS/PLSSVM/actions/workflows/documentation.yml/badge.svg)](https://sc-sgs.github.io/PLSSVM/)   [![Build Status Linux CPU + GPU](https://simsgs.informatik.uni-stuttgart.de/jenkins/buildStatus/icon?job=PLSSVM%2FMultibranch-Github%2Fmain&subject=Linux+CPU/GPU)](https://simsgs.informatik.uni-stuttgart.de/jenkins/view/PLSSVM/job/PLSSVM/job/Multibranch-Github/job/main/)   [![Windows CPU](https://github.com/SC-SGS/PLSSVM/actions/workflows/msvc_windows.yml/badge.svg)](https://github.com/SC-SGS/PLSSVM/actions/workflows/msvc_windows.yml)
-Implementation of a parallel [least squares support vector machine](https://en.wikipedia.org/wiki/Least-squares_support-vector_machine) using multiple different backends.
-The currently available backends are:
- - [OpenMP](https://www.openmp.org/)
 - [CUDA](https://developer.nvidia.com/cuda-zone)
 - [OpenCL](https://www.khronos.org/opencl/)
 - [SYCL](https://www.khronos.org/sycl/)
+A [Support Vector Machine (SVM)](https://en.wikipedia.org/wiki/Support-vector_machine) is a supervised machine learning model.
+In its basic form SVMs are used for binary classification tasks.
+Their fundamental idea is to learn a hyperplane which separates the two classes best, i.e., where the widest possible margin around its decision boundary is free of data.
+This is also the reason why SVMs are called "large margin classifiers".
+To predict to which class a new, unseen data point belongs, the SVM simply has to calculate on which side of the previously calculated hyperplane the data point lies.
+This is very efficient since it only involves a single scalar product of the size corresponding to the number of features of the data set.
+
+However, normal SVMs offer only limited potential for parallelization.
+Determining the hyperplane boils down to solving a convex quadratic problem.
+For this, most SVM implementations use Sequential Minimal Optimization (SMO), an inherently sequential algorithm.
+The basic idea of this algorithm is that it takes a pair of data points and calculates the hyperplane between them.
+Afterward, two new data points are selected and the existing hyperplane is adjusted accordingly.
+This procedure is repeated until a new adjustment would be smaller than some epsilon greater than zero.
+
+Some SVM implementations try to harness some parallelization potential by not drawing pairs of points but groups of points.
+In this case, the hyperplane calculation inside this group is parallelized.
+However, even then modern highly parallel hardware cannot be utilized efficiently.
+
+Therefore, we implemented a version of the originally proposed SVM called [Least Squares Support Vector Machine (LS-SVM)](https://en.wikipedia.org/wiki/Least-squares_support-vector_machine).
+The LS-SVM reformulates the original problem such that it boils down to solving a system of linear equations.
+For this kind of problem many highly parallel algorithms and implementations are known.
+We decided to use the [Conjugate Gradient (CG)](https://en.wikipedia.org/wiki/Conjugate_gradient_method) method to solve the system of linear equations.
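To make the CG connection concrete, the following minimal, sequential sketch solves a dense symmetric positive definite system Ax = b. It is purely illustrative — the function name and the dense matrix representation are our own and not the PLSSVM API — but it marks the matrix-vector product that PLSSVM parallelizes on its different backends:

```cpp
// Illustrative, sequential conjugate gradient solver for Ax = b (A symmetric
// positive definite); PLSSVM instead parallelizes the implicit matrix-vector
// product on the selected backend.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> conjugate_gradient(const std::vector<std::vector<double>> &A,
                                       const std::vector<double> &b,
                                       const double eps = 1e-3) {
    const std::size_t n = b.size();
    std::vector<double> x(n, 0.0), r = b, d = b, Ad(n);
    double delta = 0.0;  // squared norm of the residual r
    for (std::size_t i = 0; i < n; ++i) { delta += r[i] * r[i]; }
    for (std::size_t iter = 0; iter < n && std::sqrt(delta) > eps; ++iter) {
        // matrix-vector product: the hot spot that is parallelized in PLSSVM
        for (std::size_t i = 0; i < n; ++i) {
            Ad[i] = 0.0;
            for (std::size_t j = 0; j < n; ++j) { Ad[i] += A[i][j] * d[j]; }
        }
        double dAd = 0.0;
        for (std::size_t i = 0; i < n; ++i) { dAd += d[i] * Ad[i]; }
        const double alpha = delta / dAd;  // optimal step size along d
        for (std::size_t i = 0; i < n; ++i) { x[i] += alpha * d[i]; r[i] -= alpha * Ad[i]; }
        double delta_new = 0.0;
        for (std::size_t i = 0; i < n; ++i) { delta_new += r[i] * r[i]; }
        const double beta = delta_new / delta;
        // next search direction, conjugate to all previous ones
        for (std::size_t i = 0; i < n; ++i) { d[i] = r[i] + beta * d[i]; }
        delta = delta_new;
    }
    return x;
}
```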
+
+Since one of our main goals was performance, we parallelized the implicit matrix-vector multiplication inside the CG algorithm.
+To do so, we use multiple different frameworks to be able to target a broad variety of different hardware platforms.
+The currently available frameworks (also called backends in our PLSSVM implementation) are:
+
+- [OpenMP](https://www.openmp.org/)
+- [CUDA](https://developer.nvidia.com/cuda-zone)
+- [OpenCL](https://www.khronos.org/opencl/)
+- [SYCL](https://www.khronos.org/sycl/) (tested implementations are [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL))
 ## Getting Started
 ### Dependencies
 General dependencies:
- - a C++17 capable compiler (e.g. [`gcc`](https://gcc.gnu.org/) or [`clang`](https://clang.llvm.org/))
 - [CMake](https://cmake.org/) 3.18 or newer
 - [cxxopts](https://github.com/jarro2783/cxxopts), [fast_float](https://github.com/fastfloat/fast_float) and [{fmt}](https://github.com/fmtlib/fmt) (all three are automatically build during the CMake configuration if they couldn't be found using the respective `find_package` call)
 - [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically build during the CMake configuration if `find_package(GTest)` wasn't successful)
 - [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled
 - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed-up file parsing
+
+- a C++17 capable compiler (e.g. [`gcc`](https://gcc.gnu.org/) or [`clang`](https://clang.llvm.org/))
+- [CMake](https://cmake.org/) 3.18 or newer
+- [cxxopts](https://github.com/jarro2783/cxxopts), [fast_float](https://github.com/fastfloat/fast_float) and [{fmt}](https://github.com/fmtlib/fmt) (all three are automatically built during the CMake configuration if they couldn't be found using the respective `find_package` call)
+- [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically built during the CMake configuration if `find_package(GTest)` wasn't successful)
+- [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled
+- [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed up file parsing
+- multiple Python modules used in the utility scripts; to install all modules, use `pip install --user -r install/python_requirements.txt`
 Additional dependencies for the OpenMP backend:
- - compiler with OpenMP support
+
+- compiler with OpenMP support
 Additional dependencies for the CUDA backend:
- - CUDA SDK
 - either NVIDIA [`nvcc`](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) or [`clang` with CUDA support enabled](https://llvm.org/docs/CompileCudaWithLLVM.html)
+
+- CUDA SDK
+- either NVIDIA [`nvcc`](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) or [`clang` with CUDA support enabled](https://llvm.org/docs/CompileCudaWithLLVM.html)
 Additional dependencies for the OpenCL backend:
- - OpenCL runtime and header files
+
+- OpenCL runtime and header files
 Additional dependencies for the SYCL backend:
- - the code must be compiled with a SYCL capable compiler; currently tested with [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL)
+
+- the code must be compiled with a SYCL capable compiler; currently tested with [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL)
 Additional dependencies if `PLSSVM_ENABLE_TESTING` and `PLSSVM_GENERATE_TEST_FILE` are both set to `ON`:
- - [Python3](https://www.python.org/) with the [`argparse`](https://docs.python.org/3/library/argparse.html), [`timeit`](https://docs.python.org/3/library/timeit.html) and [`sklearn`](https://scikit-learn.org/stable/) modules
+- [Python3](https://www.python.org/) with the [`argparse`](https://docs.python.org/3/library/argparse.html), [`timeit`](https://docs.python.org/3/library/timeit.html) and [`sklearn`](https://scikit-learn.org/stable/) modules
 ### Building
 Building the library can be done using the normal CMake approach:
 ```bash
-> git clone git@gitlab-sim.informatik.uni-stuttgart.de:vancraar/Bachelor-Code.git SVM
-> cd SVM/SVM
-> mkdir build && cd build
-> cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] ..
-> cmake --build .
+git clone git@github.com:SC-SGS/PLSSVM.git
+cd PLSSVM
+mkdir build && cd build
+cmake -DPLSSVM_TARGET_PLATFORMS="..." [optional_options] ..
+cmake --build .
 ```
 #### Target Platform Selection
 The **required** CMake option `PLSSVM_TARGET_PLATFORMS` is used to determine for which targets the backends should be compiled.
 Valid targets are:
- - `cpu`: compile for the CPU; **no** architectural specifications is allowed
 - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g. `nvidia:sm_86,sm_70`
 - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g. `amd:gfx906`
 - `intel`: compile for Intel GPUs; **no** architectural specification is allowed
+
+- `cpu`: compile for the CPU; an **optional** architectural specification is allowed but only used when compiling with DPC++, e.g., `cpu:avx2`
+- `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g., `nvidia:sm_86,sm_70`
+- `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g., `amd:gfx906`
+- `intel`: compile for Intel GPUs; **at least one** architectural specification is necessary, e.g., `intel:skl`
 At least one of the above targets must be present.
-To retrieve the architectural specification, given an NVIDIA or AMD GPU name, a simple Python3 script `utility/gpu_name_to_arch.py` is provided
-(requiring Python3 [`argparse`](https://docs.python.org/3/library/argparse.html) as dependency):
+Note that when using DPC++, only a single architectural specification for `cpu` or `amd` is allowed.
+
+To retrieve the architectural specifications of the current system, a simple Python3 script `utility/plssvm_target_platforms.py` is provided
+(required Python3 dependencies:
+[`argparse`](https://docs.python.org/3/library/argparse.html), [`py-cpuinfo`](https://pypi.org/project/py-cpuinfo/),
+[`GPUtil`](https://pypi.org/project/GPUtil/), [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/), and
+[`pylspci`](https://pypi.org/project/pylspci/))
 ```bash
-> python3 utility/gpu_name_to_arch.py --help
-usage: gpu_name_to_arch.py [-h] [--name NAME]
+python3 utility/plssvm_target_platforms.py --help
+usage: plssvm_target_platforms.py [-h] [--quiet]
 optional arguments:
-  -h, --help   show this help message and exit
-  --name NAME  the full name of the GPU (e.g. GeForce RTX 3080)
+  -h, --help  show this help message and exit
+  --quiet     only output the final PLSSVM_TARGET_PLATFORMS string
 ```
 Example invocations:
 ```bash
-> python3 utility_scripts/gpu_name_to_arch.py --name "GeForce RTX 3080"
-sm_86
-> python3 utility_scripts/gpu_name_to_arch.py --name "Radeon VII"
-gfx906
-```
+python3 utility_scripts/plssvm_target_platforms.py
+Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz: {'avx512': True, 'avx2': True, 'avx': True, 'sse4_2': True}
+
+Found 1 NVIDIA GPU(s):
+  1x NVIDIA GeForce RTX 3080: sm_86
-If no GPU name is provided, the script tries to automatically detect any NVIDIA or AMD GPU
-(requires the Python3 dependencies [`GPUtil`](https://pypi.org/project/GPUtil/) and [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/)).
+Possible -DPLSSVM_TARGET_PLATFORMS entries:
+cpu:avx512;nvidia:sm_86
+
+python3 utility_scripts/plssvm_target_platforms.py --quiet
+cpu:avx512;intel:dg1
+```
 If the architectural information for the requested GPU could not be retrieved, one option would be to have a look at:
- - for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
 - for AMD GPUs: [ROCm Documentation](https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst)
+- for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)
+- for AMD GPUs: [clang AMDGPU backend usage](https://llvm.org/docs/AMDGPUUsage.html)
+- for Intel GPUs and CPUs: [Ahead of Time Compilation](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) and [Intel graphics processor table](https://dgpu-docs.intel.com/devices/hardware-table.html)
 #### Optional CMake Options
 The `[optional_options]` can be one or multiple of:
- - `PLSSVM_ENABLE_OPENMP_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
   - `ON`: check for the OpenMP backend and fail if not available
   - `AUTO`: check for the OpenMP backend but **do not** fail if not available
   - `OFF`: do not check for the OpenMP backend
 - `PLSSVM_ENABLE_CUDA_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
   - `ON`: check for the CUDA backend and fail if not available
   - `AUTO`: check for the CUDA backend but **do not** fail if not available
   - `OFF`: do not check for the CUDA backend
 - `PLSSVM_ENABLE_OPENCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
   - `ON`: check for the OpenCL backend and fail if not available
   - `AUTO`: check for the OpenCL backend but **do not** fail if not available
   - `OFF`: do not check for the OpenCL backend
 - `PLSSVM_ENABLE_SYCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
   - `ON`: check for the SYCL backend and fail if not available
   - `AUTO`: check for the SYCL backend but **do not** fail if not available
   - `OFF`: do not check for the SYCL backend
+- `PLSSVM_ENABLE_OPENMP_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
+  - `ON`: check for the OpenMP backend and fail if not available
+  - `AUTO`: check for the OpenMP backend but **do not** fail if not available
+  - `OFF`: do not check for the OpenMP backend
+
+- `PLSSVM_ENABLE_CUDA_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
+  - `ON`: check for the CUDA backend and fail if not available
+  - `AUTO`: check for the CUDA backend but **do not** fail if not available
+  - `OFF`: do not check for the CUDA backend
+
+- `PLSSVM_ENABLE_OPENCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
+  - `ON`: check for the OpenCL backend and fail if not available
+  - `AUTO`: check for the OpenCL backend but **do not** fail if not available
+  - `OFF`: do not check for the OpenCL backend
+
+- `PLSSVM_ENABLE_SYCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
+  - `ON`: check for the SYCL backend and fail if not available
+  - `AUTO`: check for the SYCL backend but **do not** fail if not available
+  - `OFF`: do not check for the SYCL backend
 **Attention:** at least one backend must be enabled and available!
- - `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions regardless whether the `DEBUG` macro is defined or not
 - `PLSSVM_THREAD_BLOCK_SIZE` (default: `16`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations)
 - `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `6`: set a specific internal block size used in the GPU kernels (for fine-tuning optimizations)
 - `PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION` (default: `OFF`): enables single precision calculations instead of double precision for the `svm-train` and `svm-predict` executables
 - `PLSSVM_ENABLE_LTO=ON|OFF` (default: `ON`): enable interprocedural optimization (IPO/LTO) if supported by the compiler
 - `PLSSVM_ENABLE_DOCUMENTATION=ON|OFF` (default: `OFF`): enable the `doc` target using doxygen
 - `PLSSVM_ENABLE_TESTING=ON|OFF` (default: `ON`): enable testing using GoogleTest and ctest
 - `PLSSVM_GENERATE_TIMING_SCRIPT=ON|OFF` (default: `OFF`): configure a timing script usable for performance measurement
+- `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions regardless of whether the `DEBUG` macro is defined or not
+- `PLSSVM_THREAD_BLOCK_SIZE` (default: `16`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations)
+- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `6`): set a specific internal block size used in the GPU kernels (for fine-tuning optimizations)
+- `PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION` (default: `OFF`): enables single precision calculations instead of double precision for the `svm-train` and `svm-predict` executables
+- `PLSSVM_ENABLE_LTO=ON|OFF` (default: `ON`): enable interprocedural optimization (IPO/LTO) if supported by the compiler
+- `PLSSVM_ENABLE_DOCUMENTATION=ON|OFF` (default: `OFF`): enable the `doc` target using doxygen
+- `PLSSVM_ENABLE_TESTING=ON|OFF` (default: `ON`): enable testing using GoogleTest and ctest
+- `PLSSVM_GENERATE_TIMING_SCRIPT=ON|OFF` (default: `OFF`): configure a timing script usable for performance measurement
 If `PLSSVM_ENABLE_TESTING` is set to `ON`, the following options can also be set:
- - `PLSSVM_GENERATE_TEST_FILE=ON|OFF` (default: `ON`): automatically generate test files
   - `PLSSVM_TEST_FILE_NUM_DATA_POINTS` (default: `5000`): the number of data points in the test file
+
+- `PLSSVM_GENERATE_TEST_FILE=ON|OFF` (default: `ON`): automatically generate test files
+  - `PLSSVM_TEST_FILE_NUM_DATA_POINTS` (default: `5000`): the number of data points in the test file
 If the SYCL backend is available and DPC++ is used, the option `PLSSVM_SYCL_DPCPP_USE_LEVEL_ZERO` can be used to select Level-Zero as the DPC++ backend instead of OpenCL.
@@ -135,7 +181,7 @@ To use DPC++ as compiler simply set the `CMAKE_CXX_COMPILER` to the respective D
 To run the tests after building the library (with `PLSSVM_ENABLE_TESTING` set to `ON`) use:
 ```bash
-> ctest
+ctest
 ```
 ### Generating test coverage results
@@ -144,10 +190,10 @@ To enable the generation of test coverage reports using `lcov` the library must
 Additionally, it's advisable to use smaller test files to shorten the `ctest` step.
 ```bash
-> cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \
-    -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \
-    -DPLSSVM_TEST_FILE_NUM_FEATURES=50 ..
-> cmake --build . -- coverage
+cmake -DCMAKE_BUILD_TYPE=Coverage -DPLSSVM_TARGET_PLATFORMS="..." \
+    -DPLSSVM_TEST_FILE_NUM_DATA_POINTS=100 \
+    -DPLSSVM_TEST_FILE_NUM_FEATURES=50 ..
+cmake --build . -- coverage
 ```
 The resulting `html` coverage report is located in the `coverage` folder in the build directory.
@@ -155,9 +201,11 @@ The resulting `html` coverage report is located in the `coverage` folder in the
 ### Creating the documentation
 If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON` the documentation can be built using
+
 ```bash
-> make doc
+make doc
 ```
+
 The documentation of the current state of the main branch can be found [here](https://sc-sgs.github.io/PLSSVM/).
 ## Installing
@@ -165,7 +213,7 @@ The documentation of the current state of the main branch can be found [here](ht
 The library supports the `install` target:
 ```bash
-> cmake --build . -- install
+cmake --build . -- install
 ```
 ## Usage
@@ -176,13 +224,13 @@ The repository comes with a Python3 script (in the `utility_scripts/` directory)
 In order to use all functionality, the following Python3 modules must be installed:
 [`argparse`](https://docs.python.org/3/library/argparse.html), [`timeit`](https://docs.python.org/3/library/timeit.html),
-[`numpy`](https://pypi.org/project/numpy/), [`pandas`](https://pypi.org/project/pandas/),
-[`sklearn`](https://scikit-learn.org/stable/), [`arff`](https://pypi.org/project/arff/),
+[`numpy`](https://pypi.org/project/numpy/), [`pandas`](https://pypi.org/project/pandas/),
+[`sklearn`](https://scikit-learn.org/stable/), [`arff`](https://pypi.org/project/arff/),
 [`matplotlib`](https://pypi.org/project/matplotlib/) and [`mpl_toolkits`](https://pypi.org/project/matplotlib/)
 ```bash
-> python3 utility_scripts/generate_data**.py --help
+python3 utility_scripts/generate_data.py --help
 usage: generate_data.py [-h] --output OUTPUT --format FORMAT [--problem PROBLEM] --samples SAMPLES [--test_samples TEST_SAMPLES] --features FEATURES [--plot]
 optional arguments:
@@ -200,20 +248,20 @@ optional arguments:
 An example invocation generating a data set consisting of blobs with 1000 data points with 200 features each could look like:
 ```bash
-> python3 generate_data.py --ouput data_file --format libsvm --problem blobs --samples 1000 --features 200
+python3 generate_data.py --output data_file --format libsvm --problem blobs --samples 1000 --features 200
 ```
 ### Training
 ```bash
-> ./svm-train --help
+./svm-train --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-train [OPTION...] training_set_file [model_file]
   -t, --kernel_type arg       set type of kernel function.
       0 -- linear: u'*v
-      1 -- polynomial: (gamma*u'*v + coef0)^degree
+      1 -- polynomial: (gamma*u'*v + coef0)^degree
       2 -- radial basis function: exp(-gamma*|u-v|^2) (default: 0)
  -d, --degree arg            set degree in kernel function (default: 3)
  -g, --gamma arg             set gamma in kernel function (default: 1 / num_features)
@@ -222,36 +270,42 @@ Usage:
  -e, --epsilon arg           set the tolerance of termination criterion (default: 0.001)
  -b, --backend arg           choose the backend: openmp|cuda|opencl|sycl (default: openmp)
  -p, --target_platform arg   choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic)
+     --sycl_kernel_invocation_type arg
+                             choose the kernel invocation type when using SYCL as backend: automatic|nd_range|hierarchical (default: automatic)
  -q, --quiet                 quiet mode (no outputs)
  -h, --help                  print this helper message
      --input training_set_file
-
-     --model model_file
+
+     --model model_file
 ```
 An example invocation using the CUDA backend could look like:
 ```bash
-> ./svm-train --backend cuda --input /path/to/data_file
+./svm-train --backend cuda --input /path/to/data_file
 ```
 Another example targeting NVIDIA GPUs using the SYCL backend looks like:
 ```bash
-> ./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
+./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
 ```
-The `--target_platform=automatic` flags works for the different backends as follows:
+The `--target_platform=automatic` flag works for the different backends as follows:
 - `OpenMP`: always selects a CPU
 - `CUDA`: always selects an NVIDIA GPU (if no NVIDIA GPU is available, throws an exception)
 - `OpenCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
 - `SYCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
+The `--sycl_kernel_invocation_type` flag is only used if the `--backend` is `sycl`; otherwise, a warning is emitted on `stderr`.
+If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is always used,
+except for hipSYCL on CPUs where the hierarchical formulation is used instead.
+
 ### Predicting
 ```bash
-> ./svm-predict --help
+./svm-predict --help
 LS-SVM with multiple (GPU-)backends
 Usage:
   ./svm-predict [OPTION...] test_file model_file [output_file]
@@ -268,13 +322,13 @@ Usage:
 An example invocation could look like:
 ```bash
-> ./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend cuda --test /path/to/test_file --model /path/to/model_file
 ```
 Another example targeting NVIDIA GPUs using the SYCL backend looks like:
 ```bash
-> ./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
+./svm-predict --backend sycl --target_platform gpu_nvidia --test /path/to/test_file --model /path/to/model_file
 ```
 The `--target_platform=automatic` flag works like in the training (`./svm-train`) case.
@@ -290,7 +344,7 @@ A simple C++ program (`main.cpp`) using this library could look like:
 #include
 #include
-int main(i) {
+int main() {
 try {
 // parse SVM parameter from command line
 plssvm::parameter params;
@@ -339,7 +393,6 @@ target_compile_features(prog PUBLIC cxx_std_17)
 target_link_libraries(prog PUBLIC plssvm::svm-all)
 ```
-
 ## License
 The PLSSVM library is distributed under the MIT [license](https://github.com/SC-SGS/PLSSVM/blob/main/LICENSE.md).
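The README rule quoted above ("`nd_range` everywhere, except hipSYCL on CPUs") can be made explicit with a small sketch. The enum values mirror the new `plssvm::sycl::kernel_invocation_type` header added in this changeset, but the resolver function and its boolean parameters are hypothetical illustrations, not the actual PLSSVM implementation:

```cpp
// Illustrative only: resolve the documented `automatic` rule for the SYCL
// kernel invocation type; the enum mirrors kernel_invocation_type.hpp, the
// function and parameter names are assumptions.
enum class kernel_invocation_type { automatic, nd_range, hierarchical };

kernel_invocation_type resolve_invocation_type(const kernel_invocation_type requested,
                                               const bool is_hipsycl,
                                               const bool targets_cpu) {
    if (requested != kernel_invocation_type::automatic) {
        return requested;  // an explicit user choice is used as-is
    }
    // nd_range everywhere, except hipSYCL on CPUs -> hierarchical formulation
    return (is_hipsycl && targets_cpu) ? kernel_invocation_type::hierarchical
                                       : kernel_invocation_type::nd_range;
}
```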
diff --git a/cmake/assemble_summary_string.cmake b/cmake/assemble_summary_string.cmake
index 80e73c24e..d4c31d6c7 100644
--- a/cmake/assemble_summary_string.cmake
+++ b/cmake/assemble_summary_string.cmake
@@ -8,7 +8,11 @@ function(assemble_summary_string out_var)
 set(PLSSVM_SUMMARY_STRING_ASSEMBLE "")
 if(DEFINED PLSSVM_CPU_TARGET_ARCHS)
 # add cpu platform
-string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,")
+if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 0)
+    string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu,")
+else()
+    string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " cpu (${PLSSVM_CPU_TARGET_ARCHS}),")
+endif()
 endif()
 if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
 # add nvidia platform
@@ -20,7 +24,7 @@ function(assemble_summary_string out_var)
 endif()
 if(DEFINED PLSSVM_INTEL_TARGET_ARCHS)
 # add intel platform
-string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel,")
+string(APPEND PLSSVM_SUMMARY_STRING_ASSEMBLE " intel (${PLSSVM_INTEL_TARGET_ARCHS}),")
 endif()
 # remove last comma
 string(REGEX REPLACE ",$" "" PLSSVM_SUMMARY_STRING_ASSEMBLE "${PLSSVM_SUMMARY_STRING_ASSEMBLE}")
diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp
index fad3153dc..fcbdee701 100644
--- a/include/plssvm/backends/OpenCL/detail/utility.hpp
+++ b/include/plssvm/backends/OpenCL/detail/utility.hpp
@@ -46,7 +46,8 @@ namespace plssvm::opencl::detail {
 void device_assert(error_code code, std::string_view msg = "");
 /**
- * @brief Returns the list of devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ *        (only interesting if the provided @p target was automatic).
 * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
 * 1. NVIDIA GPUs
 * 2. AMD GPUs
@@ -54,9 +55,9 @@ void device_assert(error_code code, std::string_view msg = "");
 * 4. CPUs
 *
 * @param[in] target the target platform for which the devices must match
- * @return the command queues (`[[nodiscard]]`)
+ * @return the command queues and used target platform (`[[nodiscard]]`)
 */
-[[nodiscard]] std::vector<command_queue> get_command_queues(target_platform target);
+[[nodiscard]] std::pair<std::vector<command_queue>, target_platform> get_command_queues(target_platform target);
 /**
 * @brief Wait for the compute device associated with @p queue to finish.
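Since `get_command_queues` now additionally reports which target platform was actually chosen, callers have to unpack a pair. A minimal usage sketch — assuming the reconstructed `std::pair<std::vector<command_queue>, target_platform>` return type shown above; the surrounding function and the `plssvm/target_platforms.hpp` header name are assumptions, only the called API comes from the diff:

```cpp
// Hedged usage sketch for the new pair-returning signature.
#include "plssvm/backends/OpenCL/detail/utility.hpp"  // get_command_queues (from the diff above)
#include "plssvm/target_platforms.hpp"                // plssvm::target_platform (assumed header name)

void setup_opencl_devices(const plssvm::target_platform target) {
    // structured bindings expose the actually used platform, which is only
    // interesting if `target` was plssvm::target_platform::automatic
    auto [queues, used_target] = plssvm::opencl::detail::get_command_queues(target);
    // ... use `queues` for device allocations and kernel launches,
    //     and e.g. log `used_target` for the user ...
}
```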
diff --git a/include/plssvm/backends/SYCL/csvm.hpp b/include/plssvm/backends/SYCL/csvm.hpp
index 1ecda3705..2287fedff 100644
--- a/include/plssvm/backends/SYCL/csvm.hpp
+++ b/include/plssvm/backends/SYCL/csvm.hpp
@@ -11,8 +11,9 @@
 #pragma once
-#include "plssvm/backends/SYCL/detail/device_ptr.hpp"  // plssvm::sycl::detail::device_ptr
-#include "plssvm/backends/gpu_csvm.hpp"  // plssvm::detail::gpu_csvm
+#include "plssvm/backends/SYCL/detail/device_ptr.hpp"  // plssvm::sycl::detail::device_ptr
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/backends/gpu_csvm.hpp"  // plssvm::detail::gpu_csvm
 #include "sycl/sycl.hpp"  // sycl::queue
@@ -45,6 +46,7 @@ class csvm : public ::plssvm::detail::gpu_csvm;
diff --git a/include/plssvm/backends/SYCL/detail/utility.hpp b/include/plssvm/backends/SYCL/detail/utility.hpp
index d6afe8bb1..17f0b60e3 100644
--- a/include/plssvm/backends/SYCL/detail/utility.hpp
+++ b/include/plssvm/backends/SYCL/detail/utility.hpp
@@ -15,12 +15,14 @@
 #include "sycl/sycl.hpp"  // sycl::queue
-#include <vector>  // std::vector
+#include <utility>  // std::pair
+#include <vector>  // std::vector
 namespace plssvm::sycl::detail {
 /**
- * @brief Returns the list of devices matching the target platform @p target.
+ * @brief Returns the list of devices matching the target platform @p target and the actually used target platform
+ *        (only interesting if the provided @p target was automatic).
 * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
 * 1. NVIDIA GPUs
 * 2. AMD GPUs
@@ -28,9 +30,9 @@ namespace plssvm::sycl::detail {
 * 4. CPUs
 *
 * @param[in] target the target platform for which the devices must match
- * @return the devices (`[[nodiscard]]`)
+ * @return the devices and used target platform (`[[nodiscard]]`)
 */
-[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target);
+[[nodiscard]] std::pair<std::vector<::sycl::queue>, target_platform> get_device_list(target_platform target);
 /**
 * @brief Wait for the compute device associated with @p queue to finish.
 * @param[in] queue the SYCL queue to synchronize
diff --git a/include/plssvm/backends/SYCL/kernel_invocation_type.hpp b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp
new file mode 100644
index 000000000..1bd291d04
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel_invocation_type.hpp
@@ -0,0 +1,46 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines all available kernel invocation types when using SYCL.
+ */
+
+#pragma once
+
+#include <iosfwd>  // forward declare std::ostream and std::istream
+
+namespace plssvm::sycl {
+
+/**
+ * @brief Enum class for all possible SYCL kernel invocation types.
+ */
+enum class kernel_invocation_type {
+    /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */
+    automatic,
+    /** Use the [*nd_range* invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */
+    nd_range,
+    /** Use the SYCL specific [hierarchical invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_hierarchical_invoke). */
+    hierarchical
+};
+
+/**
+ * @brief Output the @p invocation type to the given output-stream @p out.
+ * @param[in,out] out the output-stream to write the kernel invocation type to
+ * @param[in] invocation the SYCL kernel invocation type
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation);
+
+/**
+ * @brief Use the input-stream @p in to initialize the @p invocation type.
+ * @param[in,out] in input-stream to extract the kernel invocation type from
+ * @param[out] invocation the SYCL kernel invocation type
+ * @return the input-stream
+ */
+std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation);
+
+} // namespace plssvm::sycl
diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp
index 98b666676..3e1b02435 100644
--- a/include/plssvm/backends/SYCL/predict_kernel.hpp
+++ b/include/plssvm/backends/SYCL/predict_kernel.hpp
@@ -44,12 +44,11 @@ class device_kernel_w_linear {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
+ *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<1> nd_idx) const {
-    const kernel_index_type index = nd_idx.get_global_linear_id();
-    real_type temp = 0;
+void operator()(::sycl::id<1> index) const {
+    real_type temp{ 0.0 };
 if (index < num_features_) {
 for (kernel_index_type dat = 0; dat < num_data_points_ - 1; ++dat) {
 temp += alpha_d_[dat] * data_d_[dat + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * index];
 }
@@ -99,12 +98,11 @@ class device_kernel_predict_poly {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<2> nd_idx) const {
-    const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-    const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+void operator()(::sycl::nd_item<2> idx) const {
+    const kernel_index_type data_point_index = idx.get_global_id(0);
+    const kernel_index_type predict_point_index = idx.get_global_id(1);
 real_type temp = 0;
 if (predict_point_index < num_predict_points_) {
@@ -165,12 +163,11 @@ class device_kernel_predict_radial {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<2> nd_idx) const {
-    const kernel_index_type data_point_index = nd_idx.get_global_id(0);
-    const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
+void operator()(::sycl::nd_item<2> idx) const {
+    const kernel_index_type data_point_index = idx.get_global_id(0);
+    const kernel_index_type predict_point_index = idx.get_global_id(1);
 real_type temp = 0;
 if (predict_point_index < num_predict_points_) {
diff --git a/include/plssvm/backends/SYCL/q_kernel.hpp b/include/plssvm/backends/SYCL/q_kernel.hpp
index 6c5a6e375..2a27998d6 100644
--- a/include/plssvm/backends/SYCL/q_kernel.hpp
+++ b/include/plssvm/backends/SYCL/q_kernel.hpp
@@ -41,11 +41,10 @@ class device_kernel_q_linear {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
+ *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<1> item) const {
-    const kernel_index_type index = item.get_global_linear_id();
+void operator()(::sycl::id<1> index) const {
 real_type temp{ 0.0 };
 for (kernel_index_type i = 0; i < feature_range_; ++i) {
 temp += data_d_[i * num_rows_ + index] * data_last_[i];
@@ -88,11 +87,10 @@ class device_kernel_q_poly {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
+ *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<1> item) const {
-    const kernel_index_type index = item.get_global_linear_id();
+void operator()(::sycl::id<1> index) const {
 real_type temp{ 0.0 };
 for (kernel_index_type i = 0; i < num_cols_; ++i) {
 temp += data_d_[i * num_rows_ + index] * data_last_[i];
@@ -136,11 +134,10 @@ class device_kernel_q_radial {
 /**
 * @brief Function call operator overload performing the actual calculation.
- * @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
- *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+ * @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
+ *            identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
 */
-void operator()(::sycl::nd_item<1> item) const {
-    const kernel_index_type index = item.get_global_linear_id();
+void operator()(::sycl::id<1> index) const {
 real_type temp{ 0.0 };
 for (kernel_index_type i = 0; i < num_cols_; ++i) {
 temp += (data_d_[i * num_rows_ + index] - data_last_[i]) * (data_d_[i * num_rows_ + index] - data_last_[i]);
diff --git a/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
new file mode 100644
index 000000000..f367f45a8
--- /dev/null
+++ b/include/plssvm/backends/SYCL/svm_kernel_hierarchical.hpp
@@ -0,0 +1,479 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the kernel functions for the C-SVM using the SYCL backend.
+ */
+
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp"    // plssvm::sycl::atomic_op
+#include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
+#include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/execution_range.hpp"          // plssvm::detail::execution_range
+
+#include "sycl/sycl.hpp"  // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp
+
+#include <cstddef>  // std::size_t
+
+namespace plssvm::sycl {
+
+/**
+ * @brief Calculates the C-SVM kernel using the hierarchical formulation and the linear kernel function.
+ * @details Supports multi-GPU execution.
+ * @tparam T the type of the data
+ */
+template <typename T>
+class hierarchical_device_kernel_linear {
+  public:
+    /// The type of the data.
+    using real_type = T;
+
+    /**
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel.
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimension data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] feature_range the number of features used for the calculation on the device @p id
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] id the id of the device
+     */
+    hierarchical_device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+        q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+     *            identifying an instance of the currently executing work-group
+     */
+    void operator()(::sycl::group<2> group) const {
+        // allocate shared memory
+        real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+        real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+        // allocate memory for work-item local variables
+        // -> accessible across different 'parallel_for_work_item' invocations
+        ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+        ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+        ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+        ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+        ::sycl::private_memory<bool, 2> private_cond{ group };
+
+        // initialize private variables
+        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+            // indices and diagonal condition
+            private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+            private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+            private_cond(idx) = private_i(idx) >= private_j(idx);
+            if (private_cond(idx)) {
+                private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+            }
+
+            // matrix
+            #pragma unroll INTERNAL_BLOCK_SIZE
+            for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                #pragma unroll INTERNAL_BLOCK_SIZE
+                for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                    private_matr(idx)[i][j] = real_type{ 0.0 };
+                }
+            }
+        });
+
+        // implicit group barrier
+
+        // load data from global in shared memory
+        for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                        const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                        if (idx.get_local_id(1) == idx_1) {
+                            data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)];
+                        }
+                        const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                        if (idx.get_local_id(0) == idx_2) {
+                            data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+
+            // load data from shared in private memory and perform scalar product
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                        private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                    }
+
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                        const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                            private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+        }
+
+        // kernel function
+        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+            if (private_cond(idx)) {
+                #pragma unroll INTERNAL_BLOCK_SIZE
+                for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                    real_type ret_jx = 0.0;
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                        real_type temp;
+                        if (device_ == 0) {
+                            temp = (private_matr(idx)[x][y] + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
+                        } else {
+                            temp = private_matr(idx)[x][y] * add_;
+                        }
+                        if (private_i(idx) + x > private_j(idx) + y) {
+                            // upper triangular matrix
+                            atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
+                            ret_jx += temp * d_[private_i(idx) + y];
+                        } else if (private_i(idx) + x == private_j(idx) + y) {
+                            // diagonal
+                            if (device_ == 0) {
+                                ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
+                            } else {
+                                ret_jx += temp * d_[private_i(idx) + y];
+                            }
+                        }
+                    }
+                    atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
+                }
+            }
+        });
+    }
+
+  private:
+    const real_type *q_;
+    real_type *ret_;
+    const real_type *d_;
+    const real_type *data_d_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const kernel_index_type num_rows_;
+    const kernel_index_type feature_range_;
+    const real_type add_;
+    const kernel_index_type device_;
+};
+
+/**
+ * @brief Calculates the C-SVM kernel using the hierarchical formulation and the polynomial kernel function.
+ * @details Currently only single GPU execution is supported.
+ * @tparam T the type of the data
+ */
+template <typename T>
+class hierarchical_device_kernel_poly {
+  public:
+    /// The type of the data.
+    using real_type = T;
+
+    /**
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel.
+ * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimension data matrix + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ + hierarchical_device_kernel_poly(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : + q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class) + * identifying an instance of the currently execution work-group + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // allocate memory for work-item local variables + // -> accessible across different 'parallel_for_work_item' invocations + ::sycl::private_memory private_matr{ group }; + ::sycl::private_memory private_data_j{ group }; + ::sycl::private_memory private_i{ group }; + ::sycl::private_memory private_j{ group }; + ::sycl::private_memory private_cond{ group }; + + // initialize private variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // indices and diagonal condition + private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + private_cond(idx) = private_i(idx) >= private_j(idx); + if (private_cond(idx)) { + private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; + private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; + } + + // matrix + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + private_matr(idx)[i][j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // load data from global in shared memory + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + if (private_cond(idx)) { + #pragma unroll INTERNAL_BLOCK_SIZE + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE; + if (idx.get_local_id(1) == idx_1) { + data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)]; + } + const std::size_t idx_2 = block_id % 
+                        if (idx.get_local_id(0) == idx_2) {
+                            data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+
+            // load data from shared in private memory and perform scalar product
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                        private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                    }
+
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                        const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                            private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+        }
+
+        // kernel function
+        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+            if (private_cond(idx)) {
+                #pragma unroll INTERNAL_BLOCK_SIZE
+                for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                    real_type ret_jx = 0.0;
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                        const real_type temp = (::sycl::pow(gamma_ * private_matr(idx)[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
+                        if (private_i(idx) + x > private_j(idx) + y) {
+                            // upper triangular matrix
+                            atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
+                            ret_jx += temp * d_[private_i(idx) + y];
+                        } else if (private_i(idx) + x == private_j(idx) + y) {
+                            // diagonal
+                            ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
+                        }
+                    }
+                    atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
+                }
+            }
+        });
+    }
+
+  private:
+    const real_type *q_;
+    real_type *ret_;
+    const real_type *d_;
+    const real_type *data_d_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const kernel_index_type num_rows_;
+    const kernel_index_type num_cols_;
+    const real_type add_;
+    const int degree_;
+    const real_type gamma_;
+    const real_type coef0_;
+};
+
+/**
+ * @brief Calculates the C-SVM kernel using the hierarchical formulation and the radial basis functions kernel function.
+ * @details Currently only single GPU execution is supported.
+ * @tparam T the type of the data
+ */
+template <typename T>
+class hierarchical_device_kernel_radial {
+  public:
+    /// The type of the data.
+    using real_type = T;
+
+    /**
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions kernel function.
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimension data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] num_cols the number of rows in the data matrix
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] gamma the gamma parameter used in the rbf kernel function
+     */
+    hierarchical_device_kernel_radial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
+        q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] group the [`sycl::group`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#group-class)
+     *                  identifying an instance of the currently executing work-group
+     */
+    void operator()(::sycl::group<2> group) const {
+        // allocate shared memory
+        real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+        real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+        // allocate memory for work-item local variables
+        // -> accessible across different 'parallel_for_work_item' invocations
+        ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
+        ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
+        ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
+        ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
+        ::sycl::private_memory<bool, 2> private_cond{ group };
+
+        // initialize private variables
+        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+            // indices and diagonal condition
+            private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
+            private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
+            private_cond(idx) = private_i(idx) >= private_j(idx);
+            if (private_cond(idx)) {
+                private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
+                private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
+            }
+
+            // matrix
+            #pragma unroll INTERNAL_BLOCK_SIZE
+            for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
+                #pragma unroll INTERNAL_BLOCK_SIZE
+                for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+                    private_matr(idx)[i][j] = real_type{ 0.0 };
+                }
+            }
+        });
+
+        // implicit group barrier
+
+        // load data from global in shared memory
+        for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
+                        const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
+                        if (idx.get_local_id(1) == idx_1) {
+                            data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)];
+                        }
+                        const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
+                        if (idx.get_local_id(0) == idx_2) {
+                            data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+
+            // load data from shared in private memory and perform scalar product
+            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+                if (private_cond(idx)) {
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
+                        private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
+                    }
+
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
+                        const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
+                        #pragma unroll INTERNAL_BLOCK_SIZE
+                        for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
+                            private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]);
+                        }
+                    }
+                }
+            });
+
+            // implicit group barrier
+        }
+
+        // kernel function
+        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
+            if (private_cond(idx)) {
+                #pragma unroll INTERNAL_BLOCK_SIZE
+                for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
+                    real_type ret_jx = 0.0;
+                    #pragma unroll INTERNAL_BLOCK_SIZE
+                    for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
+                        const real_type temp = (::sycl::exp(-gamma_ * private_matr(idx)[x][y]) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
+                        if (private_i(idx) + x > private_j(idx) + y) {
+                            // upper triangular matrix
+                            atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
+                            ret_jx += temp * d_[private_i(idx) + y];
+                        } else if (private_i(idx) + x == private_j(idx) + y) {
+                            // diagonal
+                            ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
+                        }
+                    }
+                    atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
+                }
+            }
+        });
+    }
+
+  private:
+    const real_type *q_;
+    real_type *ret_;
+    const real_type *d_;
+    const real_type *data_d_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const kernel_index_type num_rows_;
+    const kernel_index_type num_cols_;
+    const real_type add_;
+    const real_type gamma_;
+};
+
+}  // namespace plssvm::sycl
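The kernels in this header lean on two guarantees of SYCL's hierarchical parallelism: consecutive `parallel_for_work_item` scopes are separated by an implicit group barrier, and a `::sycl::private_memory` value persists per work-item across those scopes. A self-contained sketch of that pattern (illustrative only, assuming a default-constructed queue; not part of this diff):

    #include <sycl/sycl.hpp>

    int main() {
        ::sycl::queue queue;
        queue.submit([&](::sycl::handler &cgh) {
            // 4 work-groups with 8 work-items each
            cgh.parallel_for_work_group(::sycl::range<1>{ 4 }, ::sycl::range<1>{ 8 }, [=](::sycl::group<1> group) {
                // one value per work-item, alive across both scopes below
                ::sycl::private_memory<int, 1> value{ group };
                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    value(idx) = static_cast<int>(idx.get_local_id(0));
                });
                // implicit group barrier here
                group.parallel_for_work_item([&](::sycl::h_item<1> idx) {
                    value(idx) += 1;  // reads the value written in the previous scope
                });
            });
        }).wait();
        return 0;
    }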
diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp
similarity index 67%
rename from include/plssvm/backends/SYCL/svm_kernel.hpp
rename to include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp
index a0b1a669b..634e1b9bb 100644
--- a/include/plssvm/backends/SYCL/svm_kernel.hpp
+++ b/include/plssvm/backends/SYCL/svm_kernel_nd_range.hpp
@@ -1,16 +1,17 @@
 /**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Defines the kernel functions for the C-SVM using the SYCL backend.
- */
+* @file
+* @author Alexander Van Craen
+* @author Marcel Breyer
+* @copyright 2018-today The PLSSVM project - All Rights Reserved
+* @license This file is part of the PLSSVM project which is released under the MIT license.
+*          See the LICENSE.md file in the project root for full license information.
+*
+* @brief Defines the kernel functions for the C-SVM in the nd_range formulation using the SYCL backend.
+*/

 #pragma once

+#include "plssvm/backends/SYCL/detail/atomics.hpp"    // plssvm::sycl::atomic_op
 #include "plssvm/backends/SYCL/detail/constants.hpp"  // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL
 #include "plssvm/constants.hpp"                       // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE

@@ -23,45 +24,45 @@ namespace plssvm::sycl {

 // TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations
 /**
- * @brief Shortcut alias for a SYCL local accessor.
- * @tparam T the type of the accessed values
- */
+* @brief Shortcut alias for a SYCL local accessor.
+* @tparam T the type of the accessed values
+*/
 template <typename T>
 using local_accessor = ::sycl::accessor<T, 2, ::sycl::access::mode::read_write, ::sycl::access::target::local>;

 /**
- * @brief Calculates the C-SVM kernel using the linear kernel function.
- * @details Supports multi-GPU execution.
- * @tparam T the type of the data
- */
+* @brief Calculates the C-SVM kernel using the nd_range formulation and the linear kernel function.
+* @details Supports multi-GPU execution.
+* @tparam T the type of the data
+*/
 template <typename T>
-class device_kernel_linear {
+class nd_range_device_kernel_linear {
   public:
     /// The type of the data.
     using real_type = T;

     /**
-     * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
-     * @param[in] q the `q` vector
-     * @param[out] ret the result vector
-     * @param[in] d the right-hand side of the equation
-     * @param[in] data_d the one-dimension data matrix
-     * @param[in] QA_cost he bottom right matrix entry multiplied by cost
-     * @param[in] cost 1 / the cost parameter in the C-SVM
-     * @param[in] num_rows the number of columns in the data matrix
-     * @param[in] feature_range number of features used for the calculation on the device @p id
-     * @param[in] add denotes whether the values are added or subtracted from the result vector
-     * @param[in] id the id of the device
-     */
-    device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel.
+     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimension data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] feature_range number of features used for the calculation on the device @p id
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] id the id of the device
+     */
+    nd_range_device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
         data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}

     /**
-     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
-     */
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
+     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     */
     void operator()(::sycl::nd_item<2> nd_idx) const {
         kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
         kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
@@ -150,40 +151,40 @@ class device_kernel_linear {
 };

 /**
- * @brief Calculates the C-SVM kernel using the polynomial kernel function.
- * @details Currently only single GPU execution is supported.
- * @tparam T the type of the data
- */
+* @brief Calculates the C-SVM kernel using the nd_range formulation and the polynomial kernel function.
+* @details Currently only single GPU execution is supported.
+* @tparam T the type of the data
+*/
 template <typename T>
-class device_kernel_poly {
+class nd_range_device_kernel_poly {
   public:
     /// The type of the data.
     using real_type = T;

     /**
-     * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
-     * @param[in] q the `q` vector
-     * @param[out] ret the result vector
-     * @param[in] d the right-hand side of the equation
-     * @param[in] data_d the one-dimension data matrix
-     * @param[in] QA_cost he bottom right matrix entry multiplied by cost
-     * @param[in] cost 1 / the cost parameter in the C-SVM
-     * @param[in] num_rows the number of columns in the data matrix
-     * @param[in] num_cols the number of rows in the data matrix
-     * @param[in] add denotes whether the values are added or subtracted from the result vector
-     * @param[in] degree the degree parameter used in the polynomial kernel function
-     * @param[in] gamma the gamma parameter used in the polynomial kernel function
-     * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
-     */
-    device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel.
+     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimension data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] num_cols the number of rows in the data matrix
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] degree the degree parameter used in the polynomial kernel function
+     * @param[in] gamma the gamma parameter used in the polynomial kernel function
+     * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
+     */
+    nd_range_device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
         data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}

     /**
-     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
-     */
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
+     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     */
     void operator()(::sycl::nd_item<2> nd_idx) const {
         kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
         kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
@@ -265,38 +266,38 @@ class device_kernel_poly {
 };

 /**
- * @brief Calculates the C-SVM kernel using the radial basis functions kernel function.
- * @details Currently only single GPU execution is supported.
- * @tparam T the type of the data
- */
+* @brief Calculates the C-SVM kernel using the nd_range formulation and the radial basis functions kernel function.
+* @details Currently only single GPU execution is supported.
+* @tparam T the type of the data
+*/
 template <typename T>
-class device_kernel_radial {
+class nd_range_device_kernel_radial {
   public:
     /// The type of the data.
     using real_type = T;

     /**
-     * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel.
-     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
-     * @param[in] q the `q` vector
-     * @param[out] ret the result vector
-     * @param[in] d the right-hand side of the equation
-     * @param[in] data_d the one-dimension data matrix
-     * @param[in] QA_cost he bottom right matrix entry multiplied by cost
-     * @param[in] cost 1 / the cost parameter in the C-SVM
-     * @param[in] num_rows the number of columns in the data matrix
-     * @param[in] num_cols the number of rows in the data matrix
-     * @param[in] add denotes whether the values are added or subtracted from the result vector
-     * @param[in] gamma the gamma parameter used in the rbf kernel function
-     */
-    device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
+     * @brief Construct a new device kernel calculating the C-SVM kernel using the radial basis functions C-SVM kernel.
+     * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory
+     * @param[in] q the `q` vector
+     * @param[out] ret the result vector
+     * @param[in] d the right-hand side of the equation
+     * @param[in] data_d the one-dimension data matrix
+     * @param[in] QA_cost the bottom right matrix entry multiplied by cost
+     * @param[in] cost 1 / the cost parameter in the C-SVM
+     * @param[in] num_rows the number of columns in the data matrix
+     * @param[in] num_cols the number of rows in the data matrix
+     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @param[in] gamma the gamma parameter used in the rbf kernel function
+     */
+    nd_range_device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
         data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}

     /**
-     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
-     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
-     */
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] nd_idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class)
+     *                   identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
+     */
     void operator()(::sycl::nd_item<2> nd_idx) const {
         kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
         kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp
index 28e2c5d77..62336def6 100644
--- a/include/plssvm/core.hpp
+++ b/include/plssvm/core.hpp
@@ -25,6 +25,8 @@
 #include "plssvm/exceptions/exceptions.hpp"
 #include "plssvm/version/version.hpp"

+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"
+
 /// The main namespace containing all public API functions.
 namespace plssvm {}
diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp
index ccc102124..47d9ee323 100644
--- a/include/plssvm/parameter.hpp
+++ b/include/plssvm/parameter.hpp
@@ -11,9 +11,10 @@

 #pragma once

-#include "plssvm/backend_types.hpp"     // plssvm::backend_type
-#include "plssvm/kernel_types.hpp"      // plssvm::kernel_type
-#include "plssvm/target_platforms.hpp"  // plssvm::target_platform
+#include "plssvm/backend_types.hpp"                         // plssvm::backend_type
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/target_platforms.hpp"                      // plssvm::target_platform

 #include <iosfwd>  // forward declare std::ostream
 #include <memory>  // std::shared_ptr
@@ -191,6 +192,9 @@ class parameter {
     /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD or Intel.
     target_platform target = target_platform::automatic;

+    /// The kernel invocation type when using SYCL as backend.
+    sycl::kernel_invocation_type sycl_kernel_invocation_type = sycl::kernel_invocation_type::automatic;
+
     /// The name of the data/test file to parse.
     std::string input_filename{};
     /// The name of the model file to write the learned support vectors to/to parse the saved model from.
diff --git a/include/plssvm/parameter_train.hpp b/include/plssvm/parameter_train.hpp
index 223b9c9d6..9bb4903dc 100644
--- a/include/plssvm/parameter_train.hpp
+++ b/include/plssvm/parameter_train.hpp
@@ -36,6 +36,7 @@ class parameter_train : public parameter<T> {
     using base_type::kernel;
     using base_type::print_info;
     using base_type::target;
+    using base_type::sycl_kernel_invocation_type;

     using base_type::input_filename;
     using base_type::model_filename;
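The two headers above make the new setting part of the public parameter API; a minimal usage sketch (assuming the SYCL backend is enabled at build time):

    #include "plssvm/core.hpp"

    int main() {
        plssvm::parameter<double> params;
        params.backend = plssvm::backend_type::sycl;
        // override the automatic choice and force the hierarchical formulation
        params.sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::hierarchical;
        return 0;
    }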
diff --git a/install/python_requirements.txt b/install/python_requirements.txt
new file mode 100644
index 000000000..4e86a4e9a
--- /dev/null
+++ b/install/python_requirements.txt
@@ -0,0 +1,11 @@
+### optional and required python packages
+argparse
+sklearn
+py-cpuinfo
+GPUtil
+pyamdgpuinfo
+pylspci
+numpy
+pandas
+arff
+matplotlib
\ No newline at end of file
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index e83442ad9..d4b73f900 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -17,7 +17,7 @@
 #include <chrono>     // std::chrono
 #include <exception>  // std::exception
 #include <fstream>    // std::ofstream
-#include <iostream>   // std::cerr, std::endl
+#include <iostream>   // std::cerr, std::clog, std::endl
 #include <vector>     // std::vector

 // perform calculations in single precision if requested
diff --git a/src/main_train.cpp b/src/main_train.cpp
index a14ae55a1..62e1e2ddf 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -10,10 +10,11 @@

 #include "plssvm/core.hpp"

+#include "fmt/core.h"     // fmt::format
 #include "fmt/ostream.h"  // use operator<< to output enum class

 #include <exception>  // std::exception
-#include <iostream>   // std::cerr, std::endl
+#include <iostream>   // std::cerr, std::clog, std::endl

 // perform calculations in single precision if requested
 #ifdef PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION
@@ -27,6 +28,14 @@ int main(int argc, char *argv[]) {
     // parse SVM parameter from command line
     plssvm::parameter_train params{ argc, argv };

+    // warn if the kernel invocation type nd_range or hierarchical is explicitly set but SYCL isn't the current backend
+    if (params.backend != plssvm::backend_type::sycl && params.sycl_kernel_invocation_type != plssvm::sycl::kernel_invocation_type::automatic) {
+        std::clog << fmt::format(
+                         "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}",
+                         params.sycl_kernel_invocation_type)
+                  << std::endl;
+    }
+
     // output used parameter
     if (params.print_info) {
         fmt::print("\n");
diff --git a/src/plssvm/backends/OpenCL/CMakeLists.txt b/src/plssvm/backends/OpenCL/CMakeLists.txt
index cd28a2913..0273ca652 100644
--- a/src/plssvm/backends/OpenCL/CMakeLists.txt
+++ b/src/plssvm/backends/OpenCL/CMakeLists.txt
@@ -62,4 +62,9 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE)
 set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER "  - OpenCL:")
 include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake)
 assemble_summary_string(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS)
+# do not print any special target architecture information
+string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_NVIDIA_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_AMD_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}")
 set(PLSSVM_OPENCL_BACKEND_SUMMARY_STRING "${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_OPENCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE)
\ No newline at end of file
diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp
index 71381a86e..74a2be02b 100644
--- a/src/plssvm/backends/OpenCL/csvm.cpp
+++ b/src/plssvm/backends/OpenCL/csvm.cpp
@@ -21,11 +21,14 @@
 #include "plssvm/parameter.hpp"         // plssvm::parameter
 #include "plssvm/target_platforms.hpp"  // plssvm::target_platform

+#include "fmt/chrono.h"   // can directly print std::chrono literals
 #include "fmt/core.h"     // fmt::print, fmt::format
 #include "fmt/ostream.h"  // can use fmt using operator<< overloads

+#include <chrono>     // std::chrono
 #include <exception>  // std::terminate
 #include <string>     // std::string
+#include <tuple>      // std::tie
 #include <utility>    // std::pair, std::make_pair, std::move
 #include <vector>     // std::vector

@@ -60,14 +63,18 @@ csvm<T>::csvm(const parameter<T> &params) :
             break;
     }

+    // get all available devices wrt the requested target platform
+    target_platform used_target;
+    std::tie(devices_, used_target) = detail::get_command_queues(target_);
+    devices_.resize(std::min(devices_.size(), num_features_));
+
     if (print_info_) {
         fmt::print("Using OpenCL as backend.\n");
+        if (target_ == target_platform::automatic) {
+            fmt::print("Using {} as automatic target platform.\n", used_target);
+        }
     }

-    // get all available devices wrt the requested target platform
-    devices_ = detail::get_command_queues(target_);
-    devices_.resize(std::min(devices_.size(), num_features_));
-
     // throw exception if no devices for the requested target could be found
     if (devices_.empty()) {
         throw backend_exception{ fmt::format("OpenCL backend selected but no devices for the target {} were found!", target_) };
@@ -91,6 +98,8 @@ csvm<T>::csvm(const parameter<T> &params) :
         fmt::print("\n");
     }

+    auto jit_start_time = std::chrono::steady_clock::now();
+
     // get kernel names
     std::pair<std::string, std::string> kernel_names = detail::kernel_type_to_function_name(kernel_);
     // build necessary kernel
@@ -110,6 +119,11 @@ csvm<T>::csvm(const parameter<T> &params) :
             break;
     }

+    auto jit_end_time = std::chrono::steady_clock::now();
+    if (print_info_) {
+        fmt::print("OpenCL kernel JIT compilation done in {}.\n", std::chrono::duration_cast<std::chrono::milliseconds>(jit_end_time - jit_start_time));
+    }
+
     // sanity checks for the number of OpenCL kernels
     PLSSVM_ASSERT(devices_.size() == q_kernel_.size(), fmt::format("Number of kernels for the q kernel ({}) must match the number of devices ({})!", q_kernel_.size(), devices_.size()));
     PLSSVM_ASSERT(devices_.size() == svm_kernel_.size(), fmt::format("Number of kernels for the svm kernel ({}) must match the number of devices ({})!", svm_kernel_.size(), devices_.size()));
diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp
index 1ba75d35e..afb4a7d0f 100644
--- a/src/plssvm/backends/OpenCL/detail/utility.cpp
+++ b/src/plssvm/backends/OpenCL/detail/utility.cpp
@@ -45,7 +45,7 @@ void device_assert(const error_code ec, const std::string_view msg) {
     }
 }

-std::vector<command_queue> get_command_queues_impl(const target_platform target) {
+[[nodiscard]] std::vector<command_queue> get_command_queues_impl(const target_platform target) {
     std::map<cl_platform_id, std::vector<cl_device_id>> platform_devices;

     // get number of platforms
@@ -124,21 +124,25 @@ std::vector<command_queue> get_command_queues_impl(const target_platform target)
     return command_queues;
 }

-std::vector<command_queue> get_command_queues(const target_platform target) {
+std::pair<std::vector<command_queue>, target_platform> get_command_queues(const target_platform target) {
     if (target != target_platform::automatic) {
-        return get_command_queues_impl(target);
+        return std::make_pair(get_command_queues_impl(target), target);
     } else {
-        std::vector<command_queue> target_devices = get_command_queues_impl(target_platform::gpu_nvidia);
+        target_platform used_target = target_platform::gpu_nvidia;
+        std::vector<command_queue> target_devices = get_command_queues_impl(used_target);
         if (target_devices.empty()) {
-            target_devices = get_command_queues_impl(target_platform::gpu_amd);
+            used_target = target_platform::gpu_amd;
+            target_devices = get_command_queues_impl(used_target);
             if (target_devices.empty()) {
-                target_devices = get_command_queues_impl(target_platform::gpu_intel);
+                used_target = target_platform::gpu_intel;
+                target_devices = get_command_queues_impl(used_target);
                 if (target_devices.empty()) {
-                    target_devices = get_command_queues_impl(target_platform::cpu);
+                    used_target = target_platform::cpu;
+                    target_devices = get_command_queues_impl(used_target);
                 }
             }
         }
-        return target_devices;
+        return std::make_pair(std::move(target_devices), used_target);
     }
 }
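A sketch of how the new pair-returning interface might be consumed; the `plssvm::opencl::detail` namespace is an assumption, and the member function in csvm.cpp above uses `std::tie` instead because it assigns to class members:

    // with target_platform::automatic the implementation above tries
    // gpu_nvidia -> gpu_amd -> gpu_intel -> cpu and reports the platform it settled on
    auto [command_queues, used_target] = plssvm::opencl::detail::get_command_queues(plssvm::target_platform::automatic);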
backend") # reformat PLSSVM_TARGET_PLATFORMS to be usable with HIPSYCL_TARGETS (in case hipSYCL may be available) -set(HIPSYCL_TARGETS ${PLSSVM_TARGET_PLATFORMS} CACHE STRING "" FORCE) +set(HIPSYCL_TARGETS "${PLSSVM_TARGET_PLATFORMS}" CACHE STRING "" FORCE) list(TRANSFORM HIPSYCL_TARGETS REPLACE "cpu" "omp") list(TRANSFORM HIPSYCL_TARGETS REPLACE "nvidia" "cuda") list(TRANSFORM HIPSYCL_TARGETS REPLACE "amd" "hip") list(TRANSFORM HIPSYCL_TARGETS REPLACE "intel" "spirv") +# remove CPU and Intel GPU target architectures since they are not supported when using hipSYCL +if(DEFINED PLSSVM_CPU_TARGET_ARCHS AND PLSSVM_NUM_CPU_TARGET_ARCHS GREATER 0) + string(REPLACE ";" "," PLSSVM_CPU_TARGET_ARCHS_COMMA "${PLSSVM_CPU_TARGET_ARCHS}") + string(REPLACE ":${PLSSVM_CPU_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() +if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + string(REPLACE ";" "," PLSSVM_INTEL_TARGET_ARCHS_COMMA "${PLSSVM_INTEL_TARGET_ARCHS}") + string(REPLACE ":${PLSSVM_INTEL_TARGET_ARCHS_COMMA}" "" HIPSYCL_TARGETS "${HIPSYCL_TARGETS}") +endif() # check if hipSYCL is used as SYCL compiler find_package(hipSYCL CONFIG) if(hipSYCL_FOUND) set(PLSSVM_SYCL_BACKEND_COMPILER "hipSYCL" CACHE STRING "" FORCE) message(CHECK_PASS "found hipSYCL") + message(STATUS "Setting HIPSYCL_TARGETS to \"${HIPSYCL_TARGETS}\".") else() # if not, check if DPC++ is used instead try_compile(PLSSVM_SYCL_BACKEND_CHECK_FOR_DPCPP_COMPILER @@ -59,25 +69,65 @@ if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL") target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=1) # silence unknown options warnings target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Wno-unknown-warning-option) + + # print note that Intel GPU architecture specifications are ignored when using hipSYCL + if(DEFINED PLSSVM_INTEL_TARGET_ARCHS) + message(STATUS "Ignoring specified Intel architectures \"${PLSSVM_INTEL_TARGET_ARCHS}\" in favor of SPIR-V when using hipSYCL!") + endif() elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++") + # TODO: remove if DPC++ bug is fixed + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message(FATAL_ERROR "The SYCL backend when using DPC++ currently does not support the Debug build type. 
For more information see: https://github.com/intel/llvm/issues/5754")
+    endif()
+
     # enable DPC++ SYCL support
-    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl)
     target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl)

+    set(PLSSVM_DPCPP_FSYCL_TARGETS "")
+    # cpu targets
+    if(DEFINED PLSSVM_CPU_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_x86_64")
+        # add target specific flags for AOT
+        if(PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1)
+            target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+            target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+        endif()
+    endif()
     # nvidia targets
     if(DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
-        target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda)
-        target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=nvptx64-nvidia-cuda)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "nvptx64-nvidia-cuda")
+        # add target specific flags for AOT
+        list(JOIN PLSSVM_NVIDIA_TARGET_ARCHS "," PLSSVM_NVIDIA_TARGET_ARCHS_STRING)
+        target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}")
+        target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda "-cuda-gpu-arch=${PLSSVM_NVIDIA_TARGET_ARCHS_STRING}")
     endif()
     # amd targets
     if(DEFINED PLSSVM_AMD_TARGET_ARCHS)
-        target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa)
-        target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=amdgcn-amd-amdhsa)
-        foreach(PLSSVM_AMD_TARGET_ARCH_NAME ${PLSSVM_AMD_TARGET_ARCHS})
-            target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME})
-            target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCH_NAME})
-        endforeach()
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "amdgcn-amd-amdhsa")
+        # add target specific flags for AOT
+        if(NOT PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 1)
+            message(FATAL_ERROR "DPC++ currently only supports a single AMD architecture specification but ${PLSSVM_NUM_AMD_TARGET_ARCHS} were provided!")
+        endif()
+        target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS})
+        target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS})
     endif()
+    # intel targets
+    if(DEFINED PLSSVM_INTEL_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_DPCPP_FSYCL_TARGETS "spir64_gen")
+        # add target specific flags for AOT
+        list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING)
+        target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
+        target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
+    endif()
+    # set -fsycl-targets
+    list(JOIN PLSSVM_DPCPP_FSYCL_TARGETS "," PLSSVM_DPCPP_FSYCL_TARGETS_STRING)
+    target_compile_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING})
+    target_link_options(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE -fsycl-targets=${PLSSVM_DPCPP_FSYCL_TARGETS_STRING})

     # set backend compiler to DPC++ (= 0)
     target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_COMPILER=0)
@@ -85,7 +135,7 @@ elseif("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "DPC++")
     # be able to choose between the Level-Zero and OpenCL DPC++ backend
     option(PLSSVM_SYCL_DPCPP_USE_LEVEL_ZERO "Enable Level-Zero backend in favor of OpenCL when using DPC++." OFF)
     if(PLSSVM_SYCL_BACKEND_DPCPP_USE_LEVEL_ZERO)
-        target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_DPCPP_BACKEND_TYPE="level-zero")
+        target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_DPCPP_BACKEND_TYPE="level_zero")
     else()
         target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_BACKEND_DPCPP_BACKEND_TYPE="opencl")
     endif()
@@ -97,6 +147,7 @@ target_link_libraries(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_L

 # set compile definition that the SYCL backend is available
 target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_HAS_SYCL_BACKEND)
+target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_SYCL_BACKEND_COMPILER_NAME="${PLSSVM_SYCL_BACKEND_COMPILER}")

 # link against interface library
 target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME})
@@ -110,5 +161,10 @@ set(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_TARGETS_TO_INSTALL} PARENT_SCOPE)
 set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER "  - SYCL (${PLSSVM_SYCL_BACKEND_COMPILER}):")
 include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake)
 assemble_summary_string(PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS)
+# do not print CPU and Intel GPU target architectures when using hipSYCL
+if("${PLSSVM_SYCL_BACKEND_COMPILER}" STREQUAL "hipSYCL")
+    string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}")
+    string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}")
+endif()
 set(PLSSVM_SYCL_BACKEND_SUMMARY_STRING "${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_SYCL_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE)
diff --git a/src/plssvm/backends/SYCL/csvm.cpp b/src/plssvm/backends/SYCL/csvm.cpp
index 4a8777dec..09e18f17d 100644
--- a/src/plssvm/backends/SYCL/csvm.cpp
+++ b/src/plssvm/backends/SYCL/csvm.cpp
@@ -8,20 +8,21 @@

 #include "plssvm/backends/SYCL/csvm.hpp"

-#include "plssvm/backends/SYCL/detail/device_ptr.hpp"  // plssvm::detail::sycl::device_ptr
-#include "plssvm/backends/SYCL/detail/utility.hpp"     // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize
-#include "plssvm/backends/SYCL/exceptions.hpp"         // plssvm::sycl::backend_exception
-#include "plssvm/backends/SYCL/predict_kernel.hpp"     // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf
-#include "plssvm/backends/SYCL/q_kernel.hpp"           // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial
-#include "plssvm/backends/SYCL/svm_kernel.hpp"         // plssvm::sycl::device_kernel_linear, plssvm::sycl::device_kernel_poly, plssvm::sycl::device_kernel_radial
-#include "plssvm/backends/gpu_csvm.hpp"                // plssvm::detail::gpu_csvm
-#include "plssvm/constants.hpp"                        // plssvm::kernel_index_type
-#include "plssvm/detail/assert.hpp"                    // PLSSVM_ASSERT
-#include "plssvm/detail/execution_range.hpp"           // plssvm::detail::execution_range
-#include "plssvm/exceptions/exceptions.hpp"            // plssvm::exception
-#include "plssvm/kernel_types.hpp"                     // plssvm::kernel_type
-#include "plssvm/parameter.hpp"                        // plssvm::parameter
-#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+#include "plssvm/backends/SYCL/detail/device_ptr.hpp"       // plssvm::detail::sycl::device_ptr
+#include "plssvm/backends/SYCL/detail/utility.hpp"          // plssvm::detail::sycl::get_device_list, plssvm::detail::sycl::device_synchronize
+#include "plssvm/backends/SYCL/exceptions.hpp"              // plssvm::sycl::backend_exception
+#include "plssvm/backends/SYCL/predict_kernel.hpp"          // plssvm::sycl::kernel_w, plssvm::sycl::predict_points_poly, plssvm::sycl::predict_points_rbf
+#include "plssvm/backends/SYCL/q_kernel.hpp"                // plssvm::sycl::device_kernel_q_linear, plssvm::sycl::device_kernel_q_poly, plssvm::sycl::device_kernel_q_radial
+#include "plssvm/backends/SYCL/svm_kernel_hierarchical.hpp" // plssvm::sycl::hierarchical_device_kernel_linear, plssvm::sycl::hierarchical_device_kernel_poly, plssvm::sycl::hierarchical_device_kernel_radial
+#include "plssvm/backends/SYCL/svm_kernel_nd_range.hpp"     // plssvm::sycl::nd_range_device_kernel_linear, plssvm::sycl::nd_range_device_kernel_poly, plssvm::sycl::nd_range_device_kernel_radial
+#include "plssvm/backends/gpu_csvm.hpp"                     // plssvm::detail::gpu_csvm
+#include "plssvm/constants.hpp"                             // plssvm::kernel_index_type
+#include "plssvm/detail/assert.hpp"                         // PLSSVM_ASSERT
+#include "plssvm/detail/execution_range.hpp"                // plssvm::detail::execution_range
+#include "plssvm/exceptions/exceptions.hpp"                 // plssvm::exception
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
+#include "plssvm/target_platforms.hpp"                      // plssvm::target_platform

 #include "fmt/core.h"     // fmt::print, fmt::format
 #include "fmt/ostream.h"  // can use fmt using operator<< overloads
@@ -29,13 +30,14 @@

 #include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
+#include <tuple>      // std::tie
 #include <vector>     // std::vector

 namespace plssvm::sycl {

 template <typename T>
 csvm<T>::csvm(const parameter<T> &params) :
-    base_type{ params } {
+    base_type{ params }, invocation_type_{ params.sycl_kernel_invocation_type } {
     // check whether the requested target platform has been enabled
     switch (target_) {
         case target_platform::automatic:
@@ -62,14 +64,28 @@ csvm<T>::csvm(const parameter<T> &params) :
             break;
     }

-    if (print_info_) {
-        fmt::print("Using SYCL as backend.\n");
-    }
-
     // get all available devices wrt the requested target platform
-    devices_ = detail::get_device_list(target_);
+    target_platform used_target;
+    std::tie(devices_, used_target) = detail::get_device_list(target_);
     devices_.resize(std::min(devices_.size(), num_features_));

+    // set correct kernel invocation type if "automatic" has been provided
+    if (invocation_type_ == kernel_invocation_type::automatic) {
+        // always use nd_range except for hipSYCL on the CPU
+        if (used_target == target_platform::cpu && PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL) {
+            invocation_type_ = kernel_invocation_type::hierarchical;
+        } else {
+            invocation_type_ = kernel_invocation_type::nd_range;
+        }
+    }
+
+    if (print_info_) {
+        fmt::print("Using SYCL ({}) as backend with the kernel invocation type \"{}\" for the svm_kernel.\n", PLSSVM_SYCL_BACKEND_COMPILER_NAME, invocation_type_);
+        if (target_ == target_platform::automatic) {
+            fmt::print("Using {} as automatic target platform.\n", used_target);
+        }
+    }
+
     // throw exception if no devices for the requested target could be found
     if (devices_.empty()) {
         throw backend_exception{ fmt::format("SYCL backend selected but no devices for the target {} were found!", target_) };
@@ -113,17 +129,31 @@ void csvm<T>::device_synchronize(queue_type &queue) {
 }

 template <std::size_t I>
-::sycl::nd_range<I> execution_range_to_native(const ::plssvm::detail::execution_range &range) {
+::sycl::nd_range<I> execution_range_to_native(const ::plssvm::detail::execution_range &range, const kernel_invocation_type invocation_type) {
+    PLSSVM_ASSERT(invocation_type != kernel_invocation_type::automatic, "The SYCL kernel invocation type may not be automatic anymore at this point!");
+
+    // set grid value based on used kernel invocation type
+    const auto fill_grid = [&](const std::size_t i) {
+        switch (invocation_type) {
+            case kernel_invocation_type::nd_range:
+                return range.grid[i] * range.block[i];
+            case kernel_invocation_type::hierarchical:
+                return range.grid[i];
+            case kernel_invocation_type::automatic:
+                throw backend_exception{ "Can't create native execution range from kernel invocation type automatic!" };
+        }
+    };
+
     if constexpr (I == 1) {
-        ::sycl::range<1> grid{ range.grid[0] * range.block[0] };
+        ::sycl::range<1> grid{ fill_grid(0) };
         ::sycl::range<1> block{ range.block[0] };
         return ::sycl::nd_range<1>{ grid, block };
     } else if constexpr (I == 2) {
-        ::sycl::range<2> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1] };
+        ::sycl::range<2> grid{ fill_grid(0), fill_grid(1) };
         ::sycl::range<2> block{ range.block[0], range.block[1] };
         return ::sycl::nd_range<2>{ grid, block };
     } else if constexpr (I == 3) {
-        ::sycl::range<3> grid{ range.grid[0] * range.block[0], range.grid[1] * range.block[1], range.grid[2] * range.block[2] };
+        ::sycl::range<3> grid{ fill_grid(0), fill_grid(1), fill_grid(2) };
         ::sycl::range<3> block{ range.block[0], range.block[1], range.block[2] };
         return ::sycl::nd_range<3>{ grid, block };
     } else {
@@ -132,61 +162,72 @@ ::sycl::nd_range<I> execution_range_to_native(const ::plssvm::detail::execution_
 }

 template <typename T>
-void csvm<T>::run_q_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, const std::size_t num_features) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<1>(range);
+void csvm<T>::run_q_kernel(const std::size_t device, [[maybe_unused]] const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, const std::size_t num_features) {
     switch (kernel_) {
         case kernel_type::linear:
-            devices_[device].parallel_for(execution_range, device_kernel_q_linear(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_features));
+            devices_[device].parallel_for(::sycl::range<1>{ dept_ }, device_kernel_q_linear(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_features));
             break;
         case kernel_type::polynomial:
            PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
-            devices_[device].parallel_for(execution_range, device_kernel_q_poly(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_cols_, degree_, gamma_, coef0_));
+            devices_[device].parallel_for(::sycl::range<1>{ dept_ }, device_kernel_q_poly(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_cols_, degree_, gamma_, coef0_));
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
-            devices_[device].parallel_for(execution_range, device_kernel_q_radial(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_cols_, gamma_));
+            devices_[device].parallel_for(::sycl::range<1>{ dept_ }, device_kernel_q_radial(q_d.get(), data_d_[device].get(), data_last_d_[device].get(), num_rows_, num_cols_, gamma_));
             break;
     }
 }

 template <typename T>
 void csvm<T>::run_svm_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, const std::size_t num_features) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, invocation_type_);
     switch (kernel_) {
         case kernel_type::linear:
             devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device));
+                if (invocation_type_ == kernel_invocation_type::nd_range) {
+                    cgh.parallel_for(execution_range, nd_range_device_kernel_linear(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device));
+                } else if (invocation_type_ == kernel_invocation_type::hierarchical) {
+                    cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_linear(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_features, add, device));
+                }
             });
             break;
         case kernel_type::polynomial:
             PLSSVM_ASSERT(device == 0, "The polynomial kernel function currently only supports single GPU execution!");
             devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
+                if (invocation_type_ == kernel_invocation_type::nd_range) {
+                    cgh.parallel_for(execution_range, nd_range_device_kernel_poly(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
+                } else if (invocation_type_ == kernel_invocation_type::hierarchical) {
+                    cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_poly(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, degree_, gamma_, coef0_));
+                }
             });
             break;
         case kernel_type::rbf:
             PLSSVM_ASSERT(device == 0, "The radial basis function kernel function currently only supports single GPU execution!");
             devices_[device].submit([&](::sycl::handler &cgh) {
-                cgh.parallel_for(execution_range, device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_));
+                if (invocation_type_ == kernel_invocation_type::nd_range) {
+                    cgh.parallel_for(execution_range, nd_range_device_kernel_radial(cgh, q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_));
+                } else if (invocation_type_ == kernel_invocation_type::hierarchical) {
+                    cgh.parallel_for_work_group(execution_range.get_global_range(), execution_range.get_local_range(), hierarchical_device_kernel_radial(q_d.get(), r_d.get(), x_d.get(), data_d_[device].get(), QA_cost_, 1 / cost_, num_rows_, num_cols_, add, gamma_));
+                }
             });
             break;
     }
 }

 template <typename T>
-void csvm<T>::run_w_kernel(const std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, const std::size_t num_features) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<1>(range);
-    devices_[device].parallel_for(execution_range, device_kernel_w_linear(w_d.get(), data_d_[device].get(), data_last_d_[device].get(), alpha_d.get(), num_data_points_, num_features));
+void csvm<T>::run_w_kernel(const std::size_t device, [[maybe_unused]] const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, const std::size_t num_features) {
+    devices_[device].parallel_for(::sycl::range<1>{ num_features_ }, device_kernel_w_linear(w_d.get(), data_d_[device].get(), data_last_d_[device].get(), alpha_d.get(), num_data_points_, num_features));
 }

 template <typename T>
 void csvm<T>::run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, const std::size_t num_predict_points) {
-    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range);
+    const ::sycl::nd_range execution_range = execution_range_to_native<2>(range, kernel_invocation_type::nd_range);
     switch (kernel_) {
         case kernel_type::linear:
             break;
         case kernel_type::polynomial:
             devices_[0].parallel_for(execution_range, device_kernel_predict_poly(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, degree_, gamma_, coef0_));
+            break;
         case kernel_type::rbf:
             devices_[0].parallel_for(execution_range, device_kernel_predict_radial(out_d.get(), data_d_[0].get(), data_last_d_[0].get(), alpha_d.get(), num_data_points_, point_d.get(), num_predict_points, num_features_, gamma_));
diff --git a/src/plssvm/backends/SYCL/detail/utility.cpp b/src/plssvm/backends/SYCL/detail/utility.cpp
index cac051ef0..d47550582 100644
--- a/src/plssvm/backends/SYCL/detail/utility.cpp
+++ b/src/plssvm/backends/SYCL/detail/utility.cpp
@@ -14,8 +14,9 @@

 #include "sycl/sycl.hpp"  // sycl::queue, sycl::platform, sycl::device, sycl::property::queue, sycl::info, sycl::gpu_selector

-#include <string>  // std::string
-#include <vector>  // std::vector
+#include <string>   // std::string
+#include <utility>  // std::pair, std::make_pair
+#include <vector>   // std::vector

 namespace plssvm::sycl::detail {

@@ -70,21 +71,25 @@ namespace plssvm::sycl::detail {
     return target_devices;
 }

-[[nodiscard]] std::vector<::sycl::queue> get_device_list(const target_platform target) {
+std::pair<std::vector<::sycl::queue>, ::plssvm::target_platform> get_device_list(const target_platform target) {
     if (target != target_platform::automatic) {
-        return get_device_list_impl(target);
+        return std::make_pair(get_device_list_impl(target), target);
     } else {
-        std::vector<::sycl::queue> target_devices = get_device_list_impl(target_platform::gpu_nvidia);
+        target_platform used_target = target_platform::gpu_nvidia;
+        std::vector<::sycl::queue> target_devices = get_device_list_impl(used_target);
         if (target_devices.empty()) {
-            target_devices = get_device_list_impl(target_platform::gpu_amd);
+            used_target = target_platform::gpu_amd;
+            target_devices = get_device_list_impl(used_target);
             if (target_devices.empty()) {
-                target_devices = get_device_list_impl(target_platform::gpu_intel);
-                target_devices = get_device_list_impl(target_platform::gpu_intel);
+                used_target = target_platform::gpu_intel;
+                target_devices = get_device_list_impl(used_target);
                 if (target_devices.empty()) {
-                    target_devices = get_device_list_impl(target_platform::cpu);
+                    used_target = target_platform::cpu;
+                    target_devices = get_device_list_impl(used_target);
                 }
             }
         }
-        return target_devices;
+        return std::make_pair(std::move(target_devices), used_target);
     }
 }
diff --git a/src/plssvm/backends/SYCL/kernel_invocation_type.cpp b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp
new file mode 100644
index 000000000..89a8f2348
--- /dev/null
+++ b/src/plssvm/backends/SYCL/kernel_invocation_type.cpp
@@ -0,0 +1,49 @@
+/**
+* @author Alexander Van Craen
+* @author Marcel Breyer
+* @copyright 2018-today The PLSSVM project - All Rights Reserved
+* @license This file is part of the PLSSVM project which is released under the MIT license.
+*          See the LICENSE.md file in the project root for full license information.
+*/
+
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"
+
+#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
+
+#include <ios>      // std::ios::failbit
+#include <istream>  // std::istream
+#include <ostream>  // std::ostream
+#include <string>   // std::string
+
+namespace plssvm::sycl {
+
+std::ostream &operator<<(std::ostream &out, const kernel_invocation_type target) {
+    switch (target) {
+        case kernel_invocation_type::automatic:
+            return out << "automatic";
+        case kernel_invocation_type::nd_range:
+            return out << "nd_range";
+        case kernel_invocation_type::hierarchical:
+            return out << "hierarchical";
+    }
+    return out << "unknown";
+}
+
+std::istream &operator>>(std::istream &in, kernel_invocation_type &target) {
+    std::string str;
+    in >> str;
+    detail::to_lower_case(str);
+
+    if (str == "automatic") {
+        target = kernel_invocation_type::automatic;
+    } else if (str == "nd_range") {
+        target = kernel_invocation_type::nd_range;
+    } else if (str == "hierarchical") {
+        target = kernel_invocation_type::hierarchical;
+    } else {
+        in.setstate(std::ios::failbit);
+    }
+    return in;
+}
+
+} // namespace plssvm::sycl
\ No newline at end of file
diff --git a/src/plssvm/backends/gpu_csvm.cpp b/src/plssvm/backends/gpu_csvm.cpp
index 8a50aedf4..d9b563756 100644
--- a/src/plssvm/backends/gpu_csvm.cpp
+++ b/src/plssvm/backends/gpu_csvm.cpp
@@ -65,6 +65,8 @@ auto gpu_csvm::predict(const std::vector out(points.size());
 
     if (kernel_ == kernel_type::linear) {
@@ -103,6 +105,11 @@ auto gpu_csvm::predict(const std::vector(end_time - start_time));
+    }
+
     return out;
 }
@@ -380,4 +387,4 @@ template class gpu_csvm, ::sycl
 template class gpu_csvm, ::sycl::queue>;
 #endif
 
-} // namespace plssvm::detail
\ No newline at end of file
+} // namespace plssvm::detail
diff --git a/src/plssvm/parameter.cpp b/src/plssvm/parameter.cpp
index aa2d5a760..97e6c5042 100644
--- a/src/plssvm/parameter.cpp
+++ b/src/plssvm/parameter.cpp
@@ -425,7 +425,7 @@ void parameter<T>::parse_model_file(const std::string &filename) {
         } else if (detail::starts_with(line, "total_sv")) {
             // the total number of support vectors must be greater than 0
             num_sv = detail::convert_to(value);
-            if (num_sv <= 0) {
+            if (num_sv == 0) {
                 throw invalid_file_format_exception{ fmt::format("The number of support vectors must be greater than 0, but is {}!", num_sv) };
             }
         } else if (detail::starts_with(line, "rho")) {
@@ -535,20 +535,21 @@ void parameter<T>::parse_test_file(const std::string &filename) {
 
 template <typename T>
 std::ostream &operator<<(std::ostream &out, const parameter<T> &params) {
     return out << fmt::format(
"kernel_type {}\n" - "degree {}\n" - "gamma {}\n" - "coef0 {}\n" - "cost {}\n" - "epsilon {}\n" - "print_info {}\n" - "backend {}\n" - "target platform {}\n" - "input_filename '{}'\n" - "model_filename '{}'\n" - "predict_filename '{}'\n" - "rho {}\n" - "real_type {}\n", + "kernel_type {}\n" + "degree {}\n" + "gamma {}\n" + "coef0 {}\n" + "cost {}\n" + "epsilon {}\n" + "print_info {}\n" + "backend {}\n" + "target platform {}\n" + "SYCL kernel invocation type {}\n" + "input_filename '{}'\n" + "model_filename '{}'\n" + "predict_filename '{}'\n" + "rho {}\n" + "real_type {}\n", params.kernel, params.degree, params.gamma, @@ -558,6 +559,7 @@ std::ostream &operator<<(std::ostream &out, const parameter ¶ms) { params.print_info, params.backend, params.target, + params.sycl_kernel_invocation_type, params.input_filename, params.model_filename, params.predict_filename, diff --git a/src/plssvm/parameter_train.cpp b/src/plssvm/parameter_train.cpp index 29e716f2c..b4cf3c55c 100644 --- a/src/plssvm/parameter_train.cpp +++ b/src/plssvm/parameter_train.cpp @@ -50,6 +50,7 @@ parameter_train::parameter_train(int argc, char **argv) { ("e,epsilon", "set the tolerance of termination criterion", cxxopts::value()->default_value(fmt::format("{}", epsilon))) ("b,backend", "choose the backend: openmp|cuda|opencl|sycl", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", backend)))) ("p,target_platform", "choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", target)))) + ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range|hierarchical", cxxopts::value()->default_value(detail::as_lower_case(fmt::format("{}", sycl_kernel_invocation_type)))) ("q,quiet", "quiet mode (no outputs)", cxxopts::value(print_info)->default_value(fmt::format("{}", !print_info))) ("h,help", "print this helper message", cxxopts::value()) ("input", "", cxxopts::value(), "training_set_file") @@ -105,6 +106,9 @@ parameter_train::parameter_train(int argc, char **argv) { // parse target_platform and cast the value to the respective enum target = result["target_platform"].as(); + // parse kernel invocation type when using SYCL as backend + sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as(); + // parse print info print_info = !print_info; diff --git a/tests/backends/SYCL/test.cpp b/tests/backends/SYCL/test.cpp index 89d8d9d00..727ff0e12 100644 --- a/tests/backends/SYCL/test.cpp +++ b/tests/backends/SYCL/test.cpp @@ -13,9 +13,10 @@ #include "backends/generic_tests.hpp" // generic::write_model_test, generic::generate_q_test, generic::device_kernel_test, generic::predict_test, generic::accuracy_test #include "utility.hpp" // util::google_test::parameter_definition, util::google_test::parameter_definition_to_name -#include "plssvm/backends/SYCL/csvm.hpp" // plssvm::sycl::csvm -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/backends/SYCL/csvm.hpp" // plssvm::sycl::csvm +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/parameter.hpp" // plssvm::parameter #include "gtest/gtest.h" // ::testing::StaticAssertTypeEq, ::testing::Test, ::testing::Types, TYPED_TEST_SUITE, TYPED_TEST @@ -47,9 +48,13 @@ TYPED_TEST(SYCL_CSVM, generate_q) { generic::generate_q_test(); } -// 
-// check whether the device kernels are correct
-TYPED_TEST(SYCL_CSVM, device_kernel) {
-    generic::device_kernel_test();
+// check whether the nd_range device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_nd_range) {
+    generic::device_kernel_test();
+}
+// check whether the hierarchical device kernels are correct
+TYPED_TEST(SYCL_CSVM, device_kernel_hierarchical) {
+    generic::device_kernel_test();
 }
 
 // check whether the correct labels are predicted
diff --git a/tests/backends/generic_tests.hpp b/tests/backends/generic_tests.hpp
index 3fadd734d..bf342ab31 100644
--- a/tests/backends/generic_tests.hpp
+++ b/tests/backends/generic_tests.hpp
@@ -15,22 +15,24 @@
 #include "mock_csvm.hpp"  // mock_csvm
 #include "utility.hpp"    // util::gtest_assert_floating_point_near, util::gtest_assert_floating_point_eq, util::gtest_expect_correct_csvm_factory, util::create_temp_file
 
-#include "plssvm/backend_types.hpp"             // plssvm::backend_type
-#include "plssvm/constants.hpp"                 // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
-#include "plssvm/detail/string_conversion.hpp"  // plssvm::detail::convert_to
-#include "plssvm/exceptions/exceptions.hpp"     // plssvm::exception
-#include "plssvm/kernel_types.hpp"              // plssvm::kernel_type
-#include "plssvm/parameter.hpp"                 // plssvm::parameter
+#include "plssvm/backend_types.hpp"                         // plssvm::backend_type
+#include "plssvm/backends/SYCL/kernel_invocation_type.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/constants.hpp"                             // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
+#include "plssvm/detail/string_conversion.hpp"              // plssvm::detail::convert_to
+#include "plssvm/exceptions/exceptions.hpp"                 // plssvm::exception
+#include "plssvm/kernel_types.hpp"                          // plssvm::kernel_type
+#include "plssvm/parameter.hpp"                             // plssvm::parameter
 
 #include "fmt/format.h"   // fmt::format
 #include "fmt/ostream.h"  // can use fmt using operator<< overloads
 #include "gmock/gmock.h"  // EXPECT_THAT
-#include "gtest/gtest.h"  // GTEST_USES_POSIX_RE, ASSERT_EQ, EXPECT_EQ, EXPECT_GT, testing::ContainsRegex, testing::StaticAssertTypeEq
+#include "gtest/gtest.h"  // ASSERT_GT, ASSERT_TRUE, ASSERT_EQ, EXPECT_EQ, EXPECT_GT, testing::ContainsRegex, testing::StaticAssertTypeEq
 
 #include <algorithm>   // std::generate
 #include <filesystem>  // std::filesystem::remove
 #include <fstream>     // std::ifstream
 #include <random>      // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <regex>       // std::regex, std::regex_match
 #include <string>      // std::string, std::getline
 #include <vector>      // std::vector
 
@@ -69,26 +71,48 @@ inline void write_model_test() {
     // write learned model to file
     csvm.write_model(model_file);
 
-    // read content of model file and delete it
-    std::ifstream model_ifs(model_file);
-    std::string file_content((std::istreambuf_iterator<char>(model_ifs)), std::istreambuf_iterator<char>());
-    model_ifs.close();
+    // read content of model file line by line and delete it
+    std::vector<std::string> lines;
+    {
+        std::ifstream model_ifs(model_file);
+        std::string line;
+        while (std::getline(model_ifs, line)) {
+            lines.push_back(std::move(line));
+        }
+    }
     std::filesystem::remove(model_file);
 
-    // check model file content for correctness
-#ifdef GTEST_USES_POSIX_RE
+    // create vector containing correct regex
+    std::vector<std::string> regex_patterns;
+    regex_patterns.emplace_back("svm_type c_svc");
+    regex_patterns.emplace_back(fmt::format("kernel_type {}", params.kernel));
     switch (params.kernel) {
         case plssvm::kernel_type::linear:
-            EXPECT_THAT(file_content, testing::ContainsRegex("^svm_type c_svc\nkernel_type linear\nnr_class 2\ntotal_sv [0-9]+\nrho [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\nlabel 1 -1\nnr_sv [0-9]+ [0-9]+\nSV"));
             break;
         case plssvm::kernel_type::polynomial:
-            EXPECT_THAT(file_content, testing::ContainsRegex("^svm_type c_svc\nkernel_type polynomial\ndegree [0-9]+\ngamma [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\ncoef0 [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\nnr_class 2\ntotal_sv [0-9]+\nrho [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\nlabel 1 -1\nnr_sv [0-9]+ [0-9]+\nSV"));
+            regex_patterns.emplace_back("degree [0-9]+");
+            regex_patterns.emplace_back("gamma [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?");
+            regex_patterns.emplace_back("coef0 [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?");
             break;
         case plssvm::kernel_type::rbf:
-            EXPECT_THAT(file_content, testing::ContainsRegex("^svm_type c_svc\nkernel_type rbf\ngamma [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\nnr_class 2\ntotal_sv [0-9]+\nrho [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?\nlabel 1 -1\nnr_sv [0-9]+ [0-9]+\nSV"));
+            regex_patterns.emplace_back("gamma [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?");
             break;
     }
-#endif
+    regex_patterns.emplace_back("nr_class 2");
+    regex_patterns.emplace_back("total_sv [0-9]+");
+    regex_patterns.emplace_back("rho [-+]?[0-9]*.?[0-9]+([eE][-+]?[0-9]+)?");
+    regex_patterns.emplace_back("label 1 -1");
+    regex_patterns.emplace_back("nr_sv [0-9]+ [0-9]+");
+    regex_patterns.emplace_back("SV");
+
+    // at least number of header entries lines must be present
+    ASSERT_GT(lines.size(), regex_patterns.size());
+
+    // check if the model header is valid
+    for (std::vector<std::string>::size_type i = 0; i < regex_patterns.size(); ++i) {
+        std::regex reg(regex_patterns[i], std::regex::extended);
+        ASSERT_TRUE(std::regex_match(lines[i], reg)) << "line: " << i << " doesn't match regex pattern: " << regex_patterns[i];
+    }
 }
 
 template