diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 541bb7201cd..488c6948654 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -35,4 +35,7 @@ add_subdirectory("randomStrategies/")
 add_subdirectory("randomCells2D/")
 add_subdirectory("reduce/")
 add_subdirectory("tagSpecialization/")
+add_subdirectory("useBLASInAlpaka/useCuBLASInAlpaka/")
+add_subdirectory("useBLASInAlpaka/useRocBLASInAlpaka/")
 add_subdirectory("vectorAdd/")
+
diff --git a/example/useBLASInAlpaka/useCuBLASInAlpaka/CMakeLists.txt b/example/useBLASInAlpaka/useCuBLASInAlpaka/CMakeLists.txt
new file mode 100644
index 00000000000..9d3ed02e4cc
--- /dev/null
+++ b/example/useBLASInAlpaka/useCuBLASInAlpaka/CMakeLists.txt
@@ -0,0 +1,58 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.25)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME useCuBLASInAlpaka)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+# Check the CMake variables to make sure the accelerator option is CUDA-only.
+if(NOT alpaka_ACC_GPU_CUDA_ONLY_MODE)
+    # Print a warning and skip target creation
+    message(WARNING "Skipping build of 'useCuBLASInAlpaka' because alpaka_ACC_GPU_CUDA_ONLY_MODE is not enabled.")
+    return()
+endif()
+
+# Add the cuBLAS library (the imported target CUDA::cublas is provided by FindCUDAToolkit)
+find_package(CUDAToolkit REQUIRED)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/useCuBLASInAlpaka.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka CUDA::cublas)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+#set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 17)
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
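Note, not part of the diff: the CMake logic above skips the target unless alpaka is configured in CUDA-only mode (e.g. by passing -Dalpaka_ACC_GPU_CUDA_ONLY_MODE=ON together with the CUDA back-end option at configure time). The same restriction could also be expressed at compile time in the source itself; a minimal sketch, assuming only alpaka's predefined ALPAKA_ACC_GPU_CUDA_ENABLED macro and its tag/accelerator utilities (this file is purely illustrative):

    #include <alpaka/alpaka.hpp>

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>

    auto main() -> int
    {
    #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
        // The CUDA back-end is enabled: the cuBLAS-based code path of the example can be compiled.
        using Acc = alpaka::TagToAcc<alpaka::TagGpuCudaRt, alpaka::DimInt<1>, std::size_t>;
        std::cout << "CUDA back-end enabled, accelerator: " << alpaka::getAccName<Acc>() << std::endl;
    #else
        // Built without the CUDA back-end: skip instead of failing.
        std::cout << "CUDA back-end not enabled, skipping." << std::endl;
    #endif
        return EXIT_SUCCESS;
    }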
diff --git a/example/useBLASInAlpaka/useCuBLASInAlpaka/src/useCuBLASInAlpaka.cpp b/example/useBLASInAlpaka/useCuBLASInAlpaka/src/useCuBLASInAlpaka.cpp
new file mode 100644
index 00000000000..226909c7670
--- /dev/null
+++ b/example/useBLASInAlpaka/useCuBLASInAlpaka/src/useCuBLASInAlpaka.cpp
@@ -0,0 +1,178 @@
+/* Copyright 2023 Mehmet Yusufoglu, Rene Widera
+ * SPDX-License-Identifier: ISC
+ */
+/*
+ * This example uses cuBLAS library functions in alpaka. The cuBLAS function cublasSgemm is called using alpaka
+ * buffers and an alpaka queue. Since the code needs only the AccGpuCuda back-end, make sure the corresponding
+ * alpaka CMake back-end flag (CUDA-only mode) is set.
+ */
+#include <alpaka/alpaka.hpp>
+
+#include <cublas_v2.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+// Index type
+using Idx = std::size_t;
+// Set data type
+using DataType = float;
+
+// Initialize the matrix in column-major order (1D buffer)
+void initializeMatrix(DataType* buffer, Idx rows, Idx cols)
+{
+    for(Idx j = 0; j < rows; ++j)
+    {
+        for(Idx i = 0; i < cols; ++i)
+        {
+            // generate some values and set buffer
+            buffer[i + j * cols] = static_cast<DataType>((i + j * cols) % 10);
+        }
+    }
+}
+
+auto main() -> int
+{
+    using Dim1D = alpaka::DimInt<1>;
+
+    // Define matrix dimensions, A is MxK and B is KxN
+    Idx const M = 4; // Rows in A and C
+    Idx const N = 2; // Columns in B and C
+    Idx const K = 3; // Columns in A and rows in B
+
+    // Define the accelerator and queue
+    // Use the CUDA accelerator; the CMake accelerator flags should be set to CUDA-only
+    using Acc = alpaka::TagToAcc<alpaka::TagGpuCudaRt, Dim1D, Idx>;
+    using Queue = alpaka::Queue<Acc, alpaka::NonBlocking>;
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    Queue queue(devAcc);
+
+    // Allocate 1D host memory
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);
+
+    DataType* hostA = std::data(bufHostA);
+    DataType* hostB = std::data(bufHostB);
+    DataType* hostC = std::data(bufHostC);
+
+    // Initialize host matrices with some values
+    initializeMatrix(hostA, M, K);
+    initializeMatrix(hostB, K, N);
+    std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s
+
+    // Print initialized matrices
+    std::cout << "Matrix A (Host):" << std::endl;
+    for(Idx j = 0; j < M; ++j)
+    {
+        for(Idx i = 0; i < K; ++i)
+        {
+            std::cout << hostA[i + j * K] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    std::cout << "Matrix B (Host):" << std::endl;
+    for(Idx j = 0; j < K; ++j)
+    {
+        for(Idx i = 0; i < N; ++i)
+        {
+            std::cout << hostB[i + j * N] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Allocate 1D device memory
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);
+
+    // Copy data to device
+    alpaka::memcpy(queue, bufDevA, bufHostA);
+    alpaka::memcpy(queue, bufDevB, bufHostB);
+    alpaka::memcpy(queue, bufDevC, bufHostC);
+    alpaka::wait(queue);
+
+    std::cout << "Copied matrices A and B to the device." << std::endl;
+
+    // Get the native CUDA stream from the alpaka queue
+    auto alpakaStream = alpaka::getNativeHandle(queue);
+
+    // cuBLAS setup
+    cublasHandle_t cublasHandle;
+    cublasCreate(&cublasHandle);
+    cublasSetStream(cublasHandle, alpakaStream);
+
+    // Perform matrix multiplication: C = A * B
+    float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
+    cublasSgemm(
+        cublasHandle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_N, // No transpose for A and B
+        M,
+        N,
+        K, // Dimensions: C = A * B
+        &alpha,
+        std::data(bufDevA),
+        M, // Leading dimension of A
+        std::data(bufDevB),
+        K, // Leading dimension of B
+        &beta,
+        std::data(bufDevC),
+        M // Leading dimension of C
+    );
+
+    alpaka::wait(queue); // Wait for multiplication to complete
+    std::cout << "Matrix multiplication completed." << std::endl;
+
+    // Copy result back to host
+    alpaka::memcpy(queue, bufHostC, bufDevC);
+    alpaka::wait(queue);
+    std::cout << "Copied result matrix C back to the host." << std::endl;
+
+    // Print result matrix C
+    std::cout << "Matrix C (Host):" << std::endl;
+    for(Idx j = 0; j < M; ++j)
+    {
+        for(Idx i = 0; i < N; ++i)
+        {
+            std::cout << hostC[i + j * N] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Expected values of the elements of C
+    std::vector<DataType> expectedResult{20, 23, 6, 9, 56, 68, 30, 42};
+
+    // Verify the result
+    bool success = true;
+    for(Idx j = 0; j < M; ++j)
+    {
+        for(Idx i = 0; i < N; ++i)
+        {
+            if(std::fabs(hostC[i + j * N] - expectedResult[i + j * N]) > 1e-5f)
+            { // Allow small floating-point errors
+                std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * N]
+                          << " != " << expectedResult[i + j * N] << std::endl;
+                success = false;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
+              << (success ? " succeeded!" : " failed!") << std::endl;
+
+    // Cleanup cuBLAS (also on the failure path)
+    cublasDestroy(cublasHandle);
+
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
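Note, not part of the diff: unlike the rocBLAS example below, the cuBLAS calls above (cublasCreate, cublasSetStream, cublasSgemm) are not checked for errors. A minimal sketch of such a check, assuming only the cublasStatus_t type and CUBLAS_STATUS_SUCCESS from cublas_v2.h (the helper name checkCublas is hypothetical):

    #include <cublas_v2.h>

    #include <cstdlib>
    #include <iostream>

    // Hypothetical helper: abort with a message if a cuBLAS call did not succeed.
    inline void checkCublas(cublasStatus_t status, char const* what)
    {
        if(status != CUBLAS_STATUS_SUCCESS)
        {
            std::cerr << what << " failed with cuBLAS status " << static_cast<int>(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }

    // Usage sketch, following the example above:
    //     checkCublas(cublasCreate(&cublasHandle), "cublasCreate");
    //     checkCublas(cublasSetStream(cublasHandle, alpakaStream), "cublasSetStream");
    //     checkCublas(cublasSgemm(/* ... arguments as in the example ... */), "cublasSgemm");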
diff --git a/example/useBLASInAlpaka/useRocBLASInAlpaka/CMakeLists.txt b/example/useBLASInAlpaka/useRocBLASInAlpaka/CMakeLists.txt
new file mode 100644
index 00000000000..dcb3db83988
--- /dev/null
+++ b/example/useBLASInAlpaka/useRocBLASInAlpaka/CMakeLists.txt
@@ -0,0 +1,58 @@
+#
+# Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, René Widera
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.25)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME useRocBLASInAlpaka)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+if(NOT alpaka_ACC_GPU_HIP_ONLY_MODE)
+    # Print a warning and skip target creation
+    message(WARNING "Skipping build of 'useRocBLASInAlpaka' because alpaka_ACC_GPU_HIP_ONLY_MODE is not enabled.")
+    return()
+endif()
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Locate rocBLAS.
+
+find_package(rocblas REQUIRED CONFIG)
+# Use the line below if the rocBLAS configuration files (e.g., rocblasConfig.cmake) are not in the default search paths.
+# find_package(rocblas REQUIRED CONFIG HINTS /opt/rocm /opt/rocm-<version>)
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/useRocBLASInAlpaka.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka roc::rocblas)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/useBLASInAlpaka/useRocBLASInAlpaka/src/useRocBLASInAlpaka.cpp b/example/useBLASInAlpaka/useRocBLASInAlpaka/src/useRocBLASInAlpaka.cpp
new file mode 100644
index 00000000000..01102ea927f
--- /dev/null
+++ b/example/useBLASInAlpaka/useRocBLASInAlpaka/src/useRocBLASInAlpaka.cpp
@@ -0,0 +1,175 @@
+/* Copyright 2024 Mehmet Yusufoglu, René Widera, Simeon Ehrig
+ * SPDX-License-Identifier: ISC
+ */
+/*
+ * This example uses rocBLAS library functions in alpaka. The rocBLAS matrix-multiplication function rocblas_sgemm
+ * is called using alpaka buffers and an alpaka queue. Since the code needs only the AccGpuHip back-end, make sure
+ * the corresponding alpaka CMake back-end flag (HIP-only mode) is set.
+ */
+
+#include <alpaka/alpaka.hpp>
+
+#include <rocblas/rocblas.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+// Index type
+using Idx = std::size_t;
+
+// Set data type
+using DataType = float;
+
+// Initialize the matrix in column-major order (1D buffer)
+void initializeMatrix(DataType* buffer, Idx rows, Idx cols)
+{
+    for(Idx j = 0; j < rows; ++j)
+    {
+        for(Idx i = 0; i < cols; ++i)
+        {
+            // Generate some values and set buffer
+            buffer[i + j * cols] = static_cast<DataType>((i + j * cols) % 10);
+        }
+    }
+}
+
+auto main() -> int
+{
+    using Dim1D = alpaka::DimInt<1>;
+
+    // Define matrix dimensions, A is MxK and B is KxN
+    Idx const M = 4; // Rows in A and C
+    Idx const N = 2; // Columns in B and C
+    Idx const K = 3; // Columns in A and rows in B
+
+    // Define the accelerator and queue
+    // Use the HIP accelerator; the CMake accelerator flags should be set to HIP-only
+    using Acc = alpaka::TagToAcc<alpaka::TagGpuHipRt, Dim1D, Idx>;
+    using Queue = alpaka::Queue<Acc, alpaka::NonBlocking>;
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    Queue queue(devAcc);
+
+    // Allocate 1D host memory
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);
+
+    DataType* hostA = alpaka::getPtrNative(bufHostA);
+    DataType* hostB = alpaka::getPtrNative(bufHostB);
+    DataType* hostC = alpaka::getPtrNative(bufHostC);
+
+    // Initialize host matrices with some values
+    initializeMatrix(hostA, M, K);
+    initializeMatrix(hostB, K, N);
+    std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s
+
+    // Allocate 1D device memory
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);
+
+    // Copy data to device
+    alpaka::memcpy(queue, bufDevA, bufHostA, M * K);
+    alpaka::memcpy(queue, bufDevB, bufHostB, K * N);
+    alpaka::memcpy(queue, bufDevC, bufHostC, M * N);
+    alpaka::wait(queue);
+
+    // Obtain the native HIP stream from the alpaka queue
+    auto rocStream = alpaka::getNativeHandle(queue);
+
+    // rocBLAS setup
+    rocblas_handle rocblasHandle;
+    rocblas_status status = rocblas_create_handle(&rocblasHandle);
+    if(status != rocblas_status_success)
+    {
+        std::cerr << "rocblas_create_handle failed with status: " << status << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Associate the HIP stream with the rocBLAS handle
+    status = rocblas_set_stream(rocblasHandle, rocStream);
+    if(status != rocblas_status_success)
+    {
+        std::cerr << "rocblas_set_stream failed with status: " << status << std::endl;
+        rocblas_destroy_handle(rocblasHandle);
+        return EXIT_FAILURE;
+    }
+
+    // Perform matrix multiplication: C = alpha * A * B + beta * C
+    float alpha = 1.0f, beta = 0.0f;
+
+    // Call the general matrix multiply function
+    status = rocblas_sgemm(
+        rocblasHandle,
+        rocblas_operation_none,
+        rocblas_operation_none, // No transpose for A and B
+        M,
+        N,
+        K,
+        &alpha,
+        alpaka::getPtrNative(bufDevA),
+        M, // Leading dimension of A
+        alpaka::getPtrNative(bufDevB),
+        K, // Leading dimension of B
+        &beta,
+        alpaka::getPtrNative(bufDevC),
+        M // Leading dimension of C
+    );
+
+    if(status != rocblas_status_success)
+    {
+        std::cerr << "rocblas_sgemm failed: " << status << std::endl;
+        rocblas_destroy_handle(rocblasHandle);
+        return EXIT_FAILURE;
+    }
+    alpaka::wait(queue);
+
+    // Copy result back to host
+    alpaka::memcpy(queue, bufHostC, bufDevC, M * N);
+    alpaka::wait(queue);
+
+    // Print result matrix C
+    std::cout << "Matrix C (Host):" << std::endl;
+    for(Idx j = 0; j < M; ++j)
+    {
+        for(Idx i = 0; i < N; ++i)
+        {
+            std::cout << hostC[i + j * N] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Expected values of the elements of C
+    std::vector<DataType> expectedResult{20, 23, 6, 9, 56, 68, 30, 42};
+
+    // Verify the result
+    bool success = true;
+    for(Idx j = 0; j < M; ++j)
+    {
+        for(Idx i = 0; i < N; ++i)
+        {
+            if(std::fabs(hostC[i + j * N] - expectedResult[i + j * N]) > 1e-5f)
+            { // Allow small floating-point errors
+                std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * N]
+                          << " != " << expectedResult[i + j * N] << std::endl;
+                success = false;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
+              << (success ? " succeeded!" : " failed!") << std::endl;
+
+    // Cleanup rocBLAS (also on the failure path)
+    rocblas_destroy_handle(rocblasHandle);
+
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
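Note, not part of the diff: the hard-coded expected values {20, 23, 6, 9, 56, 68, 30, 42} used by both examples follow from a plain column-major GEMM over the same input initialization (element n of each input buffer gets the value n % 10). A small host-only sketch that reproduces them, with no alpaka or GPU library involved:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    auto main() -> int
    {
        std::size_t const M = 4, N = 2, K = 3;

        // Same initialization as initializeMatrix() in the examples: element n gets the value n % 10.
        std::vector<float> A(M * K), B(K * N), C(M * N, 0.0f);
        for(std::size_t n = 0; n < A.size(); ++n)
            A[n] = static_cast<float>(n % 10);
        for(std::size_t n = 0; n < B.size(); ++n)
            B[n] = static_cast<float>(n % 10);

        // Column-major GEMM, as cublasSgemm/rocblas_sgemm interpret the buffers:
        // element (r, c) of an R-row matrix is stored at index r + c * R.
        for(std::size_t c = 0; c < N; ++c)
            for(std::size_t r = 0; r < M; ++r)
                for(std::size_t k = 0; k < K; ++k)
                    C[r + c * M] += A[r + k * M] * B[k + c * K];

        for(float v : C)
            std::cout << v << " "; // Prints: 20 23 6 9 56 68 30 42
        std::cout << std::endl;
        return 0;
    }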