diff --git a/CMakeLists.txt b/CMakeLists.txt index c03f5c78..2830e644 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,15 +154,12 @@ include(cmake/register_models.cmake) register_model(serial SERIAL SerialStream.cpp) register_model(omp OMP OMPStream.cpp) register_model(ocl OCL OCLStream.cpp) -register_model(std-data STD_DATA STDDataStream.cpp) -register_model(std-indices STD_INDICES STDIndicesStream.cpp) -register_model(std-ranges STD_RANGES STDRangesStream.cpp) +register_model(std STD STDStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) -register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) -register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp) +register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) diff --git a/src/Stream.h b/src/Stream.h index c8c6af1c..f233f54a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,11 +7,14 @@ #pragma once +#include #include #include #include #include "benchmark.h" +using std::intptr_t; + template class Stream { diff --git a/src/StreamModels.h b/src/StreamModels.h index b13d5b34..820c08a4 100644 --- a/src/StreamModels.h +++ b/src/StreamModels.h @@ -3,12 +3,8 @@ #if defined(CUDA) #include "CUDAStream.h" -#elif defined(STD_DATA) -#include "STDDataStream.h" -#elif defined(STD_INDICES) -#include "STDIndicesStream.h" -#elif defined(STD_RANGES) -#include "STDRangesStream.hpp" +#elif defined(STD) +#include "STDStream.h" #elif defined(TBB) #include "TBBStream.hpp" #elif defined(THRUST) @@ -63,17 +59,9 @@ std::unique_ptr> make_stream(Args... args) { // Use the Kokkos implementation return std::make_unique>(args...); -#elif defined(STD_DATA) +#elif defined(STD) // Use the C++ STD data-oriented implementation - return std::make_unique>(args...); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - return std::make_unique>(args...); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - return std::make_unique>(args...); + return std::make_unique>(args...); #elif defined(TBB) // Use the C++20 implementation diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 57b89afb..d10e8eef 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -152,9 +152,10 @@ build_gcc() { *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + #run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -251,9 +252,11 @@ build_clang() { OFF) 
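The StreamModels.h hunk above now routes everything through a single STD macro. Below is a minimal sketch of that dispatch pattern with the template arguments written out; `Stream` and `STDStream` are stand-in stubs here, and `STD` is assumed to arrive as a `-DSTD` definition from `register_model(std STD STDStream.cpp)`. This is not the project's file, just the shape of the dispatch.

```cpp
// Sketch of the unified model dispatch (compile with -DSTD); not the project's code.
#include <memory>

template <class T> struct Stream { virtual ~Stream() = default; };
template <class T> struct STDStream final : Stream<T> {
  template <class... Args> explicit STDStream(Args...) {}
};

template <class T, class... Args>
std::unique_ptr<Stream<T>> make_stream(Args... args) {
#if defined(STD)
  // One class covers DATA17/DATA23/INDICES; the strategy is resolved inside STDStream.cpp.
  return std::make_unique<STDStream<T>>(args...);
#else
  #error "No model selected"
#endif
}
```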
dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported + run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + # TODO: clang is too old + #run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -270,8 +273,10 @@ build_clang() { build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA17" + # Requires GCC 14 and newer CMake for C++23 support + # run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA23" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=INDICES" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 226693bd..9b8a7acc 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -29,12 +29,15 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +#define WORKAROUND + #else // auto exe_policy = dpl::execution::seq; // auto exe_policy = dpl::execution::par; static constexpr auto exe_policy = dpl::execution::par_unseq; #define USE_STD_PTR_ALLOC_DEALLOC +#define WORKAROUND #endif diff --git a/src/main.cpp b/src/main.cpp index 604e14a2..af9fac6c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -332,13 +332,13 @@ void check_solution(const size_t num_times, T const* a, T const* b, T const* c, size_t failed = 0; T max_rel = std::numeric_limits::epsilon() * T(100.0); T max_rel_dot = std::numeric_limits::epsilon() * T(10000000.0); - auto check = [&](const char* name, T is, T should, T max_rel, size_t i = size_t(-1)) { + auto check = [&](const char* name, T is, T should, T mrel, size_t i = size_t(-1)) { // Relative difference: T diff = std::abs(is - should); T abs_is = std::abs(is); T abs_sh = std::abs(should); T largest = std::max(abs_is, abs_sh); - T same = diff <= largest * max_rel; + T same = diff <= largest * mrel; if (!same || std::isnan(is)) { ++failed; if (failed > 10) return; @@ -346,7 +346,8 @@ void check_solution(const size_t num_times, T const* a, T const* b, T const* c, if (i != size_t(-1)) std::cerr << "[" << i << "]"; std::cerr << ": " << is << " (is) != " << should << " (should)" << ", diff=" << diff << " > " - << largest * max_rel << std::endl; + << largest * mrel << " (largest=" << largest + << ", max_rel=" 
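The dpl_shim.h hunk above only adds a `WORKAROUND` marker macro, which the new STDStream constructor later uses to print a non-conformance notice. For orientation, here is a stripped-down sketch of the plain-STL side of such a shim, reusing the names from the diff (`exe_policy`, `alloc_raw`, `dealloc_raw`); the oneDPL/SYCL branches are left out and the allocation strategy is simplified.

```cpp
// Plain C++17 fallback of a dpl_shim-style header (sketch, not the real file).
#include <cstdlib>
#include <execution>

static constexpr auto exe_policy = std::execution::par_unseq;

template <typename T>
T* alloc_raw(std::size_t size) {
  // The real shim may use aligned or USM allocation; plain malloc is enough for the sketch.
  return static_cast<T*>(std::malloc(sizeof(T) * size));
}

template <typename T>
void dealloc_raw(T* ptr) { std::free(ptr); }
```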
<< mrel << ")" << std::endl; } }; @@ -360,9 +361,9 @@ void check_solution(const size_t num_times, T const* a, T const* b, T const* c, // Calculate the L^infty-norm relative error for (size_t i = 0; i < array_size; ++i) { - check("a", a[i], goldA, i, max_rel); - check("b", b[i], goldB, i, max_rel); - check("c", c[i], goldC, i, max_rel); + check("a", a[i], goldA, max_rel, i); + check("b", b[i], goldB, max_rel, i); + check("c", c[i], goldC, max_rel, i); } if (failed > 0 && !silence_errors) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp deleted file mode 100644 index 8c280f8a..00000000 --- a/src/std-data/STDDataStream.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDDataStream.h" - -template -STDDataStream::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) - noexcept : array_size{array_size}, - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDDataStream::~STDDataStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDDataStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDDataStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDDataStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDDataStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); -} - -template -void STDDataStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, a, a + array_size, b, c, std::plus()); -} - -template -void STDDataStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); -} - -template -void STDDataStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. 
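The main.cpp hunks above fix two related problems in `check_solution`: the lambda's tolerance parameter shadowed the outer `max_rel`, and the call sites passed the element index where the tolerance was expected. A self-contained sketch of the corrected check follows; `within_tol` is a hypothetical helper, but the formula is the one from the diff.

```cpp
// Relative-difference check with the tolerance as an explicit, unshadowed parameter.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

template <class T>
bool within_tol(T is, T should, T mrel) {
  T diff = std::abs(is - should);
  T largest = std::max(std::abs(is), std::abs(should));
  return !std::isnan(is) && diff <= largest * mrel;
}

int main() {
  double max_rel = std::numeric_limits<double>::epsilon() * 100.0;
  // Tolerance is the third argument; the element index stays separate.
  std::cout << within_tol(0.1 + 0.2, 0.3, max_rel) << '\n'; // prints 1
}
```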
- // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); -} - - -template -T STDDataStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDDataStream; -template class STDDataStream; diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp deleted file mode 100644 index 4f8efe20..00000000 --- a/src/std-indices/STDIndicesStream.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDIndicesStream.h" - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDIndicesStream::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) -noexcept : array_size{array_size}, range(0, array_size), - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDIndicesStream::~STDIndicesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDIndicesStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDIndicesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDIndicesStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDIndicesStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) { - return scalar * c[i]; - }); -} - -template -void STDIndicesStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) { - return a[i] + b[i]; - }); -} - -template -void STDIndicesStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return b[i] + scalar * c[i]; - }); -} - -template -void STDIndicesStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. 
- // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return a[i] + b[i] + scalar * c[i]; - }); -} - - -template -T STDIndicesStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDIndicesStream; -template class STDIndicesStream; diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h deleted file mode 100644 index 7a43b1ec..00000000 --- a/src/std-indices/STDIndicesStream.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (index-oriented)" - -// A lightweight counting iterator which will be used by the STL algorithms -// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this -// implementation doesn't target -template -class ranged { -public: - class iterator { - friend class ranged; - public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = N; - using iterator_category = std::random_access_iterator_tag; - - // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. - // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. - // This type is unused for any nother STL impl. 
- using is_passed_directly = std::true_type; - - reference operator *() const { return i_; } - iterator &operator ++() { ++i_; return *this; } - iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - - iterator &operator --() { --i_; return *this; } - iterator operator --(int) { iterator copy(*this); --i_; return copy; } - - iterator &operator +=(N by) { i_+=by; return *this; } - - value_type operator[](const difference_type &i) const { return i_ + i; } - - difference_type operator-(const iterator &it) const { return i_ - it.i_; } - iterator operator+(const value_type v) const { return iterator(i_ + v); } - - bool operator ==(const iterator &other) const { return i_ == other.i_; } - bool operator !=(const iterator &other) const { return i_ != other.i_; } - bool operator < (const iterator &other) const { return i_ < other.i_; } - - protected: - explicit iterator(N start) : i_ (start) {} - - private: - N i_; - }; - - [[nodiscard]] iterator begin() const { return begin_; } - [[nodiscard]] iterator end() const { return end_; } - ranged(N begin, N end) : begin_(begin), end_(end) {} -private: - iterator begin_; - iterator end_; -}; - -template -class STDIndicesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // induction range - ranged range; - - // Device side pointers - T *a, *b, *c; - - public: - STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) noexcept; - ~STDIndicesStream(); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - T dot() override; - - void get_arrays(T const*& a, T const*& b, T const*& c) override; - void init_arrays(T initA, T initB, T initC); -}; - diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake deleted file mode 100644 index 60ef575f..00000000 --- a/src/std-indices/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
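The hand-written `ranged` counting iterator above exists only because C++17 has no standard counting view; the unified model drops it in favour of `std::views::iota` (or a small `thrust::counting_iterator` wrapper under NVHPC `-stdpar=gpu`, as seen later in the new STDStream.cpp). A short sketch of the replacement driving an index-oriented triad, with made-up sizes and values:

```cpp
// Counting range via std::views::iota instead of a custom iterator (sketch).
#include <algorithm>
#include <cstddef>
#include <execution>
#include <ranges>
#include <vector>

int main() {
  std::vector<double> a(16), b(16, 1.0), c(16, 2.0);
  const double scalar = 0.5;
  auto r = std::views::iota(std::ptrdiff_t{0}, std::ptrdiff_t{16});
  // triad, index-oriented: a[i] = b[i] + scalar * c[i]
  std::for_each(std::execution::par_unseq, r.begin(), r.end(),
                [a = a.data(), b = b.data(), c = c.data(), scalar](std::ptrdiff_t i) {
                  a[i] = b[i] + scalar * c[i];
                });
  return a[0] == 2.0 ? 0 : 1;
}
```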
- "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp deleted file mode 100644 index 02bd56b2..00000000 --- a/src/std-ranges/STDRangesStream.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDRangesStream.hpp" -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDRangesStream::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) - noexcept : array_size{array_size}, - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDRangesStream::~STDRangesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDRangesStream::init_arrays(T initA, T initB, T initC) -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, // loop range - [=, this] (intptr_t i) { - a[i] = initA; - b[i] = initB; - c[i] = initC; - } - ); -} - -template -void STDRangesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDRangesStream::copy() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i]; - } - ); -} - -template -void STDRangesStream::mul() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - b[i] = scalar * c[i]; - } - ); -} - -template -void STDRangesStream::add() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i] + b[i]; - } - ); -} - -template -void STDRangesStream::triad() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] = b[i] + scalar * c[i]; - } - ); -} - -template -void STDRangesStream::nstream() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] += b[i] + scalar * c[i]; - } - ); -} - -template -T STDRangesStream::dot() -{ - // sum += a[i] * b[i]; - return - std::transform_reduce( - exe_policy, - a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "C++20 does not expose devices" << std::endl; -} - -std::string 
getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - -template class STDRangesStream; -template class STDRangesStream; diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp deleted file mode 100644 index da04f1f4..00000000 --- a/src/std-ranges/STDRangesStream.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD C++ ranges" - -template -class STDRangesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) noexcept; - ~STDRangesStream(); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - T dot() override; - - void get_arrays(T const*& a, T const*& b, T const*& c) override; - void init_arrays(T initA, T initB, T initC); -}; - diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake deleted file mode 100644 index d7fd6a8b..00000000 --- a/src/std-ranges/model.cmake +++ /dev/null @@ -1,68 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
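Both the ranges variant above and the new unified code keep `dot` as a `std::transform_reduce`; the index-oriented flavour only swaps the second input range for a projection over indices. A sketch of the two equivalent forms (hypothetical free functions, same reduction as in the diff):

```cpp
// dot() in data-oriented and index-oriented form (sketch).
#include <cstddef>
#include <execution>
#include <functional>
#include <numeric>
#include <ranges>
#include <vector>

template <class T>
T dot_data(const T* a, const T* b, std::ptrdiff_t n) {
  return std::transform_reduce(std::execution::par_unseq, a, a + n, b, T{});
}

template <class T>
T dot_indices(const T* a, const T* b, std::ptrdiff_t n) {
  auto r = std::views::iota(std::ptrdiff_t{0}, n);
  return std::transform_reduce(std::execution::par_unseq, r.begin(), r.end(), T{},
                               std::plus<T>{},
                               [a, b](std::ptrdiff_t i) { return a[i] * b[i]; });
}

int main() {
  std::vector<double> a(8, 2.0), b(8, 3.0);
  return dot_data(a.data(), b.data(), 8) == dot_indices(a.data(), b.data(), 8) ? 0 : 1;
}
```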
- "OFF") - -macro(setup) - - # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here - - # C++ 2a is too new, disable CMake's std flags completely: - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_STANDARD_REQUIRED OFF) - unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default - # and append our own: - register_append_cxx_flags(ANY -std=c++20) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() - -macro(setup_target NAME) - if (USE_ONEDPL) - target_compile_features(${NAME} INTERFACE cxx_std_20) - target_compile_features(oneDPL INTERFACE cxx_std_20) - endif () -endmacro() diff --git a/src/std/STDStream.cpp b/src/std/STDStream.cpp new file mode 100644 index 00000000..484ecda2 --- /dev/null +++ b/src/std/STDStream.cpp @@ -0,0 +1,208 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "STDStream.h" +#include +#include + +#if defined(DATA23) || defined(INDICES) +#include +#endif + + // OneDPL workaround; TODO: remove this eventually +#include "dpl_shim.h" + +#ifdef INDICES +// NVHPC workaround: TODO: remove this eventually +#if defined(__NVCOMPILER) && defined(_NVHPC_STDPAR_GPU) +#define WORKAROUND +#include +auto counting_iter(intptr_t i) { return thrust::counting_iterator(i); } +auto counting_range(intptr_t b, intptr_t e) { + struct R { + thrust::counting_iterator b, e; + thrust::counting_iterator begin() { return b; } + thrust::counting_iterator end() { return e; } + }; + return R { .b = counting_iter(b), .e = counting_iter(e) }; +} +#else // NVHPC Workaround +auto counting_iter(intptr_t i) { return std::views::iota(i).begin(); } +auto counting_range(intptr_t b, intptr_t e) { return std::views::iota(b, e); } +#endif // NVHPC Workaround +#endif // INDICES + +template +STDStream::STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif + +#ifdef WORKAROUND + std::cout << "Non-conforming implementation: requires non-portable workarounds to run STREAM" << std::endl; +#endif + init_arrays(initA, initB, initC); +} + +template +STDStream::~STDStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +} + +template +void STDStream::init_arrays(T initA, T initB, T initC) +{ + std::fill_n(exe_policy, a, array_size, initA); + std::fill_n(exe_policy, b, array_size, initB); + std::fill_n(exe_policy, c, array_size, initC); +} + +template +void STDStream::get_arrays(T const*& h_a, T 
const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void STDStream::copy() +{ + // c[i] = a[i] +#if defined(DATA17) || defined(DATA23) + std::copy(exe_policy, a, a + array_size, c); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,c=c](intptr_t i) { + c[i] = a[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::mul() +{ + // b[i] = scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, c, c + array_size, b, [](T ci){ return startScalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [b=b, c=c](intptr_t i) { + b[i] = startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::add() +{ + // c[i] = a[i] + b[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + c[i] = a[i] + b[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::triad() +{ + // a[i] = b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA23) + std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + a[i] = b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA23) // Until we can require GCC 14.1 + // Need to do in two round-trips with C++17 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); +#elif DATA23 + // Requires GCC 14.1 (Ubuntu 24.04): + auto as = std::ranges::subrange(a, a + array_size); + auto bs = std::ranges::subrange(b, b + array_size); + auto cs = std::ranges::subrange(c, c + array_size); + std::transform(exe_policy, r.begin(), r.end(), a, [](auto vs) { + auto [a, b, c] = vs; + return a + b + startScalar * c; + }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,b=b,c=c](intptr_t i) { + a[i] += b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + + +template +T STDStream::dot() +{ +#if defined(DATA17) || defined(DATA23) + // sum = 0; sum += a[i] * b[i]; return sum; + return std::transform_reduce(exe_policy, a, a + array_size, b, T{0}); +#elif INDICES + auto r = counting_range(intptr_t(0), array_size); + return std::transform_reduce(exe_policy, r.begin(), r.end(), T{0}, std::plus{}, [a=a, b=b](intptr_t i) { + return a[i] * b[i]; + }); +#else + #error unimplemented +#endif +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class STDStream; +template class STDStream; diff --git a/src/std-data/STDDataStream.h b/src/std/STDStream.h similarity index 69% rename from src/std-data/STDDataStream.h rename to src/std/STDStream.h index 6db998b2..254d68d7 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std/STDStream.h @@ -5,17 +5,26 @@ // 
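In the new STDStream.cpp above, the `DATA23` branch of `nstream` builds the three subranges but iterates an `r` that is never defined, and it is currently shadowed by the `defined(DATA17) || defined(DATA23)` guard on the branch before it. The sketch below shows the single-pass form that branch is aiming for, assuming C++23 `std::views::zip` (GCC 14.1 or newer); it is not the repository's code.

```cpp
// Single-pass nstream over zipped subranges (needs C++23 std::views::zip); sketch only.
#include <algorithm>
#include <cstddef>
#include <execution>
#include <ranges>
#include <vector>

template <class T>
void nstream(T* a, const T* b, const T* c, std::ptrdiff_t n, T scalar) {
  auto as = std::ranges::subrange(a, a + n);
  auto bs = std::ranges::subrange(b, b + n);
  auto cs = std::ranges::subrange(c, c + n);
  auto r  = std::views::zip(as, bs, cs); // the range the DATA23 branch iterates
  std::transform(std::execution::par_unseq, r.begin(), r.end(), a, [scalar](auto vs) {
    auto [ai, bi, ci] = vs;
    return ai + bi + scalar * ci;
  });
}

int main() {
  std::vector<double> a(4, 1.0), b(4, 2.0), c(4, 4.0);
  nstream(a.data(), b.data(), c.data(), 4, 0.5); // a[i] += b[i] + 0.5 * c[i]
  return a[0] == 5.0 ? 0 : 1;
}
```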
source code #pragma once -#include "dpl_shim.h" #include #include #include "Stream.h" -#define IMPLEMENTATION_STRING "STD (data-oriented)" +#ifdef DATA17 +#define STDIMPL "DATA17" +#elif DATA23 +#define STDIMPL "DATA23" +#elif INDICES +#define STDIMPL "INDICES" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "STD (" STDIMPL ")" template <class T> -class STDDataStream : public Stream<T> +class STDStream : public Stream<T> { protected: // Size of arrays @@ -25,9 +34,9 @@ class STDDataStream : public Stream<T> T *a, *b, *c; public: - STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + STDStream(BenchId bs, const intptr_t array_size, const int device_id, T initA, T initB, T initC) noexcept; - ~STDDataStream(); + ~STDStream(); void copy() override; void add() override; diff --git a/src/std-data/model.cmake b/src/std/model.cmake similarity index 74% rename from src/std-data/model.cmake rename to src/std/model.cmake index 837d26bf..2c24015b 100644 --- a/src/std-data/model.cmake +++ b/src/std/model.cmake @@ -1,4 +1,3 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") @@ -16,8 +15,10 @@ register_flag_optional(NVHPC_OFFLOAD cc72 - Compile for compute capability 7.2 cc75 - Compile for compute capability 7.5 cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") + cc90 - Compile for compute capability 9.0 + ccall - Compile for all supported compute capabilities + ccnative - Compiles for compute capability of current device" + "") register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") register_flag_optional(USE_ONEDPL "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. Possible values are: OPENMP - Implements policies using OpenMP. CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." "OFF") +register_flag_optional(STDIMPL + "Implementation strategy: + DATA17 - Parallel algorithms over data (requires C++17). + DATA23 - (default) Parallel algorithms over data (requires C++23). + INDICES - Parallel algorithms over indices (requires C++20)."
+ "DATA20" +) + macro(setup) - set(CMAKE_CXX_STANDARD 17) + register_definitions(${STDIMPL}) + if (${STDIMPL} STREQUAL "DATA17") + set(CMAKE_CXX_STANDARD 17) + elseif (${STDIMPL} STREQUAL "INDICES") + set(CMAKE_CXX_STANDARD 20) + elseif (${STDIMPL} STREQUAL "DATA23") + set(CMAKE_CXX_STANDARD 23) + endif () if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -44,7 +60,7 @@ macro(setup) register_append_link_flags(${NVHPC_FLAGS}) endif () if (USE_TBB) - register_link_library(TBB::tbb) + register_link_library(TBB::tbb) endif () if (USE_ONEDPL) register_definitions(USE_ONEDPL) diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp deleted file mode 100644 index d0f97e68..00000000 --- a/src/sycl2020-acc/SYCLStream2020.cpp +++ /dev/null @@ -1,286 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "SYCLStream2020.h" - -#include - -// Cache list of devices -bool cached = false; -std::vector devices; -void getDeviceList(void); - -template -SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, - T initA, T initB, T initC) - : array_size(array_size), - d_a {array_size}, - d_b {array_size}, - d_c {array_size}, - d_sum {1} -{ - if (!cached) - getDeviceList(); - - if (device_index >= devices.size()) - throw std::runtime_error("Invalid device index"); - - sycl::device dev = devices[device_index]; - - // Print out device information - std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - - // Check device can support FP64 if needed - if (sizeof(T) == sizeof(double)) - { - if (!dev.has(sycl::aspect::fp64)) - { - throw std::runtime_error("Device does not support double precision, please use --float"); - } - } - - queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) - { - bool error = false; - for(auto e: l) - { - try - { - std::rethrow_exception(e); - } - catch (sycl::exception e) - { - std::cout << e.what(); - error = true; - } - } - if(error) - { - throw std::runtime_error("SYCL errors detected"); - } - }}); - - // No longer need list of devices - devices.clear(); - cached = true; - - init_arrays(initA, initB, initC); -} - - -template -void SYCLStream::copy() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::mul() -{ - const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor kb {d_b, cgh, sycl::write_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kb[idx] = scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::add() -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - kc[idx] = ka[idx] + kb[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::triad() -{ - const T 
scalar = startScalar; - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -void SYCLStream::nstream() -{ - const T scalar = startScalar; - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] += kb[idx] + scalar * kc[idx]; - }); - }); - queue->wait(); -} - -template -T SYCLStream::dot() -{ - - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - - cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - // AdaptiveCpp doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) - sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), -#else - sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), -#endif - [=](sycl::id<1> idx, auto& sum) - { - sum += ka[idx] * kb[idx]; - }); - }); - - // Get access on the host, and return a copy of the data (single number) - // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {d_sum, sycl::read_only}; - return result[0]; - -} - -template -void SYCLStream::init_arrays(T initA, T initB, T initC) -{ - queue->submit([&](sycl::handler &cgh) - { - sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; - - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) - { - ka[idx] = initA; - kb[idx] = initB; - kc[idx] = initC; - }); - }); - - queue->wait(); -} - -template -void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) -{ - sycl::host_accessor _a {d_a, sycl::read_only}; - sycl::host_accessor _b {d_b, sycl::read_only}; - sycl::host_accessor _c {d_c, sycl::read_only}; - a = &_a[0]; - b = &_b[0]; - c = &_c[0]; -} - -void getDeviceList(void) -{ - // Ask SYCL runtime for all devices in system - devices = sycl::device::get_devices(); - cached = true; -} - -void listDevices(void) -{ - getDeviceList(); - - // Print device names - if (devices.size() == 0) - { - std::cerr << "No devices found." 
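The `dot` kernel above (and its USM counterpart kept in the renamed sycl2020 file) relies on a SYCL 2020 reduction rather than a hand-rolled tree reduction. Below is a compact sketch of the same pattern over USM pointers; `initialize_to_identity` zeroes the result so no manual reset is needed. The helper name and signature are illustrative, not the benchmark's.

```cpp
// SYCL 2020 reduction-based dot product over USM pointers (sketch).
#include <sycl/sycl.hpp>

template <class T>
T dot(sycl::queue& q, const T* a, const T* b, T* sum, size_t n) {
  q.submit([&](sycl::handler& cgh) {
    auto red = sycl::reduction(sum, sycl::plus<T>(),
                               sycl::property::reduction::initialize_to_identity{});
    cgh.parallel_for(sycl::range<1>{n}, red, [=](sycl::id<1> i, auto& acc) {
      acc += a[i] * b[i];
    });
  }).wait();
  return *sum; // sum must be host-visible, e.g. from sycl::malloc_shared
}
```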
<< std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - } - std::cout << std::endl; - } -} - -std::string getDeviceName(const int device) -{ - if (!cached) - getDeviceList(); - - std::string name; - - if (device < devices.size()) - { - name = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for name for non-existant device"); - } - - return name; -} - -std::string getDeviceDriver(const int device) -{ - if (!cached) - getDeviceList(); - - std::string driver; - - if (device < devices.size()) - { - driver = devices[device].get_info(); - } - else - { - throw std::runtime_error("Error asking for driver for non-existant device"); - } - - return driver; -} - -template class SYCLStream; -template class SYCLStream; diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake deleted file mode 100644 index 9847b348..00000000 --- a/src/sycl2020-acc/model.cmake +++ /dev/null @@ -1,92 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" - "c++") - -register_flag_required(SYCL_COMPILER - "Compile using the specified SYCL compiler implementation - Supported values are - ONEAPI-ICPX - icpx as a standalone compiler - ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) - DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) - AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") - -register_flag_optional(SYCL_COMPILER_DIR - "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: - ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) - ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 
- AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" - "") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - - - if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") - set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) - - if (NOT EXISTS "${AdaptiveCpp_DIR}") - message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") - set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) - endif () - if (NOT EXISTS "${AdaptiveCpp_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") - endif () - - # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(AdaptiveCpp CONFIG REQUIRED) - message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) - - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) - endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") - endif () - - # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) - message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") - set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) - include_directories(${SYCL_COMPILER_DIR}/include/sycl) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") - set(CMAKE_CXX_COMPILER icpx) - set(CMAKE_C_COMPILER icx) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_C_COMPILER clang) - register_append_cxx_flags(ANY -fsycl) - register_append_link_flags(-fsycl) - else () - message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") - endif () - -endmacro() - - -macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") - # so AdaptiveCpp has this weird (and bad) CMake usage where they append their - # own custom integration header flags AFTER the target has been specified - # hence this macro here - add_sycl_to_target( - TARGET ${NAME} - SOURCES ${IMPL_SOURCES}) - elseif (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their - # own custom integration header flags AFTER the target has been specified - # hence this macro here - add_sycl_to_target( - TARGET ${NAME} - SOURCES ${IMPL_SOURCES}) - endif () -endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h deleted file mode 100644 index c88c87a3..00000000 --- a/src/sycl2020-usm/SYCLStream2020.h +++ /dev/null @@ -1,54 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#include - -#define IMPLEMENTATION_STRING "SYCL2020 USM" - -template -class SYCLStream : public Stream -{ - protected: - // Size of arrays - size_t array_size; - - // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; - - // Buffers - T *a{}; - T *b{}; - T *c{}; - T *sum{}; - - public: - - SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, - T initA, T initB, T initC); - ~SYCLStream(); - - void copy() override; - 
void add() override; - void mul() override; - void triad() override; - void nstream() override; - T dot() override; - - void get_arrays(T const*& a, T const*& b, T const*& c) override; - void init_arrays(T initA, T initB, T initC); -}; - -// Populate the devices list -void getDeviceList(void); diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp similarity index 68% rename from src/sycl2020-usm/SYCLStream2020.cpp rename to src/sycl2020/SYCLStream2020.cpp index c8b863ad..56292046 100644 --- a/src/sycl2020-usm/SYCLStream2020.cpp +++ b/src/sycl2020/SYCLStream2020.cpp @@ -1,5 +1,5 @@ -// Copyright (c) 2015-23 Tom Deakin, Simon McIntosh-Smith, and Tom Lin +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this @@ -61,11 +61,21 @@ SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int devic } }}); + // Allocate memory + #ifdef SYCL2020ACC + d_a = sycl::buffer{array_size}; + d_b = sycl::buffer{array_size}; + d_c = sycl::buffer{array_size}; + d_sum = sycl::buffer{1}; + #elif SYCL2020USM a = sycl::malloc_shared(array_size, *queue); b = sycl::malloc_shared(array_size, *queue); c = sycl::malloc_shared(array_size, *queue); sum = sycl::malloc_shared(1, *queue); - + #else + #error unimplemented + #endif + // No longer need list of devices devices.clear(); cached = true; @@ -75,10 +85,12 @@ SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int devic template SYCLStream::~SYCLStream() { - sycl::free(a, *queue); - sycl::free(b, *queue); - sycl::free(c, *queue); - sycl::free(sum, *queue); +#ifdef SYCL2020USM + sycl::free(a, *queue); + sycl::free(b, *queue); + sycl::free(c, *queue); + sycl::free(sum, *queue); +#endif } template @@ -86,7 +98,11 @@ void SYCLStream::copy() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a](sycl::id<1> idx) { c[idx] = a[idx]; }); @@ -100,7 +116,11 @@ void SYCLStream::mul() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor b {d_b, cgh, sycl::write_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,b=b,c=c](sycl::id<1> idx) { b[idx] = scalar * c[idx]; }); @@ -113,7 +133,12 @@ void SYCLStream::add() { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, c = this->c, a = this->a, b = this->b](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::write_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [c=c,a=a,b=b](sycl::id<1> idx) { c[idx] = a[idx] + b[idx]; }); @@ -127,7 +152,12 @@ void SYCLStream::triad() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#ifdef SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + 
cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = b[idx] + scalar * c[idx]; }); @@ -139,10 +169,14 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh}; + sycl::accessor b {d_b, cgh, sycl::read_only}; + sycl::accessor c {d_c, cgh, sycl::read_only}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] += b[idx] + scalar * c[idx]; }); @@ -155,6 +189,10 @@ T SYCLStream::dot() { queue->submit([&](sycl::handler &cgh) { +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::read_only}; + sycl::accessor b {d_b, cgh, sycl::read_only}; +#endif cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero // AdaptiveCpp doesn't sypport the initialize_to_identity property yet @@ -163,7 +201,7 @@ T SYCLStream::dot() #else sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), #endif - [a = this->a, b = this->b](sycl::id<1> idx, auto& sum) + [a=a,b=b](sycl::id<1> idx, auto& sum) { sum += a[idx] * b[idx]; }); @@ -177,23 +215,32 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) +#if SYCL2020ACC + sycl::accessor a {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor b {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor c {d_c, cgh, sycl::write_only, sycl::no_init}; +#endif + cgh.parallel_for(sycl::range<1>{array_size}, [=,a=a,b=b,c=c](sycl::id<1> idx) { a[idx] = initA; b[idx] = initB; c[idx] = initC; }); }); - queue->wait(); } template void SYCLStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - h_a = a; - h_b = b; - h_c = c; +#if SYCL2020ACC + sycl::host_accessor a {d_a, sycl::read_only}; + sycl::host_accessor b {d_b, sycl::read_only}; + sycl::host_accessor c {d_c, sycl::read_only}; +#endif + h_a = &a[0]; + h_b = &b[0]; + h_c = &c[0]; } void getDeviceList(void) diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h similarity index 79% rename from src/sycl2020-acc/SYCLStream2020.h rename to src/sycl2020/SYCLStream2020.h index c0caae2e..3b4e1ef0 100644 --- a/src/sycl2020-acc/SYCLStream2020.h +++ b/src/sycl2020/SYCLStream2020.h @@ -14,7 +14,15 @@ #include -#define IMPLEMENTATION_STRING "SYCL2020 accessors" +#ifdef SYCL2020ACC +#define SYCLIMPL "Accessors" +#elif SYCL2020USM +#define SYCLIMPL "USM" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "SYCL2020 " SYCLIMPL template class SYCLStream : public Stream @@ -28,16 +36,14 @@ class SYCLStream : public Stream std::unique_ptr queue; // Buffers - sycl::buffer d_a; - sycl::buffer d_b; - sycl::buffer d_c; - sycl::buffer d_sum; + T *a, *b, *c, *sum{}; + sycl::buffer d_a, d_b, d_c, d_sum; public: SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, T initA, T initB, T initC); - ~SYCLStream() = default; + ~SYCLStream(); void copy() override; void add() override; diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020/model.cmake similarity index 96% rename from src/sycl2020-usm/model.cmake rename to src/sycl2020/model.cmake index 72aa7c40..d6452534 100644 --- a/src/sycl2020-usm/model.cmake +++ b/src/sycl2020/model.cmake @@ -19,9 
+19,15 @@ register_flag_optional(SYCL_COMPILER_DIR AdaptiveCpp|HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") +register_flag_optional(SYCL_ACCESS + "Data access method, passed straight to register_definitions, so the value must match the macros checked in SYCLStream2020.h: + - SYCL2020ACC (accessors) + - SYCL2020USM (USM pointers)" + "SYCL2020ACC") + macro(setup) set(CMAKE_CXX_STANDARD 17) - + register_definitions(${SYCL_ACCESS}) if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp)
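With the accessor and USM variants merged into one sycl2020 model, each kernel body stays identical and only the data-access plumbing changes. Here is a sketch of that split for the copy kernel, assuming the `SYCL2020ACC`/`SYCL2020USM` macros are supplied by the model's CMake as above; the function is illustrative rather than the class member from the diff.

```cpp
// Accessor vs USM data access around an otherwise identical kernel (sketch).
#include <sycl/sycl.hpp>

template <class T>
void copy_kernel(sycl::queue& q, size_t n,
#ifdef SYCL2020ACC
                 sycl::buffer<T>& d_a, sycl::buffer<T>& d_c
#else
                 const T* a, T* c
#endif
) {
  q.submit([&](sycl::handler& cgh) {
#ifdef SYCL2020ACC
    sycl::accessor a{d_a, cgh, sycl::read_only};
    sycl::accessor c{d_c, cgh, sycl::write_only};
#endif
    cgh.parallel_for(sycl::range<1>{n}, [a = a, c = c](sycl::id<1> i) { c[i] = a[i]; });
  }).wait();
}
```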